author		Ingo Molnar <mingo@elte.hu>	2010-08-31 03:45:21 -0400
committer	Ingo Molnar <mingo@elte.hu>	2010-08-31 03:45:46 -0400
commit		daab7fc734a53fdeaf844b7c03053118ad1769da (patch)
tree		575deb3cdcc6dda562acaed6f7c29bc81ae01cf2 /mm/vmscan.c
parent		774ea0bcb27f57b6fd521b3b6c43237782fed4b9 (diff)
parent		2bfc96a127bc1cc94d26bfaa40159966064f9c8c (diff)
Merge commit 'v2.6.36-rc3' into x86/memblock
Conflicts:
	arch/x86/kernel/trampoline.c
	mm/memblock.c

Merge reason: Resolve the conflicts, update to latest upstream.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	548
1 file changed, 305 insertions(+), 243 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..c391c320dbaf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
 		}
+		trace_mm_vmscan_writepage(page,
+			trace_reclaim_flags(page, sync_writeback));
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
 		return PAGE_SUCCESS;
 	}
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
 	return PAGEREF_RECLAIM;
 }
 
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+	struct pagevec freed_pvec;
+	struct page *page, *tmp;
+
+	pagevec_init(&freed_pvec, 1);
+
+	list_for_each_entry_safe(page, tmp, free_pages, lru) {
+		list_del(&page->lru);
+		if (!pagevec_add(&freed_pvec, page)) {
+			__pagevec_free(&freed_pvec);
+			pagevec_reinit(&freed_pvec);
+		}
+	}
+
+	pagevec_free(&freed_pvec);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					enum pageout_io sync_writeback)
 {
 	LIST_HEAD(ret_pages);
-	struct pagevec freed_pvec;
+	LIST_HEAD(free_pages);
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
 
-	pagevec_init(&freed_pvec, 1);
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		__clear_page_locked(page);
 free_it:
 		nr_reclaimed++;
-		if (!pagevec_add(&freed_pvec, page)) {
-			__pagevec_free(&freed_pvec);
-			pagevec_reinit(&freed_pvec);
-		}
+
+		/*
+		 * Is there need to periodically free_page_list? It would
+		 * appear not as the counts should be low
+		 */
+		list_add(&page->lru, &free_pages);
 		continue;
 
 cull_mlocked:
@@ -832,9 +856,10 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
+
+	free_page_list(&free_pages);
+
 	list_splice(&ret_pages, page_list);
-	if (pagevec_count(&freed_pvec))
-		__pagevec_free(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		unsigned long *scanned, int order, int mode, int file)
 {
 	unsigned long nr_taken = 0;
+	unsigned long nr_lumpy_taken = 0;
+	unsigned long nr_lumpy_dirty = 0;
+	unsigned long nr_lumpy_failed = 0;
 	unsigned long scan;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				list_move(&cursor_page->lru, dst);
 				mem_cgroup_del_lru(cursor_page);
 				nr_taken++;
+				nr_lumpy_taken++;
+				if (PageDirty(cursor_page))
+					nr_lumpy_dirty++;
 				scan++;
+			} else {
+				if (mode == ISOLATE_BOTH &&
+						page_count(cursor_page))
+					nr_lumpy_failed++;
 			}
 		}
 	}
 
 	*scanned = scan;
+
+	trace_mm_vmscan_lru_isolate(order,
+			nr_to_scan, scan,
+			nr_taken,
+			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+			mode);
 	return nr_taken;
 }
 
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
 			ClearPageActive(page);
 			nr_active++;
 		}
-		count[lru]++;
+		if (count)
+			count[lru]++;
 	}
 
 	return nr_active;
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
  */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
-			struct zone *zone, struct scan_control *sc,
-			int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+				unsigned long nr_anon, unsigned long nr_file,
+				struct list_head *page_list)
 {
-	LIST_HEAD(page_list);
+	struct page *page;
 	struct pagevec pvec;
-	unsigned long nr_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	while (unlikely(too_many_isolated(zone, file, sc))) {
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+	pagevec_init(&pvec, 1);
 
-		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
+	/*
+	 * Put back any unfreeable pages.
+	 */
+	spin_lock(&zone->lru_lock);
+	while (!list_empty(page_list)) {
+		int lru;
+		page = lru_to_page(page_list);
+		VM_BUG_ON(PageLRU(page));
+		list_del(&page->lru);
+		if (unlikely(!page_evictable(page, NULL))) {
+			spin_unlock_irq(&zone->lru_lock);
+			putback_lru_page(page);
+			spin_lock_irq(&zone->lru_lock);
+			continue;
+		}
+		SetPageLRU(page);
+		lru = page_lru(page);
+		add_page_to_lru_list(zone, page, lru);
+		if (is_active_lru(lru)) {
+			int file = is_file_lru(lru);
+			reclaim_stat->recent_rotated[file]++;
+		}
+		if (!pagevec_add(&pvec, page)) {
+			spin_unlock_irq(&zone->lru_lock);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
 	}
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+	spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+}
 
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+					struct scan_control *sc,
+					unsigned long *nr_anon,
+					unsigned long *nr_file,
+					struct list_head *isolated_list)
+{
+	unsigned long nr_active;
+	unsigned int count[NR_LRU_LISTS] = { 0, };
+	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	pagevec_init(&pvec, 1);
+	nr_active = clear_active_flags(isolated_list, count);
+	__count_vm_events(PGDEACTIVATE, nr_active);
 
-	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
-	do {
-		struct page *page;
-		unsigned long nr_taken;
-		unsigned long nr_scan;
-		unsigned long nr_freed;
-		unsigned long nr_active;
-		unsigned int count[NR_LRU_LISTS] = { 0, };
-		int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
-		unsigned long nr_anon;
-		unsigned long nr_file;
+	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
+			      -count[LRU_ACTIVE_FILE]);
+	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
+			      -count[LRU_INACTIVE_FILE]);
+	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
+			      -count[LRU_ACTIVE_ANON]);
+	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
+			      -count[LRU_INACTIVE_ANON]);
 
-		if (scanning_global_lru(sc)) {
-			nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
-							&page_list, &nr_scan,
-							sc->order, mode,
-							zone, 0, file);
-			zone->pages_scanned += nr_scan;
-			if (current_is_kswapd())
-				__count_zone_vm_events(PGSCAN_KSWAPD, zone,
-						       nr_scan);
-			else
-				__count_zone_vm_events(PGSCAN_DIRECT, zone,
-						       nr_scan);
-		} else {
-			nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
-							&page_list, &nr_scan,
-							sc->order, mode,
-							zone, sc->mem_cgroup,
-							0, file);
-			/*
-			 * mem_cgroup_isolate_pages() keeps track of
-			 * scanned pages on its own.
-			 */
-		}
+	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
 
-		if (nr_taken == 0)
-			goto done;
+	reclaim_stat->recent_scanned[0] += *nr_anon;
+	reclaim_stat->recent_scanned[1] += *nr_file;
+}
 
-		nr_active = clear_active_flags(&page_list, count);
-		__count_vm_events(PGDEACTIVATE, nr_active);
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably. Only do that when really
+ * need to free the pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+					unsigned long nr_freed,
+					int priority,
+					struct scan_control *sc)
+{
+	int lumpy_stall_priority;
 
-		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
-						-count[LRU_ACTIVE_FILE]);
-		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
-						-count[LRU_INACTIVE_FILE]);
-		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
-						-count[LRU_ACTIVE_ANON]);
-		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-						-count[LRU_INACTIVE_ANON]);
+	/* kswapd should not stall on sync IO */
+	if (current_is_kswapd())
+		return false;
 
-		nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-		nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+	/* Only stall on lumpy reclaim */
+	if (!sc->lumpy_reclaim_mode)
+		return false;
 
-		reclaim_stat->recent_scanned[0] += nr_anon;
-		reclaim_stat->recent_scanned[1] += nr_file;
+	/* If we have relaimed everything on the isolated list, no stall */
+	if (nr_freed == nr_taken)
+		return false;
 
-		spin_unlock_irq(&zone->lru_lock);
+	/*
+	 * For high-order allocations, there are two stall thresholds.
+	 * High-cost allocations stall immediately where as lower
+	 * order allocations such as stacks require the scanning
+	 * priority to be much higher before stalling.
+	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		lumpy_stall_priority = DEF_PRIORITY;
+	else
+		lumpy_stall_priority = DEF_PRIORITY / 3;
 
-		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+	return priority <= lumpy_stall_priority;
+}
 
+/*
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+			struct scan_control *sc, int priority, int file)
+{
+	LIST_HEAD(page_list);
+	unsigned long nr_scanned;
+	unsigned long nr_reclaimed = 0;
+	unsigned long nr_taken;
+	unsigned long nr_active;
+	unsigned long nr_anon;
+	unsigned long nr_file;
+
+	while (unlikely(too_many_isolated(zone, file, sc))) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+		/* We are about to die and free our memory. Return now. */
+		if (fatal_signal_pending(current))
+			return SWAP_CLUSTER_MAX;
+	}
+
+
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+
+	if (scanning_global_lru(sc)) {
+		nr_taken = isolate_pages_global(nr_to_scan,
+			&page_list, &nr_scanned, sc->order,
+			sc->lumpy_reclaim_mode ?
+				ISOLATE_BOTH : ISOLATE_INACTIVE,
+			zone, 0, file);
+		zone->pages_scanned += nr_scanned;
+		if (current_is_kswapd())
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
+					       nr_scanned);
+		else
+			__count_zone_vm_events(PGSCAN_DIRECT, zone,
+					       nr_scanned);
+	} else {
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+			&page_list, &nr_scanned, sc->order,
+			sc->lumpy_reclaim_mode ?
+				ISOLATE_BOTH : ISOLATE_INACTIVE,
+			zone, sc->mem_cgroup,
+			0, file);
 		/*
-		 * If we are direct reclaiming for contiguous pages and we do
-		 * not reclaim everything in the list, try again and wait
-		 * for IO to complete. This will stall high-order allocations
-		 * but that should be acceptable to the caller
+		 * mem_cgroup_isolate_pages() keeps track of
+		 * scanned pages on its own.
 		 */
-		if (nr_freed < nr_taken && !current_is_kswapd() &&
-		    sc->lumpy_reclaim_mode) {
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
 
-			/*
-			 * The attempt at page out may have made some
-			 * of the pages active, mark them inactive again.
-			 */
-			nr_active = clear_active_flags(&page_list, count);
-			count_vm_events(PGDEACTIVATE, nr_active);
+	if (nr_taken == 0) {
+		spin_unlock_irq(&zone->lru_lock);
+		return 0;
+	}
 
-			nr_freed += shrink_page_list(&page_list, sc,
-						PAGEOUT_IO_SYNC);
-		}
+	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
 
-		nr_reclaimed += nr_freed;
+	spin_unlock_irq(&zone->lru_lock);
 
-		local_irq_disable();
-		if (current_is_kswapd())
-			__count_vm_events(KSWAPD_STEAL, nr_freed);
-		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
+	nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+	/* Check if we should syncronously wait for writeback */
+	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-		spin_lock(&zone->lru_lock);
 		/*
-		 * Put back any unfreeable pages.
+		 * The attempt at page out may have made some
+		 * of the pages active, mark them inactive again.
 		 */
-		while (!list_empty(&page_list)) {
-			int lru;
-			page = lru_to_page(&page_list);
-			VM_BUG_ON(PageLRU(page));
-			list_del(&page->lru);
-			if (unlikely(!page_evictable(page, NULL))) {
-				spin_unlock_irq(&zone->lru_lock);
-				putback_lru_page(page);
-				spin_lock_irq(&zone->lru_lock);
-				continue;
-			}
-			SetPageLRU(page);
-			lru = page_lru(page);
-			add_page_to_lru_list(zone, page, lru);
-			if (is_active_lru(lru)) {
-				int file = is_file_lru(lru);
-				reclaim_stat->recent_rotated[file]++;
-			}
-			if (!pagevec_add(&pvec, page)) {
-				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_release(&pvec);
-				spin_lock_irq(&zone->lru_lock);
-			}
-		}
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+		nr_active = clear_active_flags(&page_list, NULL);
+		count_vm_events(PGDEACTIVATE, nr_active);
 
-	} while (nr_scanned < max_scan);
+		nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+	}
 
-done:
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
-	return nr_reclaimed;
-}
+	local_irq_disable();
+	if (current_is_kswapd())
+		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
 
-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-	if (priority < zone->prev_priority)
-		zone->prev_priority = priority;
+	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+	return nr_reclaimed;
 }
 
 /*
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	}
 
 	/*
+	 * With swappiness at 100, anonymous and file have the same priority.
+	 * This scanning priority is essentially the inverse of IO cost.
+	 */
+	anon_prio = sc->swappiness;
+	file_prio = 200 - sc->swappiness;
+
+	/*
 	 * OK, so we have swap space and a fair amount of page cache
 	 * pages.  We use the recently rotated / recently scanned
 	 * ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	 *
 	 * anon in [0], file in [1]
 	 */
+	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
-		spin_lock_irq(&zone->lru_lock);
 		reclaim_stat->recent_scanned[0] /= 2;
 		reclaim_stat->recent_rotated[0] /= 2;
-		spin_unlock_irq(&zone->lru_lock);
 	}
 
 	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
-		spin_lock_irq(&zone->lru_lock);
 		reclaim_stat->recent_scanned[1] /= 2;
 		reclaim_stat->recent_rotated[1] /= 2;
-		spin_unlock_irq(&zone->lru_lock);
 	}
 
 	/*
-	 * With swappiness at 100, anonymous and file have the same priority.
-	 * This scanning priority is essentially the inverse of IO cost.
-	 */
-	anon_prio = sc->swappiness;
-	file_prio = 200 - sc->swappiness;
-
-	/*
 	 * The amount of pressure on anon vs file pages is inversely
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 
 	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
 	fp /= reclaim_stat->recent_rotated[1] + 1;
+	spin_unlock_irq(&zone->lru_lock);
 
 	fraction[0] = ap;
 	fraction[1] = fp;
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
 static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
-	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	struct zoneref *z;
 	struct zone *zone;
 	bool all_unreclaimable = true;
 
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
-					sc->nodemask) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask), sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 		/*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		if (scanning_global_lru(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			note_zone_scanning_priority(zone, priority);
-
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
-		} else {
-			/*
-			 * Ignore cpuset limitation here. We just want to reduce
-			 * # of used pages by us regardless of memory shortage.
-			 */
-			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
-							priority);
 		}
 
 		shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	bool all_unreclaimable;
 	unsigned long total_scanned = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	unsigned long lru_pages = 0;
 	struct zoneref *z;
 	struct zone *zone;
-	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	unsigned long writeback_threshold;
 
 	get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 	if (scanning_global_lru(sc))
 		count_vm_event(ALLOCSTALL);
-	/*
-	 * mem_cgroup will not do shrink_slab.
-	 */
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			lru_pages += zone_reclaimable_pages(zone);
-		}
-	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 * over limit cgroups
 		 */
 		if (scanning_global_lru(sc)) {
+			unsigned long lru_pages = 0;
+			for_each_zone_zonelist(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask)) {
+				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+					continue;
+
+				lru_pages += zone_reclaimable_pages(zone);
+			}
+
 			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
 			if (reclaim_state) {
 				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
 	if (priority < 0)
 		priority = 0;
 
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			zone->prev_priority = priority;
-		}
-	} else
-		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
-
 	delayacct_freepages_end();
 	put_mems_allowed();
 
@@ -1888,6 +1940,7 @@ out:
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nodemask = nodemask,
 	};
 
-	return do_try_to_free_pages(zonelist, &sc);
+	trace_mm_vmscan_direct_reclaim_begin(order,
+				sc.may_writepage,
+				gfp_mask);
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+	return nr_reclaimed;
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1908,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						unsigned int swappiness,
-						struct zone *zone, int nid)
+						struct zone *zone)
 {
 	struct scan_control sc = {
+		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
@@ -1918,13 +1980,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.order = 0,
 		.mem_cgroup = mem,
 	};
-	nodemask_t nm = nodemask_of_node(nid);
-
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	sc.nodemask = &nm;
-	sc.nr_reclaimed = 0;
-	sc.nr_scanned = 0;
+
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+						      sc.may_writepage,
+						      sc.gfp_mask);
+
 	/*
 	 * NOTE: Although we can get the priority field, using it
 	 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 	 * the priority and make it zero.
 	 */
 	shrink_zone(0, zone, &sc);
+
+	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
 	return sc.nr_reclaimed;
 }
 
@@ -1942,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   unsigned int swappiness)
 {
 	struct zonelist *zonelist;
+	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1956,7 +2022,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
-	return do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_memcg_reclaim_begin(0,
+					    sc.may_writepage,
+					    sc.gfp_mask);
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+	return nr_reclaimed;
 }
 #endif
 
@@ -2028,22 +2103,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.order = order,
 		.mem_cgroup = NULL,
 	};
-	/*
-	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to
-	 * free_pages == high_wmark_pages(zone).
-	 */
-	int temp_priority[MAX_NR_ZONES];
-
 loop_again:
 	total_scanned = 0;
 	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (i = 0; i < pgdat->nr_zones; i++)
-		temp_priority[i] = DEF_PRIORITY;
-
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
@@ -2103,7 +2168,6 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
-			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2111,18 +2175,14 @@ loop_again:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
-			note_zone_scanning_priority(zone, priority);
 
-			nid = pgdat->node_id;
-			zid = zone_idx(zone);
 			/*
 			 * Call soft limit reclaim before calling shrink_zone.
 			 * For now we ignore the return value
 			 */
-			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
-							nid, zid);
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
@@ -2186,16 +2246,6 @@ loop_again:
 			break;
 	}
 out:
-	/*
-	 * Note within each zone the priority level at which this zone was
-	 * brought into a happy state. So that the next thread which scans this
-	 * zone will start out at that priority level.
-	 */
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->prev_priority = temp_priority[i];
-	}
 	if (!all_zones_ok) {
 		cond_resched();
 
@@ -2299,9 +2349,10 @@ static int kswapd(void *p)
 			 * premature sleep. If not, then go fully
 			 * to sleep until explicitly woken up
 			 */
-			if (!sleeping_prematurely(pgdat, order, remaining))
+			if (!sleeping_prematurely(pgdat, order, remaining)) {
+				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 				schedule();
-			else {
+			} else {
 				if (remaining)
 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
 				else
@@ -2321,8 +2372,10 @@ static int kswapd(void *p)
 		 * We can speed up thawing tasks if we don't call balance_pgdat
 		 * after returning from the refrigerator
 		 */
-		if (!ret)
+		if (!ret) {
+			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
 			balance_pgdat(pgdat, order);
+		}
 	}
 	return 0;
 }
@@ -2342,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2644,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.swappiness = vm_swappiness,
 		.order = order,
 	};
-	unsigned long slab_reclaimable;
+	unsigned long nr_slab_pages0, nr_slab_pages1;
 
-	disable_swap_token();
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 */
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
-	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-	if (slab_reclaimable > zone->min_slab_pages) {
+	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+	if (nr_slab_pages0 > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
 		 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
-				slab_reclaimable - nr_pages)
-			;
+		for (;;) {
+			unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+			/* No reclaimable slab or very low memory pressure */
+			if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+				break;
+
+			/* Freed enough memory */
+			nr_slab_pages1 = zone_page_state(zone,
+							NR_SLAB_RECLAIMABLE);
+			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+				break;
+		}
 
 		/*
 		 * Update nr_reclaimed by the number of slab pages we
 		 * reclaimed from this zone.
 		 */
-		sc.nr_reclaimed += slab_reclaimable -
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		if (nr_slab_pages1 < nr_slab_pages0)
+			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
 	}
 
 	p->reclaim_state = NULL;