Diffstat (limited to 'mm/vmscan.c')
 mm/vmscan.c (-rw-r--r--) | 341
 1 file changed, 230 insertions(+), 111 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e..ff2ebe9458a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -46,8 +47,6 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
-	unsigned long nr_mapped;	/* From page_state */
-
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -61,6 +60,8 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	int swappiness;
 };
 
 /*
@@ -108,7 +109,7 @@ struct shrinker {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -214,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			break;
 		if (shrink_ret < nr_before)
 			ret += nr_before - shrink_ret;
-		mod_page_state(slabs_scanned, this_scan);
+		count_vm_events(SLABS_SCANNED, this_scan);
 		total_scan -= this_scan;
 
 		cond_resched();
@@ -288,11 +289,23 @@ static void handle_write_error(struct address_space *mapping,
 	unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -337,6 +350,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_NONE,
 			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
 			.nonblocking = 1,
 			.for_reclaim = 1,
 		};
@@ -554,7 +569,7 @@ keep:
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	mod_page_state(pgactivate, pgactivate);
+	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
 
@@ -644,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
-			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-			__mod_page_state(kswapd_steal, nr_freed);
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		} else
-			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		__mod_page_state_zone(zone, pgsteal, nr_freed);
+			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+		__count_vm_events(PGACTIVATE, nr_freed);
 
 		if (nr_taken == 0)
 			goto done;
@@ -727,7 +742,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * how much memory
 	 * is mapped.
 	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+	mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+			global_page_state(NR_ANON_PAGES)) * 100) /
+				vm_total_pages;
 
 	/*
 	 * Now decide how much we really want to unmap some pages.  The
@@ -741,7 +758,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * A 100% value of vm_swappiness overrides this algorithm
 	 * altogether.
 	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
 
 	/*
 	 * Now use this metric to decide whether to start moving mapped
@@ -824,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock(&zone->lru_lock);
 
-	__mod_page_state_zone(zone, pgrefill, pgscanned);
-	__mod_page_state(pgdeactivate, pgdeactivate);
-	local_irq_enable();
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
+	__count_vm_events(PGDEACTIVATE, pgdeactivate);
+	spin_unlock_irq(&zone->lru_lock);
 
 	pagevec_release(&pvec);
 }
@@ -957,9 +973,10 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
+		.swappiness = vm_swappiness,
 	};
 
-	inc_page_state(allocstall);
+	count_vm_event(ALLOCSTALL);
 
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
@@ -972,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
@@ -1021,10 +1037,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies.  This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1054,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
@@ -1055,16 +1065,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
 	};
 
 loop_again:
 	total_scanned = 0;
 	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
-	sc.nr_mapped = read_page_state(nr_mapped);
-
-	inc_page_state(pageoutrun);
+	count_vm_event(PAGEOUTRUN);
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -1082,31 +1091,26 @@ loop_again:
 
 	all_zones_ok = 1;
 
-	if (nr_pages == 0) {
-		/*
-		 * Scan in the highmem->dma direction for the highest
-		 * zone which needs scanning
-		 */
-		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-			struct zone *zone = pgdat->node_zones + i;
+	/*
+	 * Scan in the highmem->dma direction for the highest
+	 * zone which needs scanning
+	 */
+	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+		struct zone *zone = pgdat->node_zones + i;
 
-			if (!populated_zone(zone))
-				continue;
+		if (!populated_zone(zone))
+			continue;
 
-			if (zone->all_unreclaimable &&
-					priority != DEF_PRIORITY)
-				continue;
+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			continue;
 
-			if (!zone_watermark_ok(zone, order,
-					zone->pages_high, 0, 0)) {
-				end_zone = i;
-				goto scan;
-			}
+		if (!zone_watermark_ok(zone, order, zone->pages_high,
+				       0, 0)) {
+			end_zone = i;
+			goto scan;
 		}
-		goto out;
-	} else {
-		end_zone = pgdat->nr_zones - 1;
 	}
+	goto out;
 scan:
 	for (i = 0; i <= end_zone; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1137,9 @@ scan:
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;
 
-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
 			zone->temp_priority = priority;
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
@@ -1162,8 +1164,6 @@ scan:
 			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
 		/*
@@ -1179,7 +1179,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1220,7 +1220,6 @@ static int kswapd(void *p)
 	};
 	cpumask_t cpumask;
 
-	daemonize("kswapd%d", pgdat->node_id);
 	cpumask = node_to_cpumask(pgdat->node_id);
 	if (!cpus_empty(cpumask))
 		set_cpus_allowed(tsk, cpumask);
@@ -1261,7 +1260,7 @@ static int kswapd(void *p)
 		}
 		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
 	}
 	return 0;
 }
@@ -1290,35 +1289,152 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
+				      int prio, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
 	};
 
 	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
-
-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = global_page_state(NR_SLAB);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
 			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
 	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_scanned = 0;
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
 	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
 	current->reclaim_state = NULL;
+
 	return ret;
 }
 #endif
@@ -1328,7 +1444,7 @@ repeat:
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
@@ -1346,21 +1462,35 @@ static int cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		printk("Failed to start kswapd on node %d\n",nid);
+		ret = -1;
+	}
+	return ret;
+}
+
 static int __init kswapd_init(void)
 {
-	pg_data_t *pgdat;
+	int nid;
 
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
-	total_memory = nr_free_pagecache_pages();
+	for_each_online_node(nid)
+		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
@@ -1387,11 +1517,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
 
 /*
- * Mininum time between zone reclaim scans
- */
-int zone_reclaim_interval __read_mostly = 30*HZ;
-
-/*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
  * a zone.
@@ -1412,10 +1537,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-		.nr_mapped = read_page_state(nr_mapped),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
 	};
 
 	disable_swap_token();
@@ -1456,16 +1581,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-
-	if (nr_reclaimed == 0) {
-		/*
-		 * We were unable to reclaim enough pages to stay on node.  We
-		 * now allow off node accesses for a certain time period before
-		 * trying again to reclaim pages from the local zone.
-		 */
-		zone->last_unsuccessful_zone_reclaim = jiffies;
-	}
-
 	return nr_reclaimed >= nr_pages;
 }
 
@@ -1475,13 +1590,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Do not reclaim if there was a recent unsuccessful attempt at zone
-	 * reclaim.  In that case we let allocations go off node for the
-	 * zone_reclaim_interval. Otherwise we would scan for each off-node
-	 * page allocation.
+	 * Do not reclaim if there are not enough reclaimable pages in this
+	 * zone that would satify this allocations.
+	 *
+	 * All unmapped pagecache pages are reclaimable.
+	 *
+	 * Both counters may be temporarily off a bit so we use
+	 * SWAP_CLUSTER_MAX as the boundary. It may also be good to
+	 * leave a few frequently used unmapped pagecache pages around.
 	 */
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+	    zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX)
 		return 0;
 
 	/*