Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--   mm/vmscan.c   240
 1 files changed, 181 insertions, 59 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..72babac71dea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;
+
+	int swappiness;
 };
 
 /*
@@ -108,7 +110,7 @@ struct shrinker {
  * From 0 .. 100. Higher means more swappy.
  */
 int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
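The rename also moves the initialisation out of this file: kswapd_init() no longer samples nr_free_pagecache_pages() (see the last hunk of this diff), so vm_total_pages has to be filled in elsewhere, presumably by a companion change to mm/page_alloc.c that is not shown here. A rough sketch of what that companion assignment could look like, reusing the same nr_free_pagecache_pages() value the old code sampled; placing it in build_all_zonelists() is an assumption:

/* Assumed companion change (not in this diff): keep vm_total_pages in step
 * with the zonelists, using the value kswapd_init() used to sample once. */
void build_all_zonelists(void)
{
	/* ... existing zonelist construction ... */
	vm_total_pages = nr_free_pagecache_pages();
}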
@@ -288,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
	unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
	/*
	 * If the page is dirty, only perform writeback if that write
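The enum spells out the contract between pageout() and its caller: each value says whether the page is still locked and what shrink_page_list() should do with it next. A standalone toy that walks the four outcomes; only the enum itself comes from the patch, the dispatch function and the action strings are illustrative:

#include <stdio.h>

/* Copied from the patch: the possible outcomes of pageout(). */
typedef enum {
	PAGE_KEEP,	/* failed to write page out, page is locked */
	PAGE_ACTIVATE,	/* move page to the active list, page is locked */
	PAGE_SUCCESS,	/* page has been sent to the disk, page is unlocked */
	PAGE_CLEAN,	/* page is clean and locked */
} pageout_t;

/* Illustrative dispatch in the spirit of shrink_page_list(); the real code
 * jumps to its keep/activate/free paths rather than returning strings. */
static const char *next_step(pageout_t res)
{
	switch (res) {
	case PAGE_KEEP:
		return "keep the page on the inactive list, still locked";
	case PAGE_ACTIVATE:
		return "move the page back to the active list";
	case PAGE_SUCCESS:
		return "writeback started; the page is already unlocked";
	case PAGE_CLEAN:
		return "page is clean; try to free it";
	}
	return "unreachable";
}

int main(void)
{
	for (pageout_t r = PAGE_KEEP; r <= PAGE_CLEAN; r++)
		printf("%d -> %s\n", (int)r, next_step(r));
	return 0;
}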
@@ -337,6 +351,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};
@@ -727,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
	 * how much memory
	 * is mapped.
	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+	mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
 
	/*
	 * Now decide how much we really want to unmap some pages. The
@@ -741,7 +757,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
	 * A 100% value of vm_swappiness overrides this algorithm
	 * altogether.
	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
 
	/*
	 * Now use this metric to decide whether to start moving mapped
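Routing swappiness through the scan_control lets each caller bias the heuristic instead of relying only on the global sysctl: swap_tendency = mapped_ratio / 2 + distress + sc->swappiness, and in the surrounding code (not shown in this hunk) mapped pages become eligible for reclaim once the value reaches 100. A small standalone calculation showing the effect, with that threshold written out explicitly as an assumption:

#include <stdio.h>

/* Mirrors the heuristic in shrink_active_list() after this patch:
 * swap_tendency = mapped_ratio / 2 + distress + swappiness, with mapped
 * pages reclaimed once it reaches 100 (threshold taken from surrounding
 * kernel code, not from this hunk). */
static int reclaim_mapped(long nr_mapped, long vm_total_pages,
			  int prev_priority, int swappiness)
{
	long mapped_ratio = nr_mapped * 100 / vm_total_pages;
	/* distress: 0 when reclaim is easy, 100 once priority has hit 0 */
	int distress = 100 >> prev_priority;
	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	return swap_tendency >= 100;
}

int main(void)
{
	/* 20% of a 1M-page machine mapped, relaxed reclaim (priority 12): */
	printf("%d\n", reclaim_mapped(200000, 1000000, 12, 60));  /* 0: leave mapped pages alone */
	/* Same load, but a caller forces swappiness = 100: */
	printf("%d\n", reclaim_mapped(200000, 1000000, 12, 100)); /* 1: reclaim mapped pages too */
	return 0;
}

This is what allows the reworked shrink_all_memory() later in this diff to set sc.swappiness = 100 in its last two passes without touching /proc/sys/vm/swappiness.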
@@ -957,6 +973,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
		.may_writepage = !laptop_mode,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.may_swap = 1,
+		.swappiness = vm_swappiness,
	};
 
	inc_page_state(allocstall);
@@ -1021,10 +1038,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies. This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1055,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
	int all_zones_ok;
	int priority;
	int i;
@@ -1055,7 +1066,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
	};
 
 loop_again:
@@ -1082,31 +1094,26 @@ loop_again:
 
		all_zones_ok = 1;
 
-		if (nr_pages == 0) {
-			/*
-			 * Scan in the highmem->dma direction for the highest
-			 * zone which needs scanning
-			 */
-			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-				struct zone *zone = pgdat->node_zones + i;
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;
 
-				if (!populated_zone(zone))
-					continue;
+			if (!populated_zone(zone))
+				continue;
 
-				if (zone->all_unreclaimable &&
-						priority != DEF_PRIORITY)
-					continue;
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
 
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0)) {
-					end_zone = i;
-					goto scan;
-				}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       0, 0)) {
+				end_zone = i;
+				goto scan;
			}
-			goto out;
-		} else {
-			end_zone = pgdat->nr_zones - 1;
		}
+		goto out;
 scan:
	for (i = 0; i <= end_zone; i++) {
		struct zone *zone = pgdat->node_zones + i;
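With the swsusp branch gone, the highmem->dma scan is the only way end_zone is chosen: the first zone from the top that fails its pages_high watermark sets the upper bound for scanning, and if none fails kswapd has nothing to do. A standalone toy of that selection over an illustrative zone array (the real zone_watermark_ok() also accounts for allocation order and lowmem reserves):

#include <stdio.h>

/* Toy model of the end_zone selection balance_pgdat() now always performs:
 * walk zones in the highmem->dma direction (highest index first) and stop
 * at the first one below its pages_high watermark.  Names and numbers are
 * illustrative. */
struct toy_zone {
	const char *name;
	long free_pages;
	long pages_high;
};

int main(void)
{
	struct toy_zone zones[] = {	/* index 0 = DMA ... highest = HighMem */
		{ "DMA",      900,  128 },
		{ "Normal",  5000, 1024 },
		{ "HighMem",  300, 2048 },	/* below its watermark */
	};
	int nr_zones = 3, end_zone = -1;

	for (int i = nr_zones - 1; i >= 0; i--) {
		if (zones[i].free_pages < zones[i].pages_high) {
			end_zone = i;
			break;		/* the real code does "goto scan" here */
		}
	}

	if (end_zone < 0)
		printf("all zones ok, nothing to do\n");
	else
		printf("scan zones 0..%d (%s and below)\n",
		       end_zone, zones[end_zone].name);
	return 0;
}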
@@ -1133,11 +1140,9 @@ scan:
			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;
 
-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
			zone->temp_priority = priority;
			if (zone->prev_priority > priority)
				zone->prev_priority = priority;
@@ -1162,8 +1167,6 @@ scan:
			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
				sc.may_writepage = 1;
		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
		if (all_zones_ok)
			break;		/* kswapd: all done */
		/*
@@ -1179,7 +1182,7 @@ scan:
		 * matches the direct reclaim path behaviour in terms of impact
		 * on zone->*_priority.
		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
 out:
@@ -1261,7 +1264,7 @@ static int kswapd(void *p)
		}
		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
	}
	return 0;
 }
@@ -1290,35 +1293,154 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+				      int pass, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
	};
 
	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
 
-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = read_page_state(nr_slab);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_mapped = read_page_state(nr_mapped);
+			sc.nr_scanned = 0;
+
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
	current->reclaim_state = NULL;
+
	return ret;
 }
 #endif
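The rewritten shrink_all_memory() replaces the old "run balance_pgdat() over every node and retry twice" loop with an explicit pass/priority nest: five passes, each walking priority from DEF_PRIORITY down to 0, reclaiming LRU pages and shrinking slab in proportion to what was scanned, with only passes 3 and 4 allowed to touch mapped pages. A standalone skeleton of that control flow with the actual reclaim work stubbed out; only the loop structure and the pass semantics are taken from the patch:

#include <stdio.h>

#define DEF_PRIORITY 12	/* same value the kernel uses for reclaim priority */

/*
 * Toy skeleton of the reworked shrink_all_memory() control flow.  The real
 * reclaim work (shrink_all_zones(), shrink_slab()) is replaced by a fixed
 * increment so the program terminates.
 */
static unsigned long toy_shrink_all_memory(unsigned long nr_pages)
{
	unsigned long ret = 0;
	int may_swap = 0, swappiness = 60;

	for (int pass = 0; pass < 5; pass++) {
		/* Passes 3 and 4 are the only ones allowed to touch mapped pages. */
		if (pass > 2) {
			may_swap = 1;
			swappiness = 100;
		}
		printf("pass %d: may_swap=%d swappiness=%d\n",
		       pass, may_swap, swappiness);

		for (int prio = DEF_PRIORITY; prio >= 0; prio--) {
			ret += 1;	/* stand-in for shrink_all_zones() + shrink_slab() */
			if (ret >= nr_pages)
				return ret;	/* target met: stop early, like "goto out" */
		}
	}
	return ret;
}

int main(void)
{
	printf("reclaimed %lu page(s)\n", toy_shrink_all_memory(60));
	return 0;
}

In the real function the per-step increment is whatever shrink_all_zones() and shrink_slab() report back, and lru_pages is recomputed between passes so the slab pressure stays proportional to the LRU size.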
@@ -1360,7 +1482,6 @@ static int __init kswapd_init(void)
		pgdat->kswapd = find_task_by_pid(pid);
		read_unlock(&tasklist_lock);
	}
-	total_memory = nr_free_pagecache_pages();
	hotcpu_notifier(cpu_callback, 0);
	return 0;
 }
@@ -1416,6 +1537,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
		.swap_cluster_max = max_t(unsigned long, nr_pages,
					SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
	};
 
	disable_swap_token();