path: root/mm
author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>   2008-02-07 03:14:37 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org> 2008-02-07 11:42:22 -0500
commit     1cfb419b394ba82745c54ff05436d598ecc2dbd5 (patch)
tree       33624176aff8f3a09f572c0fa3d699dbacdb447d /mm
parent     cc38108e1ba7f3b9e12b82d0236fa3730c2e0439 (diff)
per-zone and reclaim enhancements for memory controller: modify vmscan.c to isolate global/cgroup LRU activity
When using the memory controller, there are two levels of memory reclaim:

 1. zone memory reclaim, because of system/zone memory shortage;
 2. memory cgroup reclaim, because a cgroup hits its limit.

These two can be distinguished by the sc->mem_cgroup parameter (the scan_global_lru() macro).

This patch tries to make the memory cgroup reclaim routine avoid affecting system/zone memory reclaim. It inserts scan_global_lru() checks and hooks into the memory cgroup reclaim support functions. This helps isolate system LRU activity from group LRU activity and shows which additional functions are necessary:

 * mem_cgroup_calc_mapped_ratio() ... calculate the mapped ratio for a cgroup.
 * mem_cgroup_reclaim_imbalance() ... calculate the active/inactive balance in a cgroup.
 * mem_cgroup_calc_reclaim_active() ... calculate the number of active pages to be scanned at this priority in a mem_cgroup.
 * mem_cgroup_calc_reclaim_inactive() ... calculate the number of inactive pages to be scanned at this priority in a mem_cgroup.
 * mem_cgroup_all_unreclaimable() ... check whether all of a cgroup's pages are unreclaimable.
 * mem_cgroup_get_reclaim_priority() ... return the reclaim priority recorded for the cgroup.
 * mem_cgroup_note_reclaim_priority() ... record the current reclaim priority (temporarily).
 * mem_cgroup_record_reclaim_priority() ... record the reclaim priority, analogous to zone->prev_priority; this value is used to calculate reclaim_mapped.

[akpm@linux-foundation.org: fix unused var warning]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Menage <menage@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
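For orientation while reading the hunks below: every added hook keys off one test. The following is a minimal user-space sketch of the assumed helper, not the kernel's exact definition (scan_global_lru() and struct scan_control are defined elsewhere in mm/vmscan.c in this series, so treat the details here as assumptions):

/* Sketch only: scan_global_lru() is assumed to test whether a memory
 * cgroup was attached to the scan control. */
struct mem_cgroup;                      /* opaque here */

struct scan_control {
        struct mem_cgroup *mem_cgroup;  /* NULL => global/zone reclaim */
        /* ... many other fields omitted ... */
};

/* true  -> reclaim driven by system/zone memory shortage (case 1)
 * false -> reclaim driven by a memory cgroup hitting its limit (case 2) */
#define scan_global_lru(sc)     (!(sc)->mem_cgroup)

Every hook added below is guarded by this test, so cgroup-limit reclaim never touches zone-wide reclaim state such as zone->pages_scanned or zone->prev_priority.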
Diffstat (limited to 'mm')
-rw-r--r--   mm/vmscan.c   332
1 file changed, 201 insertions(+), 131 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be4dfe87be03..a26dabd62fed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -856,7 +856,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
                __mod_zone_page_state(zone, NR_INACTIVE,
                                                -(nr_taken - nr_active));
-               zone->pages_scanned += nr_scan;
+               if (scan_global_lru(sc))
+                       zone->pages_scanned += nr_scan;
                spin_unlock_irq(&zone->lru_lock);

                nr_scanned += nr_scan;
@@ -888,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                if (current_is_kswapd()) {
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
                        __count_vm_events(KSWAPD_STEAL, nr_freed);
-               } else
+               } else if (scan_global_lru(sc))
                        __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
                __count_zone_vm_events(PGSTEAL, zone, nr_freed);

                if (nr_taken == 0)
@@ -943,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone)
 }

 /*
+ * Determine we should try to reclaim mapped pages.
+ * This is called only when sc->mem_cgroup is NULL.
+ */
+static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
+                               int priority)
+{
+        long mapped_ratio;
+        long distress;
+        long swap_tendency;
+        long imbalance;
+        int reclaim_mapped = 0;
+        int prev_priority;
+
+        if (scan_global_lru(sc) && zone_is_near_oom(zone))
+                return 1;
+        /*
+         * `distress' is a measure of how much trouble we're having
+         * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+         */
+        if (scan_global_lru(sc))
+                prev_priority = zone->prev_priority;
+        else
+                prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
+
+        distress = 100 >> min(prev_priority, priority);
+
+        /*
+         * The point of this algorithm is to decide when to start
+         * reclaiming mapped memory instead of just pagecache.  Work out
+         * how much memory
+         * is mapped.
+         */
+        if (scan_global_lru(sc))
+                mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+                                global_page_state(NR_ANON_PAGES)) * 100) /
+                                        vm_total_pages;
+        else
+                mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
+
+        /*
+         * Now decide how much we really want to unmap some pages.  The
+         * mapped ratio is downgraded - just because there's a lot of
+         * mapped memory doesn't necessarily mean that page reclaim
+         * isn't succeeding.
+         *
+         * The distress ratio is important - we don't want to start
+         * going oom.
+         *
+         * A 100% value of vm_swappiness overrides this algorithm
+         * altogether.
+         */
+        swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+
+        /*
+         * If there's huge imbalance between active and inactive
+         * (think active 100 times larger than inactive) we should
+         * become more permissive, or the system will take too much
+         * cpu before it start swapping during memory pressure.
+         * Distress is about avoiding early-oom, this is about
+         * making swappiness graceful despite setting it to low
+         * values.
+         *
+         * Avoid div by zero with nr_inactive+1, and max resulting
+         * value is vm_total_pages.
+         */
+        if (scan_global_lru(sc)) {
+                imbalance  = zone_page_state(zone, NR_ACTIVE);
+                imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+        } else
+                imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
+
+        /*
+         * Reduce the effect of imbalance if swappiness is low,
+         * this means for a swappiness very low, the imbalance
+         * must be much higher than 100 for this logic to make
+         * the difference.
+         *
+         * Max temporary value is vm_total_pages*100.
+         */
+        imbalance *= (vm_swappiness + 1);
+        imbalance /= 100;
+
+        /*
+         * If not much of the ram is mapped, makes the imbalance
+         * less relevant, it's high priority we refill the inactive
+         * list with mapped pages only in presence of high ratio of
+         * mapped pages.
+         *
+         * Max temporary value is vm_total_pages*100.
+         */
+        imbalance *= mapped_ratio;
+        imbalance /= 100;
+
+        /* apply imbalance feedback to swap_tendency */
+        swap_tendency += imbalance;
+
+        /*
+         * Now use this metric to decide whether to start moving mapped
+         * memory onto the inactive list.
+         */
+        if (swap_tendency >= 100)
+                reclaim_mapped = 1;
+
+        return reclaim_mapped;
+}
+
+/*
  * This moves pages from the active list to the inactive list.
  *
  * We move them the other way if the page is referenced by one or more
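To make the calc_reclaim_mapped() arithmetic above concrete, here is a small stand-alone worked example. The numbers (priority, mapped ratio, swappiness, list sizes) are invented for illustration; only the formulas come from the hunk above:

#include <stdio.h>

/* Worked example of the calc_reclaim_mapped() arithmetic with made-up
 * numbers: priority has dropped to 10, half of RAM is mapped, swappiness
 * is the common default of 60, and the active list is three times the
 * size of the inactive list. */
int main(void)
{
        long prev_priority = 10, priority = 10;
        long mapped_ratio = 50;         /* percent of pages that are mapped */
        long swappiness = 60;           /* sc->swappiness / vm_swappiness */
        long nr_active = 300000, nr_inactive = 100000;

        long distress = 100 >> (prev_priority < priority ? prev_priority : priority);
        long swap_tendency = mapped_ratio / 2 + distress + swappiness;

        long imbalance = nr_active / (nr_inactive + 1);
        imbalance = imbalance * (swappiness + 1) / 100;
        imbalance = imbalance * mapped_ratio / 100;
        swap_tendency += imbalance;

        /* 25 + 0 + 60 = 85 plus a negligible imbalance term: below the
         * threshold of 100, so mapped pages are left alone. */
        printf("swap_tendency = %ld -> reclaim_mapped = %d\n",
               swap_tendency, swap_tendency >= 100);
        return 0;
}

Once the scan priority approaches 0, distress alone reaches 100 and reclaim_mapped flips to 1; the only change the patch makes is that prev_priority and mapped_ratio come from the mem_cgroup helpers instead of the zone when reclaim was triggered by a cgroup limit.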
@@ -959,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone)
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
+
+
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                                struct scan_control *sc, int priority)
 {
@@ -972,100 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        struct pagevec pvec;
        int reclaim_mapped = 0;

-       if (sc->may_swap) {
-               long mapped_ratio;
-               long distress;
-               long swap_tendency;
-               long imbalance;
-
-               if (zone_is_near_oom(zone))
-                       goto force_reclaim_mapped;
-
-               /*
-                * `distress' is a measure of how much trouble we're having
-                * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-                */
-               distress = 100 >> min(zone->prev_priority, priority);
-
-               /*
-                * The point of this algorithm is to decide when to start
-                * reclaiming mapped memory instead of just pagecache.  Work out
-                * how much memory
-                * is mapped.
-                */
-               mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-                               global_page_state(NR_ANON_PAGES)) * 100) /
-                                       vm_total_pages;
-
-               /*
-                * Now decide how much we really want to unmap some pages.  The
-                * mapped ratio is downgraded - just because there's a lot of
-                * mapped memory doesn't necessarily mean that page reclaim
-                * isn't succeeding.
-                *
-                * The distress ratio is important - we don't want to start
-                * going oom.
-                *
-                * A 100% value of vm_swappiness overrides this algorithm
-                * altogether.
-                */
-               swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-               /*
-                * If there's huge imbalance between active and inactive
-                * (think active 100 times larger than inactive) we should
-                * become more permissive, or the system will take too much
-                * cpu before it start swapping during memory pressure.
-                * Distress is about avoiding early-oom, this is about
-                * making swappiness graceful despite setting it to low
-                * values.
-                *
-                * Avoid div by zero with nr_inactive+1, and max resulting
-                * value is vm_total_pages.
-                */
-               imbalance  = zone_page_state(zone, NR_ACTIVE);
-               imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-
-               /*
-                * Reduce the effect of imbalance if swappiness is low,
-                * this means for a swappiness very low, the imbalance
-                * must be much higher than 100 for this logic to make
-                * the difference.
-                *
-                * Max temporary value is vm_total_pages*100.
-                */
-               imbalance *= (vm_swappiness + 1);
-               imbalance /= 100;
-
-               /*
-                * If not much of the ram is mapped, makes the imbalance
-                * less relevant, it's high priority we refill the inactive
-                * list with mapped pages only in presence of high ratio of
-                * mapped pages.
-                *
-                * Max temporary value is vm_total_pages*100.
-                */
-               imbalance *= mapped_ratio;
-               imbalance /= 100;
-
-               /* apply imbalance feedback to swap_tendency */
-               swap_tendency += imbalance;
-
-               /*
-                * Now use this metric to decide whether to start moving mapped
-                * memory onto the inactive list.
-                */
-               if (swap_tendency >= 100)
-force_reclaim_mapped:
-                       reclaim_mapped = 1;
-       }
+       if (sc->may_swap)
+               reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);

        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
        pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
                                        ISOLATE_ACTIVE, zone,
                                        sc->mem_cgroup, 1);
-       zone->pages_scanned += pgscanned;
+       /*
+        * zone->pages_scanned is used for detect zone's oom
+        * mem_cgroup remembers nr_scan by itself.
+        */
+       if (scan_global_lru(sc))
+               zone->pages_scanned += pgscanned;
+
        __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
        spin_unlock_irq(&zone->lru_lock);

@@ -1155,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
        unsigned long nr_to_scan;
        unsigned long nr_reclaimed = 0;

-       /*
-        * Add one to `nr_to_scan' just to make sure that the kernel will
-        * slowly sift through the active list.
-        */
-       zone->nr_scan_active +=
-               (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-       nr_active = zone->nr_scan_active;
-       if (nr_active >= sc->swap_cluster_max)
-               zone->nr_scan_active = 0;
-       else
-               nr_active = 0;
+       if (scan_global_lru(sc)) {
+               /*
+                * Add one to nr_to_scan just to make sure that the kernel
+                * will slowly sift through the active list.
+                */
+               zone->nr_scan_active +=
+                       (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
+               nr_active = zone->nr_scan_active;
+               zone->nr_scan_inactive +=
+                       (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
+               nr_inactive = zone->nr_scan_inactive;
+               if (nr_inactive >= sc->swap_cluster_max)
+                       zone->nr_scan_inactive = 0;
+               else
+                       nr_inactive = 0;
+
+               if (nr_active >= sc->swap_cluster_max)
+                       zone->nr_scan_active = 0;
+               else
+                       nr_active = 0;
+       } else {
+               /*
+                * This reclaim occurs not because zone memory shortage but
+                * because memory controller hits its limit.
+                * Then, don't modify zone reclaim related data.
+                */
+               nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
+                                       zone, priority);
+
+               nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
+                                       zone, priority);
+       }

-       zone->nr_scan_inactive +=
-               (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-       nr_inactive = zone->nr_scan_inactive;
-       if (nr_inactive >= sc->swap_cluster_max)
-               zone->nr_scan_inactive = 0;
-       else
-               nr_inactive = 0;

        while (nr_active || nr_inactive) {
                if (nr_active) {
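As a quick illustration of the global-LRU branch above (the "slowly sift" accumulation and batching), here is a stand-alone sketch with made-up numbers; the batch size of 32 mirrors the usual SWAP_CLUSTER_MAX but is an assumption here:

#include <stdio.h>

/* Worked example: how zone->nr_scan_active accumulates in the global
 * branch of shrink_zone().  Numbers are invented for illustration. */
int main(void)
{
        unsigned long nr_zone_active = 100000;  /* zone_page_state(zone, NR_ACTIVE) */
        unsigned long swap_cluster_max = 32;    /* sc->swap_cluster_max */
        unsigned long nr_scan_active = 0;
        int priority;

        for (priority = 12; priority >= 0; priority--) {
                unsigned long nr_active;

                nr_scan_active += (nr_zone_active >> priority) + 1;
                nr_active = nr_scan_active;
                if (nr_active >= swap_cluster_max)
                        nr_scan_active = 0;     /* scan this batch now */
                else
                        nr_active = 0;          /* too few: defer to a later pass */

                printf("priority %2d: scan %lu active pages\n",
                       priority, nr_active);
        }
        return 0;
}

The memory-cgroup branch deliberately skips this bookkeeping and instead asks mem_cgroup_calc_reclaim_active()/mem_cgroup_calc_reclaim_inactive() for its scan counts, so zone->nr_scan_* only ever reflects global reclaim.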
@@ -1218,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
        unsigned long nr_reclaimed = 0;
        int i;

+
        sc->all_unreclaimable = 1;
        for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];

                if (!populated_zone(zone))
                        continue;
+               /*
+                * Take care memory controller reclaiming has small influence
+                * to global LRU.
+                */
+               if (scan_global_lru(sc)) {
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
+                       note_zone_scanning_priority(zone, priority);

-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
-
-               note_zone_scanning_priority(zone, priority);
-
-               if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
-                       continue;       /* Let kswapd poll it */
-
-               sc->all_unreclaimable = 0;
+                       if (zone_is_all_unreclaimable(zone) &&
+                                               priority != DEF_PRIORITY)
+                               continue;       /* Let kswapd poll it */
+                       sc->all_unreclaimable = 0;
+               } else {
+                       /*
+                        * Ignore cpuset limitation here. We just want to reduce
+                        * # of used pages by us regardless of memory shortage.
+                        */
+                       sc->all_unreclaimable = 0;
+                       mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+                                                       priority);
+               }

                nr_reclaimed += shrink_zone(priority, zone, sc);
        }
+
        return nr_reclaimed;
 }

@@ -1264,16 +1324,21 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
        unsigned long lru_pages = 0;
        int i;

-       count_vm_event(ALLOCSTALL);
-
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *zone = zones[i];
+       if (scan_global_lru(sc))
+               count_vm_event(ALLOCSTALL);
+       /*
+        * mem_cgroup will not do shrink_slab.
+        */
+       if (scan_global_lru(sc)) {
+               for (i = 0; zones[i] != NULL; i++) {
+                       struct zone *zone = zones[i];

                if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                        continue;

                lru_pages += zone_page_state(zone, NR_ACTIVE)
                        + zone_page_state(zone, NR_INACTIVE);
+               }
        }

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -1330,14 +1395,19 @@ out:
         */
        if (priority < 0)
                priority = 0;
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *zone = zones[i];

-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
+       if (scan_global_lru(sc)) {
+               for (i = 0; zones[i] != NULL; i++) {
+                       struct zone *zone = zones[i];
+
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
+
+                       zone->prev_priority = priority;
+               }
+       } else
+               mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);

-               zone->prev_priority = priority;
-       }
        return ret;
 }
