author     Balbir Singh <balbir@linux.vnet.ibm.com>              2008-02-07 03:13:56 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 11:42:18 -0500
commit     66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch)
tree       d850a729887485874c976ba64eb85e3406e488a1 /mm/memcontrol.c
parent     67e465a77ba658635309ee00b367bec6555ea544 (diff)
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has
been modified to make isolate_lru_pages() a pluggable component. The
scan_control data structure now accepts the cgroup on behalf of which
reclaim is carried out. try_to_free_pages() has been extended to become
cgroup aware.
[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
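[Editorial note: the scan_control and try_to_free_pages() changes mentioned above live in
mm/vmscan.c and are therefore outside this diffstat, which is limited to mm/memcontrol.c.
A minimal sketch of the shape they take is shown below; the field names and comments are
illustrative rather than a quote of the applied hunks, but the callback signature has to
match mem_cgroup_isolate_pages() from the diff that follows.]

/* Sketch only -- the real hunks live in mm/vmscan.c, not mm/memcontrol.c. */
struct scan_control {
	/* ... existing fields: gfp_mask, may_writepage, swap_cluster_max, ... */

	/* Which cgroup reclaim acts on behalf of; NULL means global reclaim */
	struct mem_cgroup *mem_cgroup;

	/*
	 * Pluggable isolation step: global reclaim keeps walking the zone
	 * LRU, while cgroup reclaim plugs in mem_cgroup_isolate_pages()
	 * to walk the per cgroup LRU instead.
	 */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active);
};

try_to_free_mem_cgroup_pages(), which the charge path below retries against, would fill in
such a scan_control with the target cgroup and the per cgroup isolator before running the
usual do_try_to_free_pages() loop.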
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   148
1 file changed, 143 insertions(+), 5 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b25df2a9d024..9e9ff914c0f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -22,10 +22,15 @@
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
+#include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
+static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
 
 /*
  * The memory controller data structure. The memory controller controls both
@@ -51,6 +56,10 @@ struct mem_cgroup {
 	 */
 	struct list_head active_list;
 	struct list_head inactive_list;
+	/*
+	 * spin_lock to protect the per cgroup LRU
+	 */
+	spinlock_t lru_lock;
 };
 
 /*
@@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page)
 	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
+void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	if (active)
+		list_move(&pc->lru, &pc->mem_cgroup->active_list);
+	else
+		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	struct mem_cgroup *mem;
+	if (!pc)
+		return;
+
+	mem = pc->mem_cgroup;
+
+	spin_lock(&mem->lru_lock);
+	__mem_cgroup_move_lists(pc, active);
+	spin_unlock(&mem->lru_lock);
+}
+
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+					struct list_head *dst,
+					unsigned long *scanned, int order,
+					int mode, struct zone *z,
+					struct mem_cgroup *mem_cont,
+					int active)
+{
+	unsigned long nr_taken = 0;
+	struct page *page;
+	unsigned long scan;
+	LIST_HEAD(pc_list);
+	struct list_head *src;
+	struct page_cgroup *pc;
+
+	if (active)
+		src = &mem_cont->active_list;
+	else
+		src = &mem_cont->inactive_list;
+
+	spin_lock(&mem_cont->lru_lock);
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+		pc = list_entry(src->prev, struct page_cgroup, lru);
+		page = pc->page;
+		VM_BUG_ON(!pc);
+
+		if (PageActive(page) && !active) {
+			__mem_cgroup_move_lists(pc, true);
+			scan--;
+			continue;
+		}
+		if (!PageActive(page) && active) {
+			__mem_cgroup_move_lists(pc, false);
+			scan--;
+			continue;
+		}
+
+		/*
+		 * Reclaim, per zone
+		 * TODO: make the active/inactive lists per zone
+		 */
+		if (page_zone(page) != z)
+			continue;
+
+		/*
+		 * Check if the meta page went away from under us
+		 */
+		if (!list_empty(&pc->lru))
+			list_move(&pc->lru, &pc_list);
+		else
+			continue;
+
+		if (__isolate_lru_page(page, mode) == 0) {
+			list_move(&page->lru, dst);
+			nr_taken++;
+		}
+	}
+
+	list_splice(&pc_list, src);
+	spin_unlock(&mem_cont->lru_lock);
+
+	*scanned = scan;
+	return nr_taken;
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc, *race_pc;
+	unsigned long flags;
+	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
 	/*
 	 * Should page_cgroup's go to their own slab?
@@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	 * to see if the cgroup page already has a page_cgroup associated
 	 * with it
 	 */
+retry:
 	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
 	/*
 	 * The page_cgroup exists and the page has already been accounted
 	 */
 	if (pc) {
-		atomic_inc(&pc->ref_cnt);
-		goto done;
+		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
+			/* this page is under being uncharged ? */
+			unlock_page_cgroup(page);
+			cpu_relax();
+			goto retry;
+		} else
+			goto done;
 	}
 
 	unlock_page_cgroup(page);
@@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	 * If we created the page_cgroup, we should free it on exceeding
 	 * the cgroup limit.
 	 */
-	if (res_counter_charge(&mem->res, 1)) {
+	while (res_counter_charge(&mem->res, 1)) {
+		if (try_to_free_mem_cgroup_pages(mem))
+			continue;
+
+		/*
+		 * try_to_free_mem_cgroup_pages() might not give us a full
+		 * picture of reclaim. Some pages are reclaimed and might be
+		 * moved to swap cache or just unmapped from the cgroup.
+		 * Check the limit again to see if the reclaim reduced the
+		 * current usage of the cgroup before giving up
+		 */
+		if (res_counter_check_under_limit(&mem->res))
+			continue;
+		/*
+		 * Since we control both RSS and cache, we end up with a
+		 * very interesting scenario where we end up reclaiming
+		 * memory (essentially RSS), since the memory is pushed
+		 * to swap cache, we eventually end up adding those
+		 * pages back to our list. Hence we give ourselves a
+		 * few chances before we fail
+		 */
+		else if (nr_retries--) {
+			congestion_wait(WRITE, HZ/10);
+			continue;
+		}
+
 		css_put(&mem->css);
 		goto free_pc;
 	}
@@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	pc->page = page;
 	page_assign_page_cgroup(page, pc);
 
+	spin_lock_irqsave(&mem->lru_lock, flags);
+	list_add(&pc->lru, &mem->active_list);
+	spin_unlock_irqrestore(&mem->lru_lock, flags);
+
 done:
 	unlock_page_cgroup(page);
 	return 0;
 free_pc:
 	kfree(pc);
-	return -ENOMEM;
 err:
-	unlock_page_cgroup(page);
 	return -ENOMEM;
 }
 
@@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 {
 	struct mem_cgroup *mem;
 	struct page *page;
+	unsigned long flags;
 
 	if (!pc)
 		return;
@@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 		page_assign_page_cgroup(page, NULL);
 		unlock_page_cgroup(page);
 		res_counter_uncharge(&mem->res, 1);
+
+		spin_lock_irqsave(&mem->lru_lock, flags);
+		list_del_init(&pc->lru);
+		spin_unlock_irqrestore(&mem->lru_lock, flags);
 		kfree(pc);
 	}
 }
@@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	res_counter_init(&mem->res);
 	INIT_LIST_HEAD(&mem->active_list);
 	INIT_LIST_HEAD(&mem->inactive_list);
+	spin_lock_init(&mem->lru_lock);
 	return &mem->css;
 }
 
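[Editorial note: to close the loop, here is a sketch of how the shrink path in mm/vmscan.c
is expected to call through the hook instead of hard-coding isolate_lru_pages(). The helper
name isolate_some_pages() and the specific arguments are hypothetical, modelled on the
shrink_inactive_list() call site; only the indirection through sc->isolate_pages is the point.]

/*
 * Hypothetical helper modelled on the shrink_inactive_list() call site in
 * mm/vmscan.c; it is not part of the mm/memcontrol.c diff above.
 */
static unsigned long isolate_some_pages(struct zone *zone,
					struct scan_control *sc,
					struct list_head *page_list)
{
	unsigned long nr_scan;
	unsigned long nr_taken;

	spin_lock_irq(&zone->lru_lock);
	/*
	 * With sc->isolate_pages == mem_cgroup_isolate_pages and
	 * sc->mem_cgroup set, this walks the cgroup's inactive list;
	 * with the global isolator it walks the zone's inactive list.
	 */
	nr_taken = sc->isolate_pages(sc->swap_cluster_max, page_list,
				     &nr_scan, sc->order, ISOLATE_INACTIVE,
				     zone, sc->mem_cgroup, 0);
	spin_unlock_irq(&zone->lru_lock);

	return nr_taken;
}

Keeping the indirection inside scan_control means the core shrink loop stays identical for
global and cgroup-scoped reclaim; only the source of the pages changes.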