path: root/mm/memcontrol.c
author     Balbir Singh <balbir@linux.vnet.ibm.com>	2008-02-07 03:13:56 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:18 -0500
commit     66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch)
tree       d850a729887485874c976ba64eb85e3406e488a1 /mm/memcontrol.c
parent     67e465a77ba658635309ee00b367bec6555ea544 (diff)
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has been
modified to make isolate_lru_pages() a pluggable component. The scan_control
data structure now accepts the cgroup on behalf of which reclaims are carried
out. try_to_free_pages() has been extended to become cgroup aware.

[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]

Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
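Note: the diffstat below is limited to mm/memcontrol.c, so the scan_control and
try_to_free_pages() changes mentioned above (which live in mm/vmscan.c) are not
shown. As a rough sketch of what "pluggable" means here (the field names and
comments are assumptions inferred from the mem_cgroup_isolate_pages() signature
added below, not the verbatim vmscan.c hunk), scan_control would gain a target
cgroup plus an isolation callback that global reclaim points at
isolate_lru_pages() and cgroup reclaim points at mem_cgroup_isolate_pages():

    /* Sketch only: assumed shape of the mm/vmscan.c side of this patch. */
    struct scan_control {
    	/* ... existing fields ... */

    	/* cgroup we reclaim on behalf of; NULL means global reclaim */
    	struct mem_cgroup *mem_cgroup;

    	/* pluggable isolation hook, matching mem_cgroup_isolate_pages() below */
    	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
    			unsigned long *scanned, int order, int mode,
    			struct zone *z, struct mem_cgroup *mem_cont,
    			int active);
    };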
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	148
1 file changed, 143 insertions(+), 5 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b25df2a9d024..9e9ff914c0f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -22,10 +22,15 @@
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
+#include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
+static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
 
 /*
  * The memory controller data structure. The memory controller controls both
@@ -51,6 +56,10 @@ struct mem_cgroup {
 	 */
 	struct list_head active_list;
 	struct list_head inactive_list;
+	/*
+	 * spin_lock to protect the per cgroup LRU
+	 */
+	spinlock_t lru_lock;
 };
 
 /*
@@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page)
 	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
+void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	if (active)
+		list_move(&pc->lru, &pc->mem_cgroup->active_list);
+	else
+		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	struct mem_cgroup *mem;
+	if (!pc)
+		return;
+
+	mem = pc->mem_cgroup;
+
+	spin_lock(&mem->lru_lock);
+	__mem_cgroup_move_lists(pc, active);
+	spin_unlock(&mem->lru_lock);
+}
+
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+					struct list_head *dst,
+					unsigned long *scanned, int order,
+					int mode, struct zone *z,
+					struct mem_cgroup *mem_cont,
+					int active)
+{
+	unsigned long nr_taken = 0;
+	struct page *page;
+	unsigned long scan;
+	LIST_HEAD(pc_list);
+	struct list_head *src;
+	struct page_cgroup *pc;
+
+	if (active)
+		src = &mem_cont->active_list;
+	else
+		src = &mem_cont->inactive_list;
+
+	spin_lock(&mem_cont->lru_lock);
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+		pc = list_entry(src->prev, struct page_cgroup, lru);
+		page = pc->page;
+		VM_BUG_ON(!pc);
+
+		if (PageActive(page) && !active) {
+			__mem_cgroup_move_lists(pc, true);
+			scan--;
+			continue;
+		}
+		if (!PageActive(page) && active) {
+			__mem_cgroup_move_lists(pc, false);
+			scan--;
+			continue;
+		}
+
+		/*
+		 * Reclaim, per zone
+		 * TODO: make the active/inactive lists per zone
+		 */
+		if (page_zone(page) != z)
+			continue;
+
+		/*
+		 * Check if the meta page went away from under us
+		 */
+		if (!list_empty(&pc->lru))
+			list_move(&pc->lru, &pc_list);
+		else
+			continue;
+
+		if (__isolate_lru_page(page, mode) == 0) {
+			list_move(&page->lru, dst);
+			nr_taken++;
+		}
+	}
+
+	list_splice(&pc_list, src);
+	spin_unlock(&mem_cont->lru_lock);
+
+	*scanned = scan;
+	return nr_taken;
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc, *race_pc;
+	unsigned long flags;
+	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
 	/*
 	 * Should page_cgroup's go to their own slab?
@@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	 * to see if the cgroup page already has a page_cgroup associated
 	 * with it
 	 */
+retry:
 	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
 	/*
 	 * The page_cgroup exists and the page has already been accounted
 	 */
 	if (pc) {
-		atomic_inc(&pc->ref_cnt);
-		goto done;
+		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
+			/* this page is under being uncharged ? */
+			unlock_page_cgroup(page);
+			cpu_relax();
+			goto retry;
+		} else
+			goto done;
 	}
 
 	unlock_page_cgroup(page);
@@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	 * If we created the page_cgroup, we should free it on exceeding
 	 * the cgroup limit.
 	 */
-	if (res_counter_charge(&mem->res, 1)) {
+	while (res_counter_charge(&mem->res, 1)) {
+		if (try_to_free_mem_cgroup_pages(mem))
+			continue;
+
+		/*
+		 * try_to_free_mem_cgroup_pages() might not give us a full
+		 * picture of reclaim. Some pages are reclaimed and might be
+		 * moved to swap cache or just unmapped from the cgroup.
+		 * Check the limit again to see if the reclaim reduced the
+		 * current usage of the cgroup before giving up
+		 */
+		if (res_counter_check_under_limit(&mem->res))
+			continue;
+		/*
+		 * Since we control both RSS and cache, we end up with a
+		 * very interesting scenario where we end up reclaiming
+		 * memory (essentially RSS), since the memory is pushed
+		 * to swap cache, we eventually end up adding those
+		 * pages back to our list. Hence we give ourselves a
+		 * few chances before we fail
+		 */
+		else if (nr_retries--) {
+			congestion_wait(WRITE, HZ/10);
+			continue;
+		}
+
 		css_put(&mem->css);
 		goto free_pc;
 	}
@@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	pc->page = page;
 	page_assign_page_cgroup(page, pc);
 
+	spin_lock_irqsave(&mem->lru_lock, flags);
+	list_add(&pc->lru, &mem->active_list);
+	spin_unlock_irqrestore(&mem->lru_lock, flags);
+
 done:
 	unlock_page_cgroup(page);
 	return 0;
 free_pc:
 	kfree(pc);
-	return -ENOMEM;
 err:
-	unlock_page_cgroup(page);
 	return -ENOMEM;
 }
 
@@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 {
 	struct mem_cgroup *mem;
 	struct page *page;
+	unsigned long flags;
 
 	if (!pc)
 		return;
@@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 		page_assign_page_cgroup(page, NULL);
 		unlock_page_cgroup(page);
 		res_counter_uncharge(&mem->res, 1);
+
+		spin_lock_irqsave(&mem->lru_lock, flags);
+		list_del_init(&pc->lru);
+		spin_unlock_irqrestore(&mem->lru_lock, flags);
 		kfree(pc);
 	}
 }
@@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	res_counter_init(&mem->res);
 	INIT_LIST_HEAD(&mem->active_list);
 	INIT_LIST_HEAD(&mem->inactive_list);
+	spin_lock_init(&mem->lru_lock);
 	return &mem->css;
 }
 