author     Balbir Singh <balbir@linux.vnet.ibm.com>              2008-02-07 03:13:56 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-07 11:42:18 -0500
commit     66e1707bc34609f626e2e7b4fe7e454c9748bad5 (patch)
tree       d850a729887485874c976ba64eb85e3406e488a1 /mm/memcontrol.c
parent     67e465a77ba658635309ee00b367bec6555ea544 (diff)
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has
been modified to make isolate_lru_pages() a pluggable component. The
scan_control data structure now accepts the cgroup on behalf of which
reclaim is carried out. try_to_free_pages() has been extended to become
cgroup aware.
[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
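[Editorial note: the scan_control and try_to_free_pages() changes mentioned above live in
mm/vmscan.c and are therefore outside this diffstat, which is limited to mm/memcontrol.c.
A minimal sketch of the shape they take is shown below; the field names and comments are
illustrative rather than a quote of the applied hunks, but the callback signature has to
match mem_cgroup_isolate_pages() from the diff that follows.]

/* Sketch only -- the real hunks live in mm/vmscan.c, not mm/memcontrol.c. */
struct scan_control {
	/* ... existing fields: gfp_mask, may_writepage, swap_cluster_max, ... */

	/* Which cgroup reclaim acts on behalf of; NULL means global reclaim */
	struct mem_cgroup *mem_cgroup;

	/*
	 * Pluggable isolation step: global reclaim keeps walking the zone
	 * LRU, while cgroup reclaim plugs in mem_cgroup_isolate_pages()
	 * to walk the per cgroup LRU instead.
	 */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active);
};

try_to_free_mem_cgroup_pages(), which the charge path below retries against, would fill in
such a scan_control with the target cgroup and the per cgroup isolator before running the
usual do_try_to_free_pages() loop.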
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   148
1 file changed, 143 insertions(+), 5 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b25df2a9d024..9e9ff914c0f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -22,10 +22,15 @@
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
+#include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
+static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
 
 /*
  * The memory controller data structure. The memory controller controls both
@@ -51,6 +56,10 @@ struct mem_cgroup {
 	 */
 	struct list_head active_list;
 	struct list_head inactive_list;
+	/*
+	 * spin_lock to protect the per cgroup LRU
+	 */
+	spinlock_t lru_lock;
 };
 
 /*
@@ -141,6 +150,94 @@ void __always_inline unlock_page_cgroup(struct page *page)
 	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 }
 
+void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	if (active)
+		list_move(&pc->lru, &pc->mem_cgroup->active_list);
+	else
+		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+	struct mem_cgroup *mem;
+	if (!pc)
+		return;
+
+	mem = pc->mem_cgroup;
+
+	spin_lock(&mem->lru_lock);
+	__mem_cgroup_move_lists(pc, active);
+	spin_unlock(&mem->lru_lock);
+}
+
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+					struct list_head *dst,
+					unsigned long *scanned, int order,
+					int mode, struct zone *z,
+					struct mem_cgroup *mem_cont,
+					int active)
+{
+	unsigned long nr_taken = 0;
+	struct page *page;
+	unsigned long scan;
+	LIST_HEAD(pc_list);
+	struct list_head *src;
+	struct page_cgroup *pc;
+
+	if (active)
+		src = &mem_cont->active_list;
+	else
+		src = &mem_cont->inactive_list;
+
+	spin_lock(&mem_cont->lru_lock);
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+		pc = list_entry(src->prev, struct page_cgroup, lru);
+		page = pc->page;
+		VM_BUG_ON(!pc);
+
+		if (PageActive(page) && !active) {
+			__mem_cgroup_move_lists(pc, true);
+			scan--;
+			continue;
+		}
+		if (!PageActive(page) && active) {
+			__mem_cgroup_move_lists(pc, false);
+			scan--;
+			continue;
+		}
+
+		/*
+		 * Reclaim, per zone
+		 * TODO: make the active/inactive lists per zone
+		 */
+		if (page_zone(page) != z)
+			continue;
+
+		/*
+		 * Check if the meta page went away from under us
+		 */
+		if (!list_empty(&pc->lru))
+			list_move(&pc->lru, &pc_list);
+		else
+			continue;
+
+		if (__isolate_lru_page(page, mode) == 0) {
+			list_move(&page->lru, dst);
+			nr_taken++;
+		}
+	}
+
+	list_splice(&pc_list, src);
+	spin_unlock(&mem_cont->lru_lock);
+
+	*scanned = scan;
+	return nr_taken;
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -151,6 +248,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc, *race_pc;
+	unsigned long flags;
+	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 
 	/*
 	 * Should page_cgroup's go to their own slab?
@@ -159,14 +258,20 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	 * to see if the cgroup page already has a page_cgroup associated
 	 * with it
 	 */
+retry:
 	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
 	/*
 	 * The page_cgroup exists and the page has already been accounted
 	 */
 	if (pc) {
-		atomic_inc(&pc->ref_cnt);
-		goto done;
+		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
+			/* this page is under being uncharged ? */
+			unlock_page_cgroup(page);
+			cpu_relax();
+			goto retry;
+		} else
+			goto done;
 	}
 
 	unlock_page_cgroup(page);
@@ -197,7 +302,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	 * If we created the page_cgroup, we should free it on exceeding
 	 * the cgroup limit.
 	 */
-	if (res_counter_charge(&mem->res, 1)) {
+	while (res_counter_charge(&mem->res, 1)) {
+		if (try_to_free_mem_cgroup_pages(mem))
+			continue;
+
+		/*
+		 * try_to_free_mem_cgroup_pages() might not give us a full
+		 * picture of reclaim. Some pages are reclaimed and might be
+		 * moved to swap cache or just unmapped from the cgroup.
+		 * Check the limit again to see if the reclaim reduced the
+		 * current usage of the cgroup before giving up
+		 */
+		if (res_counter_check_under_limit(&mem->res))
+			continue;
+		/*
+		 * Since we control both RSS and cache, we end up with a
+		 * very interesting scenario where we end up reclaiming
+		 * memory (essentially RSS), since the memory is pushed
+		 * to swap cache, we eventually end up adding those
+		 * pages back to our list. Hence we give ourselves a
+		 * few chances before we fail
+		 */
+		else if (nr_retries--) {
+			congestion_wait(WRITE, HZ/10);
+			continue;
+		}
+
 		css_put(&mem->css);
 		goto free_pc;
 	}
@@ -221,14 +351,16 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 	pc->page = page;
 	page_assign_page_cgroup(page, pc);
 
+	spin_lock_irqsave(&mem->lru_lock, flags);
+	list_add(&pc->lru, &mem->active_list);
+	spin_unlock_irqrestore(&mem->lru_lock, flags);
+
 done:
 	unlock_page_cgroup(page);
 	return 0;
 free_pc:
 	kfree(pc);
-	return -ENOMEM;
 err:
-	unlock_page_cgroup(page);
 	return -ENOMEM;
 }
 
@@ -240,6 +372,7 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 {
 	struct mem_cgroup *mem;
 	struct page *page;
+	unsigned long flags;
 
 	if (!pc)
 		return;
@@ -252,6 +385,10 @@ void mem_cgroup_uncharge(struct page_cgroup *pc)
 		page_assign_page_cgroup(page, NULL);
 		unlock_page_cgroup(page);
 		res_counter_uncharge(&mem->res, 1);
+
+		spin_lock_irqsave(&mem->lru_lock, flags);
+		list_del_init(&pc->lru);
+		spin_unlock_irqrestore(&mem->lru_lock, flags);
 		kfree(pc);
 	}
 }
@@ -310,6 +447,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	res_counter_init(&mem->res);
 	INIT_LIST_HEAD(&mem->active_list);
 	INIT_LIST_HEAD(&mem->inactive_list);
+	spin_lock_init(&mem->lru_lock);
 	return &mem->css;
 }
 
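[Editorial note: to close the loop, here is a sketch of how the shrink path in mm/vmscan.c
is expected to call through the hook instead of hard-coding isolate_lru_pages(). The helper
name isolate_some_pages() and the specific arguments are hypothetical, modelled on the
shrink_inactive_list() call site; only the indirection through sc->isolate_pages is the point.]

/*
 * Hypothetical helper modelled on the shrink_inactive_list() call site in
 * mm/vmscan.c; it is not part of the mm/memcontrol.c diff above.
 */
static unsigned long isolate_some_pages(struct zone *zone,
					struct scan_control *sc,
					struct list_head *page_list)
{
	unsigned long nr_scan;
	unsigned long nr_taken;

	spin_lock_irq(&zone->lru_lock);
	/*
	 * With sc->isolate_pages == mem_cgroup_isolate_pages and
	 * sc->mem_cgroup set, this walks the cgroup's inactive list;
	 * with the global isolator it walks the zone's inactive list.
	 */
	nr_taken = sc->isolate_pages(sc->swap_cluster_max, page_list,
				     &nr_scan, sc->order, ISOLATE_INACTIVE,
				     zone, sc->mem_cgroup, 0);
	spin_unlock_irq(&zone->lru_lock);

	return nr_taken;
}

Keeping the indirection inside scan_control means the core shrink loop stays identical for
global and cgroup-scoped reclaim; only the source of the pages changes.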