diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-01-07 21:08:01 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-08 11:31:05 -0500 |
commit | 08e552c69c6930d64722de3ec18c51844d06ee28 (patch) | |
tree | a744d57ed4b23401115f1033dcaac9e85d550e09 /mm/memcontrol.c | |
parent | 8c7c6e34a1256a5082d38c8e9bd1474476912715 (diff) |
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to its mem_cgroup's own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup are one-to-one and statically allocated.
- To find which LRU a page_cgroup is on, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle the LRU list of page_cgroup, we do the following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is a spin_lock, and we have to worry about deadlock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_unlock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicate the LRU/Active/Unevictable bits in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- the number of locks is reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 323 |
1 files changed, 141 insertions, 182 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efcf38f3b73..8ce4e9e47959 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/vmalloc.h> | 36 | #include <linux/vmalloc.h> |
37 | #include <linux/mm_inline.h> | 37 | #include <linux/mm_inline.h> |
38 | #include <linux/page_cgroup.h> | 38 | #include <linux/page_cgroup.h> |
39 | #include "internal.h" | ||
39 | 40 | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
41 | 42 | ||
@@ -100,7 +101,6 @@ struct mem_cgroup_per_zone { | |||
100 | /* | 101 | /* |
101 | * spin_lock to protect the per cgroup LRU | 102 | * spin_lock to protect the per cgroup LRU |
102 | */ | 103 | */ |
103 | spinlock_t lru_lock; | ||
104 | struct list_head lists[NR_LRU_LISTS]; | 104 | struct list_head lists[NR_LRU_LISTS]; |
105 | unsigned long count[NR_LRU_LISTS]; | 105 | unsigned long count[NR_LRU_LISTS]; |
106 | }; | 106 | }; |
@@ -163,14 +163,12 @@ enum charge_type { | |||
163 | /* only for here (for easy reading.) */ | 163 | /* only for here (for easy reading.) */ |
164 | #define PCGF_CACHE (1UL << PCG_CACHE) | 164 | #define PCGF_CACHE (1UL << PCG_CACHE) |
165 | #define PCGF_USED (1UL << PCG_USED) | 165 | #define PCGF_USED (1UL << PCG_USED) |
166 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | ||
167 | #define PCGF_LOCK (1UL << PCG_LOCK) | 166 | #define PCGF_LOCK (1UL << PCG_LOCK) |
168 | #define PCGF_FILE (1UL << PCG_FILE) | ||
169 | static const unsigned long | 167 | static const unsigned long |
170 | pcg_default_flags[NR_CHARGE_TYPE] = { | 168 | pcg_default_flags[NR_CHARGE_TYPE] = { |
171 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | 169 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
172 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | 170 | PCGF_USED | PCGF_LOCK, /* Anon */ |
173 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | 171 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
174 | 0, /* FORCE */ | 172 | 0, /* FORCE */ |
175 | }; | 173 | }; |
176 | 174 | ||
@@ -185,9 +183,6 @@ pcg_default_flags[NR_CHARGE_TYPE] = { | |||
185 | static void mem_cgroup_get(struct mem_cgroup *mem); | 183 | static void mem_cgroup_get(struct mem_cgroup *mem); |
186 | static void mem_cgroup_put(struct mem_cgroup *mem); | 184 | static void mem_cgroup_put(struct mem_cgroup *mem); |
187 | 185 | ||
188 | /* | ||
189 | * Always modified under lru lock. Then, not necessary to preempt_disable() | ||
190 | */ | ||
191 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 186 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
192 | struct page_cgroup *pc, | 187 | struct page_cgroup *pc, |
193 | bool charge) | 188 | bool charge) |
@@ -195,10 +190,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
195 | int val = (charge)? 1 : -1; | 190 | int val = (charge)? 1 : -1; |
196 | struct mem_cgroup_stat *stat = &mem->stat; | 191 | struct mem_cgroup_stat *stat = &mem->stat; |
197 | struct mem_cgroup_stat_cpu *cpustat; | 192 | struct mem_cgroup_stat_cpu *cpustat; |
193 | int cpu = get_cpu(); | ||
198 | 194 | ||
199 | VM_BUG_ON(!irqs_disabled()); | 195 | cpustat = &stat->cpustat[cpu]; |
200 | |||
201 | cpustat = &stat->cpustat[smp_processor_id()]; | ||
202 | if (PageCgroupCache(pc)) | 196 | if (PageCgroupCache(pc)) |
203 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 197 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); |
204 | else | 198 | else |
@@ -210,6 +204,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
210 | else | 204 | else |
211 | __mem_cgroup_stat_add_safe(cpustat, | 205 | __mem_cgroup_stat_add_safe(cpustat, |
212 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 206 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
207 | put_cpu(); | ||
213 | } | 208 | } |
214 | 209 | ||
215 | static struct mem_cgroup_per_zone * | 210 | static struct mem_cgroup_per_zone * |
@@ -264,80 +259,95 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
264 | struct mem_cgroup, css); | 259 | struct mem_cgroup, css); |
265 | } | 260 | } |
266 | 261 | ||
267 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 262 | /* |
268 | struct page_cgroup *pc) | 263 | * Following LRU functions are allowed to be used without PCG_LOCK. |
269 | { | 264 | * Operations are called by routine of global LRU independently from memcg. |
270 | int lru = LRU_BASE; | 265 | * What we have to take care of here is validness of pc->mem_cgroup. |
266 | * | ||
267 | * Changes to pc->mem_cgroup happens when | ||
268 | * 1. charge | ||
269 | * 2. moving account | ||
270 | * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. | ||
271 | * It is added to LRU before charge. | ||
272 | * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. | ||
273 | * When moving account, the page is not on LRU. It's isolated. | ||
274 | */ | ||
271 | 275 | ||
272 | if (PageCgroupUnevictable(pc)) | 276 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
273 | lru = LRU_UNEVICTABLE; | 277 | { |
274 | else { | 278 | struct page_cgroup *pc; |
275 | if (PageCgroupActive(pc)) | 279 | struct mem_cgroup *mem; |
276 | lru += LRU_ACTIVE; | 280 | struct mem_cgroup_per_zone *mz; |
277 | if (PageCgroupFile(pc)) | ||
278 | lru += LRU_FILE; | ||
279 | } | ||
280 | 281 | ||
282 | if (mem_cgroup_subsys.disabled) | ||
283 | return; | ||
284 | pc = lookup_page_cgroup(page); | ||
285 | /* can happen while we handle swapcache. */ | ||
286 | if (list_empty(&pc->lru)) | ||
287 | return; | ||
288 | mz = page_cgroup_zoneinfo(pc); | ||
289 | mem = pc->mem_cgroup; | ||
281 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 290 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
282 | 291 | list_del_init(&pc->lru); | |
283 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); | 292 | return; |
284 | list_del(&pc->lru); | ||
285 | } | 293 | } |
286 | 294 | ||
287 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 295 | void mem_cgroup_del_lru(struct page *page) |
288 | struct page_cgroup *pc, bool hot) | ||
289 | { | 296 | { |
290 | int lru = LRU_BASE; | 297 | mem_cgroup_del_lru_list(page, page_lru(page)); |
298 | } | ||
291 | 299 | ||
292 | if (PageCgroupUnevictable(pc)) | 300 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) |
293 | lru = LRU_UNEVICTABLE; | 301 | { |
294 | else { | 302 | struct mem_cgroup_per_zone *mz; |
295 | if (PageCgroupActive(pc)) | 303 | struct page_cgroup *pc; |
296 | lru += LRU_ACTIVE; | ||
297 | if (PageCgroupFile(pc)) | ||
298 | lru += LRU_FILE; | ||
299 | } | ||
300 | 304 | ||
301 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 305 | if (mem_cgroup_subsys.disabled) |
302 | if (hot) | 306 | return; |
303 | list_add(&pc->lru, &mz->lists[lru]); | ||
304 | else | ||
305 | list_add_tail(&pc->lru, &mz->lists[lru]); | ||
306 | 307 | ||
307 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | 308 | pc = lookup_page_cgroup(page); |
309 | smp_rmb(); | ||
310 | /* unused page is not rotated. */ | ||
311 | if (!PageCgroupUsed(pc)) | ||
312 | return; | ||
313 | mz = page_cgroup_zoneinfo(pc); | ||
314 | list_move(&pc->lru, &mz->lists[lru]); | ||
308 | } | 315 | } |
309 | 316 | ||
310 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) | 317 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) |
311 | { | 318 | { |
312 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 319 | struct page_cgroup *pc; |
313 | int active = PageCgroupActive(pc); | 320 | struct mem_cgroup_per_zone *mz; |
314 | int file = PageCgroupFile(pc); | ||
315 | int unevictable = PageCgroupUnevictable(pc); | ||
316 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | ||
317 | (LRU_FILE * !!file + !!active); | ||
318 | 321 | ||
319 | if (lru == from) | 322 | if (mem_cgroup_subsys.disabled) |
323 | return; | ||
324 | pc = lookup_page_cgroup(page); | ||
325 | /* barrier to sync with "charge" */ | ||
326 | smp_rmb(); | ||
327 | if (!PageCgroupUsed(pc)) | ||
320 | return; | 328 | return; |
321 | 329 | ||
322 | MEM_CGROUP_ZSTAT(mz, from) -= 1; | 330 | mz = page_cgroup_zoneinfo(pc); |
323 | /* | ||
324 | * However this is done under mz->lru_lock, another flags, which | ||
325 | * are not related to LRU, will be modified from out-of-lock. | ||
326 | * We have to use atomic set/clear flags. | ||
327 | */ | ||
328 | if (is_unevictable_lru(lru)) { | ||
329 | ClearPageCgroupActive(pc); | ||
330 | SetPageCgroupUnevictable(pc); | ||
331 | } else { | ||
332 | if (is_active_lru(lru)) | ||
333 | SetPageCgroupActive(pc); | ||
334 | else | ||
335 | ClearPageCgroupActive(pc); | ||
336 | ClearPageCgroupUnevictable(pc); | ||
337 | } | ||
338 | |||
339 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 331 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
340 | list_move(&pc->lru, &mz->lists[lru]); | 332 | list_add(&pc->lru, &mz->lists[lru]); |
333 | } | ||
334 | /* | ||
335 | * To add swapcache into LRU. Be careful to all this function. | ||
336 | * zone->lru_lock shouldn't be held and irq must not be disabled. | ||
337 | */ | ||
338 | static void mem_cgroup_lru_fixup(struct page *page) | ||
339 | { | ||
340 | if (!isolate_lru_page(page)) | ||
341 | putback_lru_page(page); | ||
342 | } | ||
343 | |||
344 | void mem_cgroup_move_lists(struct page *page, | ||
345 | enum lru_list from, enum lru_list to) | ||
346 | { | ||
347 | if (mem_cgroup_subsys.disabled) | ||
348 | return; | ||
349 | mem_cgroup_del_lru_list(page, from); | ||
350 | mem_cgroup_add_lru_list(page, to); | ||
341 | } | 351 | } |
342 | 352 | ||
343 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 353 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
@@ -351,37 +361,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
351 | } | 361 | } |
352 | 362 | ||
353 | /* | 363 | /* |
354 | * This routine assumes that the appropriate zone's lru lock is already held | ||
355 | */ | ||
356 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | ||
357 | { | ||
358 | struct page_cgroup *pc; | ||
359 | struct mem_cgroup_per_zone *mz; | ||
360 | unsigned long flags; | ||
361 | |||
362 | if (mem_cgroup_subsys.disabled) | ||
363 | return; | ||
364 | |||
365 | /* | ||
366 | * We cannot lock_page_cgroup while holding zone's lru_lock, | ||
367 | * because other holders of lock_page_cgroup can be interrupted | ||
368 | * with an attempt to rotate_reclaimable_page. But we cannot | ||
369 | * safely get to page_cgroup without it, so just try_lock it: | ||
370 | * mem_cgroup_isolate_pages allows for page left on wrong list. | ||
371 | */ | ||
372 | pc = lookup_page_cgroup(page); | ||
373 | if (!trylock_page_cgroup(pc)) | ||
374 | return; | ||
375 | if (pc && PageCgroupUsed(pc)) { | ||
376 | mz = page_cgroup_zoneinfo(pc); | ||
377 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
378 | __mem_cgroup_move_lists(pc, lru); | ||
379 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
380 | } | ||
381 | unlock_page_cgroup(pc); | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Calculate mapped_ratio under memory controller. This will be used in | 364 | * Calculate mapped_ratio under memory controller. This will be used in |
386 | * vmscan.c for deteremining we have to reclaim mapped pages. | 365 | * vmscan.c for deteremining we have to reclaim mapped pages. |
387 | */ | 366 | */ |
@@ -460,40 +439,24 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
460 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 439 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
461 | src = &mz->lists[lru]; | 440 | src = &mz->lists[lru]; |
462 | 441 | ||
463 | spin_lock(&mz->lru_lock); | ||
464 | scan = 0; | 442 | scan = 0; |
465 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 443 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
466 | if (scan >= nr_to_scan) | 444 | if (scan >= nr_to_scan) |
467 | break; | 445 | break; |
446 | |||
447 | page = pc->page; | ||
468 | if (unlikely(!PageCgroupUsed(pc))) | 448 | if (unlikely(!PageCgroupUsed(pc))) |
469 | continue; | 449 | continue; |
470 | page = pc->page; | ||
471 | |||
472 | if (unlikely(!PageLRU(page))) | 450 | if (unlikely(!PageLRU(page))) |
473 | continue; | 451 | continue; |
474 | 452 | ||
475 | /* | ||
476 | * TODO: play better with lumpy reclaim, grabbing anything. | ||
477 | */ | ||
478 | if (PageUnevictable(page) || | ||
479 | (PageActive(page) && !active) || | ||
480 | (!PageActive(page) && active)) { | ||
481 | __mem_cgroup_move_lists(pc, page_lru(page)); | ||
482 | continue; | ||
483 | } | ||
484 | |||
485 | scan++; | 453 | scan++; |
486 | list_move(&pc->lru, &pc_list); | ||
487 | |||
488 | if (__isolate_lru_page(page, mode, file) == 0) { | 454 | if (__isolate_lru_page(page, mode, file) == 0) { |
489 | list_move(&page->lru, dst); | 455 | list_move(&page->lru, dst); |
490 | nr_taken++; | 456 | nr_taken++; |
491 | } | 457 | } |
492 | } | 458 | } |
493 | 459 | ||
494 | list_splice(&pc_list, src); | ||
495 | spin_unlock(&mz->lru_lock); | ||
496 | |||
497 | *scanned = scan; | 460 | *scanned = scan; |
498 | return nr_taken; | 461 | return nr_taken; |
499 | } | 462 | } |
@@ -608,9 +571,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
608 | struct page_cgroup *pc, | 571 | struct page_cgroup *pc, |
609 | enum charge_type ctype) | 572 | enum charge_type ctype) |
610 | { | 573 | { |
611 | struct mem_cgroup_per_zone *mz; | ||
612 | unsigned long flags; | ||
613 | |||
614 | /* try_charge() can return NULL to *memcg, taking care of it. */ | 574 | /* try_charge() can return NULL to *memcg, taking care of it. */ |
615 | if (!mem) | 575 | if (!mem) |
616 | return; | 576 | return; |
@@ -625,17 +585,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
625 | return; | 585 | return; |
626 | } | 586 | } |
627 | pc->mem_cgroup = mem; | 587 | pc->mem_cgroup = mem; |
628 | /* | 588 | smp_wmb(); |
629 | * If a page is accounted as a page cache, insert to inactive list. | ||
630 | * If anon, insert to active list. | ||
631 | */ | ||
632 | pc->flags = pcg_default_flags[ctype]; | 589 | pc->flags = pcg_default_flags[ctype]; |
633 | 590 | ||
634 | mz = page_cgroup_zoneinfo(pc); | 591 | mem_cgroup_charge_statistics(mem, pc, true); |
635 | 592 | ||
636 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
637 | __mem_cgroup_add_list(mz, pc, true); | ||
638 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
639 | unlock_page_cgroup(pc); | 593 | unlock_page_cgroup(pc); |
640 | } | 594 | } |
641 | 595 | ||
@@ -646,8 +600,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
646 | * @to: mem_cgroup which the page is moved to. @from != @to. | 600 | * @to: mem_cgroup which the page is moved to. @from != @to. |
647 | * | 601 | * |
648 | * The caller must confirm following. | 602 | * The caller must confirm following. |
649 | * 1. disable irq. | 603 | * - page is not on LRU (isolate_page() is useful.) |
650 | * 2. lru_lock of old mem_cgroup(@from) should be held. | ||
651 | * | 604 | * |
652 | * returns 0 at success, | 605 | * returns 0 at success, |
653 | * returns -EBUSY when lock is busy or "pc" is unstable. | 606 | * returns -EBUSY when lock is busy or "pc" is unstable. |
@@ -663,15 +616,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
663 | int nid, zid; | 616 | int nid, zid; |
664 | int ret = -EBUSY; | 617 | int ret = -EBUSY; |
665 | 618 | ||
666 | VM_BUG_ON(!irqs_disabled()); | ||
667 | VM_BUG_ON(from == to); | 619 | VM_BUG_ON(from == to); |
620 | VM_BUG_ON(PageLRU(pc->page)); | ||
668 | 621 | ||
669 | nid = page_cgroup_nid(pc); | 622 | nid = page_cgroup_nid(pc); |
670 | zid = page_cgroup_zid(pc); | 623 | zid = page_cgroup_zid(pc); |
671 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | 624 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); |
672 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | 625 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); |
673 | 626 | ||
674 | |||
675 | if (!trylock_page_cgroup(pc)) | 627 | if (!trylock_page_cgroup(pc)) |
676 | return ret; | 628 | return ret; |
677 | 629 | ||
@@ -681,18 +633,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
681 | if (pc->mem_cgroup != from) | 633 | if (pc->mem_cgroup != from) |
682 | goto out; | 634 | goto out; |
683 | 635 | ||
684 | if (spin_trylock(&to_mz->lru_lock)) { | 636 | css_put(&from->css); |
685 | __mem_cgroup_remove_list(from_mz, pc); | 637 | res_counter_uncharge(&from->res, PAGE_SIZE); |
686 | css_put(&from->css); | 638 | mem_cgroup_charge_statistics(from, pc, false); |
687 | res_counter_uncharge(&from->res, PAGE_SIZE); | 639 | if (do_swap_account) |
688 | if (do_swap_account) | 640 | res_counter_uncharge(&from->memsw, PAGE_SIZE); |
689 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 641 | pc->mem_cgroup = to; |
690 | pc->mem_cgroup = to; | 642 | mem_cgroup_charge_statistics(to, pc, true); |
691 | css_get(&to->css); | 643 | css_get(&to->css); |
692 | __mem_cgroup_add_list(to_mz, pc, false); | 644 | ret = 0; |
693 | ret = 0; | ||
694 | spin_unlock(&to_mz->lru_lock); | ||
695 | } | ||
696 | out: | 645 | out: |
697 | unlock_page_cgroup(pc); | 646 | unlock_page_cgroup(pc); |
698 | return ret; | 647 | return ret; |
@@ -706,39 +655,47 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
706 | struct mem_cgroup *child, | 655 | struct mem_cgroup *child, |
707 | gfp_t gfp_mask) | 656 | gfp_t gfp_mask) |
708 | { | 657 | { |
658 | struct page *page = pc->page; | ||
709 | struct cgroup *cg = child->css.cgroup; | 659 | struct cgroup *cg = child->css.cgroup; |
710 | struct cgroup *pcg = cg->parent; | 660 | struct cgroup *pcg = cg->parent; |
711 | struct mem_cgroup *parent; | 661 | struct mem_cgroup *parent; |
712 | struct mem_cgroup_per_zone *mz; | ||
713 | unsigned long flags; | ||
714 | int ret; | 662 | int ret; |
715 | 663 | ||
716 | /* Is ROOT ? */ | 664 | /* Is ROOT ? */ |
717 | if (!pcg) | 665 | if (!pcg) |
718 | return -EINVAL; | 666 | return -EINVAL; |
719 | 667 | ||
668 | |||
720 | parent = mem_cgroup_from_cont(pcg); | 669 | parent = mem_cgroup_from_cont(pcg); |
721 | 670 | ||
671 | |||
722 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 672 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
723 | if (ret) | 673 | if (ret) |
724 | return ret; | 674 | return ret; |
725 | 675 | ||
726 | mz = mem_cgroup_zoneinfo(child, | 676 | if (!get_page_unless_zero(page)) |
727 | page_cgroup_nid(pc), page_cgroup_zid(pc)); | 677 | return -EBUSY; |
678 | |||
679 | ret = isolate_lru_page(page); | ||
680 | |||
681 | if (ret) | ||
682 | goto cancel; | ||
728 | 683 | ||
729 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
730 | ret = mem_cgroup_move_account(pc, child, parent); | 684 | ret = mem_cgroup_move_account(pc, child, parent); |
731 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
732 | 685 | ||
733 | /* drop extra refcnt */ | 686 | /* drop extra refcnt by try_charge() (move_account increment one) */ |
734 | css_put(&parent->css); | 687 | css_put(&parent->css); |
735 | /* uncharge if move fails */ | 688 | putback_lru_page(page); |
736 | if (ret) { | 689 | if (!ret) { |
737 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 690 | put_page(page); |
738 | if (do_swap_account) | 691 | return 0; |
739 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
740 | } | 692 | } |
741 | 693 | /* uncharge if move fails */ | |
694 | cancel: | ||
695 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
696 | if (do_swap_account) | ||
697 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
698 | put_page(page); | ||
742 | return ret; | 699 | return ret; |
743 | } | 700 | } |
744 | 701 | ||
@@ -912,6 +869,8 @@ int mem_cgroup_cache_charge_swapin(struct page *page, | |||
912 | } | 869 | } |
913 | if (!locked) | 870 | if (!locked) |
914 | unlock_page(page); | 871 | unlock_page(page); |
872 | /* add this page(page_cgroup) to the LRU we want. */ | ||
873 | mem_cgroup_lru_fixup(page); | ||
915 | 874 | ||
916 | return ret; | 875 | return ret; |
917 | } | 876 | } |
@@ -944,6 +903,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | |||
944 | } | 903 | } |
945 | 904 | ||
946 | } | 905 | } |
906 | /* add this page(page_cgroup) to the LRU we want. */ | ||
907 | mem_cgroup_lru_fixup(page); | ||
947 | } | 908 | } |
948 | 909 | ||
949 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 910 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) |
@@ -968,7 +929,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
968 | struct page_cgroup *pc; | 929 | struct page_cgroup *pc; |
969 | struct mem_cgroup *mem = NULL; | 930 | struct mem_cgroup *mem = NULL; |
970 | struct mem_cgroup_per_zone *mz; | 931 | struct mem_cgroup_per_zone *mz; |
971 | unsigned long flags; | ||
972 | 932 | ||
973 | if (mem_cgroup_subsys.disabled) | 933 | if (mem_cgroup_subsys.disabled) |
974 | return NULL; | 934 | return NULL; |
@@ -1010,12 +970,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1010 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 970 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) |
1011 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 971 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
1012 | 972 | ||
973 | mem_cgroup_charge_statistics(mem, pc, false); | ||
1013 | ClearPageCgroupUsed(pc); | 974 | ClearPageCgroupUsed(pc); |
1014 | 975 | ||
1015 | mz = page_cgroup_zoneinfo(pc); | 976 | mz = page_cgroup_zoneinfo(pc); |
1016 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
1017 | __mem_cgroup_remove_list(mz, pc); | ||
1018 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
1019 | unlock_page_cgroup(pc); | 977 | unlock_page_cgroup(pc); |
1020 | 978 | ||
1021 | css_put(&mem->css); | 979 | css_put(&mem->css); |
@@ -1281,21 +1239,22 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1281 | return ret; | 1239 | return ret; |
1282 | } | 1240 | } |
1283 | 1241 | ||
1284 | |||
1285 | /* | 1242 | /* |
1286 | * This routine traverse page_cgroup in given list and drop them all. | 1243 | * This routine traverse page_cgroup in given list and drop them all. |
1287 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 1244 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
1288 | */ | 1245 | */ |
1289 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 1246 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
1290 | struct mem_cgroup_per_zone *mz, | 1247 | int node, int zid, enum lru_list lru) |
1291 | enum lru_list lru) | ||
1292 | { | 1248 | { |
1249 | struct zone *zone; | ||
1250 | struct mem_cgroup_per_zone *mz; | ||
1293 | struct page_cgroup *pc, *busy; | 1251 | struct page_cgroup *pc, *busy; |
1294 | unsigned long flags; | 1252 | unsigned long flags, loop; |
1295 | unsigned long loop; | ||
1296 | struct list_head *list; | 1253 | struct list_head *list; |
1297 | int ret = 0; | 1254 | int ret = 0; |
1298 | 1255 | ||
1256 | zone = &NODE_DATA(node)->node_zones[zid]; | ||
1257 | mz = mem_cgroup_zoneinfo(mem, node, zid); | ||
1299 | list = &mz->lists[lru]; | 1258 | list = &mz->lists[lru]; |
1300 | 1259 | ||
1301 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 1260 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
@@ -1304,19 +1263,19 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
1304 | busy = NULL; | 1263 | busy = NULL; |
1305 | while (loop--) { | 1264 | while (loop--) { |
1306 | ret = 0; | 1265 | ret = 0; |
1307 | spin_lock_irqsave(&mz->lru_lock, flags); | 1266 | spin_lock_irqsave(&zone->lru_lock, flags); |
1308 | if (list_empty(list)) { | 1267 | if (list_empty(list)) { |
1309 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1268 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
1310 | break; | 1269 | break; |
1311 | } | 1270 | } |
1312 | pc = list_entry(list->prev, struct page_cgroup, lru); | 1271 | pc = list_entry(list->prev, struct page_cgroup, lru); |
1313 | if (busy == pc) { | 1272 | if (busy == pc) { |
1314 | list_move(&pc->lru, list); | 1273 | list_move(&pc->lru, list); |
1315 | busy = 0; | 1274 | busy = 0; |
1316 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1275 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
1317 | continue; | 1276 | continue; |
1318 | } | 1277 | } |
1319 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1278 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
1320 | 1279 | ||
1321 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); | 1280 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); |
1322 | if (ret == -ENOMEM) | 1281 | if (ret == -ENOMEM) |
@@ -1329,6 +1288,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
1329 | } else | 1288 | } else |
1330 | busy = NULL; | 1289 | busy = NULL; |
1331 | } | 1290 | } |
1291 | |||
1332 | if (!ret && !list_empty(list)) | 1292 | if (!ret && !list_empty(list)) |
1333 | return -EBUSY; | 1293 | return -EBUSY; |
1334 | return ret; | 1294 | return ret; |
@@ -1364,12 +1324,10 @@ move_account: | |||
1364 | ret = 0; | 1324 | ret = 0; |
1365 | for_each_node_state(node, N_POSSIBLE) { | 1325 | for_each_node_state(node, N_POSSIBLE) { |
1366 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 1326 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
1367 | struct mem_cgroup_per_zone *mz; | ||
1368 | enum lru_list l; | 1327 | enum lru_list l; |
1369 | mz = mem_cgroup_zoneinfo(mem, node, zid); | ||
1370 | for_each_lru(l) { | 1328 | for_each_lru(l) { |
1371 | ret = mem_cgroup_force_empty_list(mem, | 1329 | ret = mem_cgroup_force_empty_list(mem, |
1372 | mz, l); | 1330 | node, zid, l); |
1373 | if (ret) | 1331 | if (ret) |
1374 | break; | 1332 | break; |
1375 | } | 1333 | } |
@@ -1413,6 +1371,7 @@ try_to_free: | |||
1413 | } | 1371 | } |
1414 | 1372 | ||
1415 | } | 1373 | } |
1374 | lru_add_drain(); | ||
1416 | /* try move_account...there may be some *locked* pages. */ | 1375 | /* try move_account...there may be some *locked* pages. */ |
1417 | if (mem->res.usage) | 1376 | if (mem->res.usage) |
1418 | goto move_account; | 1377 | goto move_account; |
@@ -1657,7 +1616,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
1657 | 1616 | ||
1658 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 1617 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1659 | mz = &pn->zoneinfo[zone]; | 1618 | mz = &pn->zoneinfo[zone]; |
1660 | spin_lock_init(&mz->lru_lock); | ||
1661 | for_each_lru(l) | 1619 | for_each_lru(l) |
1662 | INIT_LIST_HEAD(&mz->lists[l]); | 1620 | INIT_LIST_HEAD(&mz->lists[l]); |
1663 | } | 1621 | } |
@@ -1706,8 +1664,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
1706 | 1664 | ||
1707 | static void mem_cgroup_free(struct mem_cgroup *mem) | 1665 | static void mem_cgroup_free(struct mem_cgroup *mem) |
1708 | { | 1666 | { |
1667 | int node; | ||
1668 | |||
1709 | if (atomic_read(&mem->refcnt) > 0) | 1669 | if (atomic_read(&mem->refcnt) > 0) |
1710 | return; | 1670 | return; |
1671 | |||
1672 | |||
1673 | for_each_node_state(node, N_POSSIBLE) | ||
1674 | free_mem_cgroup_per_zone_info(mem, node); | ||
1675 | |||
1711 | if (mem_cgroup_size() < PAGE_SIZE) | 1676 | if (mem_cgroup_size() < PAGE_SIZE) |
1712 | kfree(mem); | 1677 | kfree(mem); |
1713 | else | 1678 | else |
@@ -1780,12 +1745,6 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | |||
1780 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 1745 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1781 | struct cgroup *cont) | 1746 | struct cgroup *cont) |
1782 | { | 1747 | { |
1783 | int node; | ||
1784 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1785 | |||
1786 | for_each_node_state(node, N_POSSIBLE) | ||
1787 | free_mem_cgroup_per_zone_info(mem, node); | ||
1788 | |||
1789 | mem_cgroup_free(mem_cgroup_from_cont(cont)); | 1748 | mem_cgroup_free(mem_cgroup_from_cont(cont)); |
1790 | } | 1749 | } |
1791 | 1750 | ||