aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
author: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> 2009-01-07 21:08:01 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-01-08 11:31:05 -0500
commit: 08e552c69c6930d64722de3ec18c51844d06ee28 (patch)
tree: a744d57ed4b23401115f1033dcaac9e85d550e09 /mm/memcontrol.c
parent: 8c7c6e34a1256a5082d38c8e9bd1474476912715 (diff)
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.

Currently,
  - page_cgroup is linked to its mem_cgroup's own LRU (per zone).
  - The LRU of page_cgroup is not synchronized with the global LRU.
  - page and page_cgroup are one-to-one and statically allocated.
  - To find which LRU a page_cgroup is on, you have to check pc->mem_cgroup, as in
      lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
  - SwapCache is handled.

And, when we handle the LRU list of page_cgroup, we do the following.

    pc = lookup_page_cgroup(page);
    lock_page_cgroup(pc); .....................(1)
    mz = page_cgroup_zoneinfo(pc);
    spin_lock(&mz->lru_lock);
    .....add to LRU
    spin_unlock(&mz->lru_lock);
    unlock_page_cgroup(pc);

But (1) is a spin_lock, and we have to be afraid of a deadlock with zone->lru_lock. So, trylock() is currently used at (1). Without (1), we can't trust that "mz" is correct.

This is an attempt to remove this dirty nesting of locks. This patch changes mz->lru_lock to be zone->lru_lock. Then, the above sequence will be written as

    spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
    mem_cgroup_add/remove/etc_lru() {
        pc = lookup_page_cgroup(page);
        mz = page_cgroup_zoneinfo(pc);
        if (PageCgroupUsed(pc)) {
            ....add to LRU
        }
    }
    spin_unlock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU

This is much simpler.

(*) We're safe even if we don't take lock_page_cgroup(pc), because:
  1. pc->mem_cgroup can be modified only
       - at charge.
       - at account_move().
  2. At charge, the PCG_USED bit is not set before pc->mem_cgroup is fixed.
  3. At account_move(), the page is isolated and not on the LRU.

Pros.
  - Easier to maintain.
  - memcg can make use of the laziness of pagevec.
  - We don't have to duplicate the LRU/Active/Unevictable bits in page_cgroup.
  - The LRU status of memcg will be synchronized with the global LRU's.
  - The number of locks is reduced.
  - account_move() is simplified very much.

Cons.
  - May increase the cost of LRU rotation. (No impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- mm/memcontrol.c 323
1 files changed, 141 insertions, 182 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2efcf38f3b73..8ce4e9e47959 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -36,6 +36,7 @@
36#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
37#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
38#include <linux/page_cgroup.h> 38#include <linux/page_cgroup.h>
39#include "internal.h"
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41 42
@@ -100,7 +101,6 @@ struct mem_cgroup_per_zone {
100 /* 101 /*
101 * spin_lock to protect the per cgroup LRU 102 * spin_lock to protect the per cgroup LRU
102 */ 103 */
103 spinlock_t lru_lock;
104 struct list_head lists[NR_LRU_LISTS]; 104 struct list_head lists[NR_LRU_LISTS];
105 unsigned long count[NR_LRU_LISTS]; 105 unsigned long count[NR_LRU_LISTS];
106}; 106};
@@ -163,14 +163,12 @@ enum charge_type {
163/* only for here (for easy reading.) */ 163/* only for here (for easy reading.) */
164#define PCGF_CACHE (1UL << PCG_CACHE) 164#define PCGF_CACHE (1UL << PCG_CACHE)
165#define PCGF_USED (1UL << PCG_USED) 165#define PCGF_USED (1UL << PCG_USED)
166#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
167#define PCGF_LOCK (1UL << PCG_LOCK) 166#define PCGF_LOCK (1UL << PCG_LOCK)
168#define PCGF_FILE (1UL << PCG_FILE)
169static const unsigned long 167static const unsigned long
170pcg_default_flags[NR_CHARGE_TYPE] = { 168pcg_default_flags[NR_CHARGE_TYPE] = {
171 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ 169 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
172 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ 170 PCGF_USED | PCGF_LOCK, /* Anon */
173 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 171 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
174 0, /* FORCE */ 172 0, /* FORCE */
175}; 173};
176 174
@@ -185,9 +183,6 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
185static void mem_cgroup_get(struct mem_cgroup *mem); 183static void mem_cgroup_get(struct mem_cgroup *mem);
186static void mem_cgroup_put(struct mem_cgroup *mem); 184static void mem_cgroup_put(struct mem_cgroup *mem);
187 185
188/*
189 * Always modified under lru lock. Then, not necessary to preempt_disable()
190 */
191static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 186static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
192 struct page_cgroup *pc, 187 struct page_cgroup *pc,
193 bool charge) 188 bool charge)
@@ -195,10 +190,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
195 int val = (charge)? 1 : -1; 190 int val = (charge)? 1 : -1;
196 struct mem_cgroup_stat *stat = &mem->stat; 191 struct mem_cgroup_stat *stat = &mem->stat;
197 struct mem_cgroup_stat_cpu *cpustat; 192 struct mem_cgroup_stat_cpu *cpustat;
193 int cpu = get_cpu();
198 194
199 VM_BUG_ON(!irqs_disabled()); 195 cpustat = &stat->cpustat[cpu];
200
201 cpustat = &stat->cpustat[smp_processor_id()];
202 if (PageCgroupCache(pc)) 196 if (PageCgroupCache(pc))
203 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 197 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
204 else 198 else
@@ -210,6 +204,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
210 else 204 else
211 __mem_cgroup_stat_add_safe(cpustat, 205 __mem_cgroup_stat_add_safe(cpustat,
212 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 206 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
207 put_cpu();
213} 208}
214 209
215static struct mem_cgroup_per_zone * 210static struct mem_cgroup_per_zone *
@@ -264,80 +259,95 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
264 struct mem_cgroup, css); 259 struct mem_cgroup, css);
265} 260}
266 261
267static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 262/*
268 struct page_cgroup *pc) 263 * Following LRU functions are allowed to be used without PCG_LOCK.
269{ 264 * Operations are called by routine of global LRU independently from memcg.
270 int lru = LRU_BASE; 265 * What we have to take care of here is validness of pc->mem_cgroup.
266 *
267 * Changes to pc->mem_cgroup happens when
268 * 1. charge
269 * 2. moving account
270 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
271 * It is added to LRU before charge.
272 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
273 * When moving account, the page is not on LRU. It's isolated.
274 */
271 275
272 if (PageCgroupUnevictable(pc)) 276void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
273 lru = LRU_UNEVICTABLE; 277{
274 else { 278 struct page_cgroup *pc;
275 if (PageCgroupActive(pc)) 279 struct mem_cgroup *mem;
276 lru += LRU_ACTIVE; 280 struct mem_cgroup_per_zone *mz;
277 if (PageCgroupFile(pc))
278 lru += LRU_FILE;
279 }
280 281
282 if (mem_cgroup_subsys.disabled)
283 return;
284 pc = lookup_page_cgroup(page);
285 /* can happen while we handle swapcache. */
286 if (list_empty(&pc->lru))
287 return;
288 mz = page_cgroup_zoneinfo(pc);
289 mem = pc->mem_cgroup;
281 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 290 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
282 291 list_del_init(&pc->lru);
283 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); 292 return;
284 list_del(&pc->lru);
285} 293}
286 294
287static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 295void mem_cgroup_del_lru(struct page *page)
288 struct page_cgroup *pc, bool hot)
289{ 296{
290 int lru = LRU_BASE; 297 mem_cgroup_del_lru_list(page, page_lru(page));
298}
291 299
292 if (PageCgroupUnevictable(pc)) 300void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
293 lru = LRU_UNEVICTABLE; 301{
294 else { 302 struct mem_cgroup_per_zone *mz;
295 if (PageCgroupActive(pc)) 303 struct page_cgroup *pc;
296 lru += LRU_ACTIVE;
297 if (PageCgroupFile(pc))
298 lru += LRU_FILE;
299 }
300 304
301 MEM_CGROUP_ZSTAT(mz, lru) += 1; 305 if (mem_cgroup_subsys.disabled)
302 if (hot) 306 return;
303 list_add(&pc->lru, &mz->lists[lru]);
304 else
305 list_add_tail(&pc->lru, &mz->lists[lru]);
306 307
307 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); 308 pc = lookup_page_cgroup(page);
309 smp_rmb();
310 /* unused page is not rotated. */
311 if (!PageCgroupUsed(pc))
312 return;
313 mz = page_cgroup_zoneinfo(pc);
314 list_move(&pc->lru, &mz->lists[lru]);
308} 315}
309 316
310static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) 317void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
311{ 318{
312 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 319 struct page_cgroup *pc;
313 int active = PageCgroupActive(pc); 320 struct mem_cgroup_per_zone *mz;
314 int file = PageCgroupFile(pc);
315 int unevictable = PageCgroupUnevictable(pc);
316 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
317 (LRU_FILE * !!file + !!active);
318 321
319 if (lru == from) 322 if (mem_cgroup_subsys.disabled)
323 return;
324 pc = lookup_page_cgroup(page);
325 /* barrier to sync with "charge" */
326 smp_rmb();
327 if (!PageCgroupUsed(pc))
320 return; 328 return;
321 329
322 MEM_CGROUP_ZSTAT(mz, from) -= 1; 330 mz = page_cgroup_zoneinfo(pc);
323 /*
324 * However this is done under mz->lru_lock, another flags, which
325 * are not related to LRU, will be modified from out-of-lock.
326 * We have to use atomic set/clear flags.
327 */
328 if (is_unevictable_lru(lru)) {
329 ClearPageCgroupActive(pc);
330 SetPageCgroupUnevictable(pc);
331 } else {
332 if (is_active_lru(lru))
333 SetPageCgroupActive(pc);
334 else
335 ClearPageCgroupActive(pc);
336 ClearPageCgroupUnevictable(pc);
337 }
338
339 MEM_CGROUP_ZSTAT(mz, lru) += 1; 331 MEM_CGROUP_ZSTAT(mz, lru) += 1;
340 list_move(&pc->lru, &mz->lists[lru]); 332 list_add(&pc->lru, &mz->lists[lru]);
333}
334/*
335 * To add swapcache into LRU. Be careful to all this function.
336 * zone->lru_lock shouldn't be held and irq must not be disabled.
337 */
338static void mem_cgroup_lru_fixup(struct page *page)
339{
340 if (!isolate_lru_page(page))
341 putback_lru_page(page);
342}
343
344void mem_cgroup_move_lists(struct page *page,
345 enum lru_list from, enum lru_list to)
346{
347 if (mem_cgroup_subsys.disabled)
348 return;
349 mem_cgroup_del_lru_list(page, from);
350 mem_cgroup_add_lru_list(page, to);
341} 351}
342 352
343int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 353int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -351,37 +361,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
351} 361}
352 362
353/* 363/*
354 * This routine assumes that the appropriate zone's lru lock is already held
355 */
356void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
357{
358 struct page_cgroup *pc;
359 struct mem_cgroup_per_zone *mz;
360 unsigned long flags;
361
362 if (mem_cgroup_subsys.disabled)
363 return;
364
365 /*
366 * We cannot lock_page_cgroup while holding zone's lru_lock,
367 * because other holders of lock_page_cgroup can be interrupted
368 * with an attempt to rotate_reclaimable_page. But we cannot
369 * safely get to page_cgroup without it, so just try_lock it:
370 * mem_cgroup_isolate_pages allows for page left on wrong list.
371 */
372 pc = lookup_page_cgroup(page);
373 if (!trylock_page_cgroup(pc))
374 return;
375 if (pc && PageCgroupUsed(pc)) {
376 mz = page_cgroup_zoneinfo(pc);
377 spin_lock_irqsave(&mz->lru_lock, flags);
378 __mem_cgroup_move_lists(pc, lru);
379 spin_unlock_irqrestore(&mz->lru_lock, flags);
380 }
381 unlock_page_cgroup(pc);
382}
383
384/*
385 * Calculate mapped_ratio under memory controller. This will be used in 364 * Calculate mapped_ratio under memory controller. This will be used in
386 * vmscan.c for deteremining we have to reclaim mapped pages. 365 * vmscan.c for deteremining we have to reclaim mapped pages.
387 */ 366 */
@@ -460,40 +439,24 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
460 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 439 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
461 src = &mz->lists[lru]; 440 src = &mz->lists[lru];
462 441
463 spin_lock(&mz->lru_lock);
464 scan = 0; 442 scan = 0;
465 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 443 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
466 if (scan >= nr_to_scan) 444 if (scan >= nr_to_scan)
467 break; 445 break;
446
447 page = pc->page;
468 if (unlikely(!PageCgroupUsed(pc))) 448 if (unlikely(!PageCgroupUsed(pc)))
469 continue; 449 continue;
470 page = pc->page;
471
472 if (unlikely(!PageLRU(page))) 450 if (unlikely(!PageLRU(page)))
473 continue; 451 continue;
474 452
475 /*
476 * TODO: play better with lumpy reclaim, grabbing anything.
477 */
478 if (PageUnevictable(page) ||
479 (PageActive(page) && !active) ||
480 (!PageActive(page) && active)) {
481 __mem_cgroup_move_lists(pc, page_lru(page));
482 continue;
483 }
484
485 scan++; 453 scan++;
486 list_move(&pc->lru, &pc_list);
487
488 if (__isolate_lru_page(page, mode, file) == 0) { 454 if (__isolate_lru_page(page, mode, file) == 0) {
489 list_move(&page->lru, dst); 455 list_move(&page->lru, dst);
490 nr_taken++; 456 nr_taken++;
491 } 457 }
492 } 458 }
493 459
494 list_splice(&pc_list, src);
495 spin_unlock(&mz->lru_lock);
496
497 *scanned = scan; 460 *scanned = scan;
498 return nr_taken; 461 return nr_taken;
499} 462}
@@ -608,9 +571,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
608 struct page_cgroup *pc, 571 struct page_cgroup *pc,
609 enum charge_type ctype) 572 enum charge_type ctype)
610{ 573{
611 struct mem_cgroup_per_zone *mz;
612 unsigned long flags;
613
614 /* try_charge() can return NULL to *memcg, taking care of it. */ 574 /* try_charge() can return NULL to *memcg, taking care of it. */
615 if (!mem) 575 if (!mem)
616 return; 576 return;
@@ -625,17 +585,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
625 return; 585 return;
626 } 586 }
627 pc->mem_cgroup = mem; 587 pc->mem_cgroup = mem;
628 /* 588 smp_wmb();
629 * If a page is accounted as a page cache, insert to inactive list.
630 * If anon, insert to active list.
631 */
632 pc->flags = pcg_default_flags[ctype]; 589 pc->flags = pcg_default_flags[ctype];
633 590
634 mz = page_cgroup_zoneinfo(pc); 591 mem_cgroup_charge_statistics(mem, pc, true);
635 592
636 spin_lock_irqsave(&mz->lru_lock, flags);
637 __mem_cgroup_add_list(mz, pc, true);
638 spin_unlock_irqrestore(&mz->lru_lock, flags);
639 unlock_page_cgroup(pc); 593 unlock_page_cgroup(pc);
640} 594}
641 595
@@ -646,8 +600,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
646 * @to: mem_cgroup which the page is moved to. @from != @to. 600 * @to: mem_cgroup which the page is moved to. @from != @to.
647 * 601 *
648 * The caller must confirm following. 602 * The caller must confirm following.
649 * 1. disable irq. 603 * - page is not on LRU (isolate_page() is useful.)
650 * 2. lru_lock of old mem_cgroup(@from) should be held.
651 * 604 *
652 * returns 0 at success, 605 * returns 0 at success,
653 * returns -EBUSY when lock is busy or "pc" is unstable. 606 * returns -EBUSY when lock is busy or "pc" is unstable.
@@ -663,15 +616,14 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
663 int nid, zid; 616 int nid, zid;
664 int ret = -EBUSY; 617 int ret = -EBUSY;
665 618
666 VM_BUG_ON(!irqs_disabled());
667 VM_BUG_ON(from == to); 619 VM_BUG_ON(from == to);
620 VM_BUG_ON(PageLRU(pc->page));
668 621
669 nid = page_cgroup_nid(pc); 622 nid = page_cgroup_nid(pc);
670 zid = page_cgroup_zid(pc); 623 zid = page_cgroup_zid(pc);
671 from_mz = mem_cgroup_zoneinfo(from, nid, zid); 624 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
672 to_mz = mem_cgroup_zoneinfo(to, nid, zid); 625 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
673 626
674
675 if (!trylock_page_cgroup(pc)) 627 if (!trylock_page_cgroup(pc))
676 return ret; 628 return ret;
677 629
@@ -681,18 +633,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
681 if (pc->mem_cgroup != from) 633 if (pc->mem_cgroup != from)
682 goto out; 634 goto out;
683 635
684 if (spin_trylock(&to_mz->lru_lock)) { 636 css_put(&from->css);
685 __mem_cgroup_remove_list(from_mz, pc); 637 res_counter_uncharge(&from->res, PAGE_SIZE);
686 css_put(&from->css); 638 mem_cgroup_charge_statistics(from, pc, false);
687 res_counter_uncharge(&from->res, PAGE_SIZE); 639 if (do_swap_account)
688 if (do_swap_account) 640 res_counter_uncharge(&from->memsw, PAGE_SIZE);
689 res_counter_uncharge(&from->memsw, PAGE_SIZE); 641 pc->mem_cgroup = to;
690 pc->mem_cgroup = to; 642 mem_cgroup_charge_statistics(to, pc, true);
691 css_get(&to->css); 643 css_get(&to->css);
692 __mem_cgroup_add_list(to_mz, pc, false); 644 ret = 0;
693 ret = 0;
694 spin_unlock(&to_mz->lru_lock);
695 }
696out: 645out:
697 unlock_page_cgroup(pc); 646 unlock_page_cgroup(pc);
698 return ret; 647 return ret;
@@ -706,39 +655,47 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
706 struct mem_cgroup *child, 655 struct mem_cgroup *child,
707 gfp_t gfp_mask) 656 gfp_t gfp_mask)
708{ 657{
658 struct page *page = pc->page;
709 struct cgroup *cg = child->css.cgroup; 659 struct cgroup *cg = child->css.cgroup;
710 struct cgroup *pcg = cg->parent; 660 struct cgroup *pcg = cg->parent;
711 struct mem_cgroup *parent; 661 struct mem_cgroup *parent;
712 struct mem_cgroup_per_zone *mz;
713 unsigned long flags;
714 int ret; 662 int ret;
715 663
716 /* Is ROOT ? */ 664 /* Is ROOT ? */
717 if (!pcg) 665 if (!pcg)
718 return -EINVAL; 666 return -EINVAL;
719 667
668
720 parent = mem_cgroup_from_cont(pcg); 669 parent = mem_cgroup_from_cont(pcg);
721 670
671
722 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 672 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
723 if (ret) 673 if (ret)
724 return ret; 674 return ret;
725 675
726 mz = mem_cgroup_zoneinfo(child, 676 if (!get_page_unless_zero(page))
727 page_cgroup_nid(pc), page_cgroup_zid(pc)); 677 return -EBUSY;
678
679 ret = isolate_lru_page(page);
680
681 if (ret)
682 goto cancel;
728 683
729 spin_lock_irqsave(&mz->lru_lock, flags);
730 ret = mem_cgroup_move_account(pc, child, parent); 684 ret = mem_cgroup_move_account(pc, child, parent);
731 spin_unlock_irqrestore(&mz->lru_lock, flags);
732 685
733 /* drop extra refcnt */ 686 /* drop extra refcnt by try_charge() (move_account increment one) */
734 css_put(&parent->css); 687 css_put(&parent->css);
735 /* uncharge if move fails */ 688 putback_lru_page(page);
736 if (ret) { 689 if (!ret) {
737 res_counter_uncharge(&parent->res, PAGE_SIZE); 690 put_page(page);
738 if (do_swap_account) 691 return 0;
739 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
740 } 692 }
741 693 /* uncharge if move fails */
694cancel:
695 res_counter_uncharge(&parent->res, PAGE_SIZE);
696 if (do_swap_account)
697 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
698 put_page(page);
742 return ret; 699 return ret;
743} 700}
744 701
@@ -912,6 +869,8 @@ int mem_cgroup_cache_charge_swapin(struct page *page,
912 } 869 }
913 if (!locked) 870 if (!locked)
914 unlock_page(page); 871 unlock_page(page);
872 /* add this page(page_cgroup) to the LRU we want. */
873 mem_cgroup_lru_fixup(page);
915 874
916 return ret; 875 return ret;
917} 876}
@@ -944,6 +903,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
944 } 903 }
945 904
946 } 905 }
906 /* add this page(page_cgroup) to the LRU we want. */
907 mem_cgroup_lru_fixup(page);
947} 908}
948 909
949void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 910void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
@@ -968,7 +929,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
968 struct page_cgroup *pc; 929 struct page_cgroup *pc;
969 struct mem_cgroup *mem = NULL; 930 struct mem_cgroup *mem = NULL;
970 struct mem_cgroup_per_zone *mz; 931 struct mem_cgroup_per_zone *mz;
971 unsigned long flags;
972 932
973 if (mem_cgroup_subsys.disabled) 933 if (mem_cgroup_subsys.disabled)
974 return NULL; 934 return NULL;
@@ -1010,12 +970,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1010 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 970 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1011 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 971 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1012 972
973 mem_cgroup_charge_statistics(mem, pc, false);
1013 ClearPageCgroupUsed(pc); 974 ClearPageCgroupUsed(pc);
1014 975
1015 mz = page_cgroup_zoneinfo(pc); 976 mz = page_cgroup_zoneinfo(pc);
1016 spin_lock_irqsave(&mz->lru_lock, flags);
1017 __mem_cgroup_remove_list(mz, pc);
1018 spin_unlock_irqrestore(&mz->lru_lock, flags);
1019 unlock_page_cgroup(pc); 977 unlock_page_cgroup(pc);
1020 978
1021 css_put(&mem->css); 979 css_put(&mem->css);
@@ -1281,21 +1239,22 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1281 return ret; 1239 return ret;
1282} 1240}
1283 1241
1284
1285/* 1242/*
1286 * This routine traverse page_cgroup in given list and drop them all. 1243 * This routine traverse page_cgroup in given list and drop them all.
1287 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 1244 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
1288 */ 1245 */
1289static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 1246static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1290 struct mem_cgroup_per_zone *mz, 1247 int node, int zid, enum lru_list lru)
1291 enum lru_list lru)
1292{ 1248{
1249 struct zone *zone;
1250 struct mem_cgroup_per_zone *mz;
1293 struct page_cgroup *pc, *busy; 1251 struct page_cgroup *pc, *busy;
1294 unsigned long flags; 1252 unsigned long flags, loop;
1295 unsigned long loop;
1296 struct list_head *list; 1253 struct list_head *list;
1297 int ret = 0; 1254 int ret = 0;
1298 1255
1256 zone = &NODE_DATA(node)->node_zones[zid];
1257 mz = mem_cgroup_zoneinfo(mem, node, zid);
1299 list = &mz->lists[lru]; 1258 list = &mz->lists[lru];
1300 1259
1301 loop = MEM_CGROUP_ZSTAT(mz, lru); 1260 loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -1304,19 +1263,19 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1304 busy = NULL; 1263 busy = NULL;
1305 while (loop--) { 1264 while (loop--) {
1306 ret = 0; 1265 ret = 0;
1307 spin_lock_irqsave(&mz->lru_lock, flags); 1266 spin_lock_irqsave(&zone->lru_lock, flags);
1308 if (list_empty(list)) { 1267 if (list_empty(list)) {
1309 spin_unlock_irqrestore(&mz->lru_lock, flags); 1268 spin_unlock_irqrestore(&zone->lru_lock, flags);
1310 break; 1269 break;
1311 } 1270 }
1312 pc = list_entry(list->prev, struct page_cgroup, lru); 1271 pc = list_entry(list->prev, struct page_cgroup, lru);
1313 if (busy == pc) { 1272 if (busy == pc) {
1314 list_move(&pc->lru, list); 1273 list_move(&pc->lru, list);
1315 busy = 0; 1274 busy = 0;
1316 spin_unlock_irqrestore(&mz->lru_lock, flags); 1275 spin_unlock_irqrestore(&zone->lru_lock, flags);
1317 continue; 1276 continue;
1318 } 1277 }
1319 spin_unlock_irqrestore(&mz->lru_lock, flags); 1278 spin_unlock_irqrestore(&zone->lru_lock, flags);
1320 1279
1321 ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); 1280 ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
1322 if (ret == -ENOMEM) 1281 if (ret == -ENOMEM)
@@ -1329,6 +1288,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1329 } else 1288 } else
1330 busy = NULL; 1289 busy = NULL;
1331 } 1290 }
1291
1332 if (!ret && !list_empty(list)) 1292 if (!ret && !list_empty(list))
1333 return -EBUSY; 1293 return -EBUSY;
1334 return ret; 1294 return ret;
@@ -1364,12 +1324,10 @@ move_account:
1364 ret = 0; 1324 ret = 0;
1365 for_each_node_state(node, N_POSSIBLE) { 1325 for_each_node_state(node, N_POSSIBLE) {
1366 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 1326 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1367 struct mem_cgroup_per_zone *mz;
1368 enum lru_list l; 1327 enum lru_list l;
1369 mz = mem_cgroup_zoneinfo(mem, node, zid);
1370 for_each_lru(l) { 1328 for_each_lru(l) {
1371 ret = mem_cgroup_force_empty_list(mem, 1329 ret = mem_cgroup_force_empty_list(mem,
1372 mz, l); 1330 node, zid, l);
1373 if (ret) 1331 if (ret)
1374 break; 1332 break;
1375 } 1333 }
@@ -1413,6 +1371,7 @@ try_to_free:
1413 } 1371 }
1414 1372
1415 } 1373 }
1374 lru_add_drain();
1416 /* try move_account...there may be some *locked* pages. */ 1375 /* try move_account...there may be some *locked* pages. */
1417 if (mem->res.usage) 1376 if (mem->res.usage)
1418 goto move_account; 1377 goto move_account;
@@ -1657,7 +1616,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1657 1616
1658 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 1617 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1659 mz = &pn->zoneinfo[zone]; 1618 mz = &pn->zoneinfo[zone];
1660 spin_lock_init(&mz->lru_lock);
1661 for_each_lru(l) 1619 for_each_lru(l)
1662 INIT_LIST_HEAD(&mz->lists[l]); 1620 INIT_LIST_HEAD(&mz->lists[l]);
1663 } 1621 }
@@ -1706,8 +1664,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
1706 1664
1707static void mem_cgroup_free(struct mem_cgroup *mem) 1665static void mem_cgroup_free(struct mem_cgroup *mem)
1708{ 1666{
1667 int node;
1668
1709 if (atomic_read(&mem->refcnt) > 0) 1669 if (atomic_read(&mem->refcnt) > 0)
1710 return; 1670 return;
1671
1672
1673 for_each_node_state(node, N_POSSIBLE)
1674 free_mem_cgroup_per_zone_info(mem, node);
1675
1711 if (mem_cgroup_size() < PAGE_SIZE) 1676 if (mem_cgroup_size() < PAGE_SIZE)
1712 kfree(mem); 1677 kfree(mem);
1713 else 1678 else
@@ -1780,12 +1745,6 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1780static void mem_cgroup_destroy(struct cgroup_subsys *ss, 1745static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1781 struct cgroup *cont) 1746 struct cgroup *cont)
1782{ 1747{
1783 int node;
1784 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1785
1786 for_each_node_state(node, N_POSSIBLE)
1787 free_mem_cgroup_per_zone_info(mem, node);
1788
1789 mem_cgroup_free(mem_cgroup_from_cont(cont)); 1748 mem_cgroup_free(mem_cgroup_from_cont(cont));
1790} 1749}
1791 1750