author    KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>    2009-01-07 21:07:48 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>        2009-01-08 11:31:04 -0500
commit    7a81b88cb53e335ff7d019e6398c95792c817d93 (patch)
tree      6ebca4d509a541ac707e10f9369916549e90c0ad
parent    0b82ac37b889ec881b645860da3775118effb3ca (diff)
memcg: introduce charge-commit-cancel style of functions
There is a small race in do_swap_page(). When the page being swapped in is
charged, its mapcount can be greater than 0: at the same time, another process
sharing the page can unmap it, taking the mapcount from 1 to 0 and uncharging
the page.

      CPUA                          CPUB
       mapcount == 1.
   (1) charge if mapcount==0        zap_pte_range()
                                    (2) mapcount 1 => 0.
                                    (3) uncharge(). (success)
   (4) set page's rmap()
       mapcount 0=>1

The swap page's charge is then leaked.

To fix this, add a new interface:

  - charge
    account to res_counter by PAGE_SIZE and try to free pages if necessary.
  - commit
    register the page_cgroup and add it to the LRU if necessary.
  - cancel
    uncharge PAGE_SIZE because do_swap_page() failed.

      CPUA
   (1) charge (always)
   (2) set page's rmap (mapcount > 0)
   (3) commit the charge after set_pte() (whether it was actually needed is
       decided here).

This protocol uses the PCG_USED bit on page_cgroup to avoid over-accounting.
The usual mem_cgroup_charge_common() does charge -> commit in one step.

This patch also adds the following functions to clarify all charges:

  - mem_cgroup_newpage_charge()
    replacement for mem_cgroup_charge(), called against newly allocated
    anonymous pages.

  - mem_cgroup_charge_migrate_fixup()
    called only from remove_migration_ptes(); we'll have to rewrite this
    later (this patch just keeps the old behavior). This function will be
    removed by a follow-up patch to make migration clearer.

This is good for clarifying "what we do".

We then have the following 4 charge points:

  - newpage
  - swap-in
  - add-to-cache
  - migration

[akpm@linux-foundation.org: add missing inline directives to stubs]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
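The caller-side protocol, sketched for a swap-in fault handler (this sketch is
not part of the patch; swapin_fault_sketch() and page_still_wanted() are
hypothetical and only mirror the do_swap_page() hunk below):

/*
 * Charge-commit-cancel sketch for the swap-in path, modelled on the
 * do_swap_page() changes in this commit. swapin_fault_sketch() is a
 * hypothetical caller used only to illustrate the protocol.
 */
static int swapin_fault_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
                               struct page *page, unsigned long address)
{
        struct mem_cgroup *ptr = NULL;

        /* (1) charge: reserve PAGE_SIZE against mm's cgroup; may reclaim. */
        if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM)
                return VM_FAULT_OOM;

        if (!page_still_wanted(page))   /* hypothetical failure check */
                goto out_cancel;

        /* (2) map the page; mapcount becomes > 0. */
        page_add_anon_rmap(page, vma, address);

        /*
         * (3) commit: set PCG_USED and put the page_cgroup on the LRU.
         * If another CPU already committed this page, the duplicate
         * charge is dropped here, so nothing leaks.
         */
        mem_cgroup_commit_charge_swapin(page, ptr);
        return 0;

out_cancel:
        /* cancel: give back the uncommitted PAGE_SIZE charge and retry. */
        mem_cgroup_cancel_charge_swapin(ptr);
        return 0;
}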
-rw-r--r--   include/linux/memcontrol.h   36
-rw-r--r--   mm/memcontrol.c              155
-rw-r--r--   mm/memory.c                  12
-rw-r--r--   mm/migrate.c                 2
-rw-r--r--   mm/swapfile.c                6
5 files changed, 170 insertions, 41 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1fbe14d39521..c592f315cd02 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,8 +27,17 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask);
+extern int mem_cgroup_charge_migrate_fixup(struct page *page,
+                               struct mm_struct *mm, gfp_t gfp_mask);
+/* for swap handling */
+extern int mem_cgroup_try_charge(struct mm_struct *mm,
+               gfp_t gfp_mask, struct mem_cgroup **ptr);
+extern void mem_cgroup_commit_charge_swapin(struct page *page,
+                                       struct mem_cgroup *ptr);
+extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);
+
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                        gfp_t gfp_mask);
 extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
@@ -71,7 +80,9 @@ extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline int mem_cgroup_charge(struct page *page,
+struct mem_cgroup;
+
+static inline int mem_cgroup_newpage_charge(struct page *page,
                        struct mm_struct *mm, gfp_t gfp_mask)
 {
        return 0;
@@ -83,6 +94,27 @@ static inline int mem_cgroup_cache_charge(struct page *page,
        return 0;
 }
 
+static inline int mem_cgroup_charge_migrate_fixup(struct page *page,
+                       struct mm_struct *mm, gfp_t gfp_mask)
+{
+       return 0;
+}
+
+static inline int mem_cgroup_try_charge(struct mm_struct *mm,
+                       gfp_t gfp_mask, struct mem_cgroup **ptr)
+{
+       return 0;
+}
+
+static inline void mem_cgroup_commit_charge_swapin(struct page *page,
+                                       struct mem_cgroup *ptr)
+{
+}
+
+static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
+{
+}
+
 static inline void mem_cgroup_uncharge_page(struct page *page)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 51ee96545579..f568b1964551 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -467,35 +467,31 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
        return nr_taken;
 }
 
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
+
+/**
+ * mem_cgroup_try_charge - get charge of PAGE_SIZE.
+ * @mm: an mm_struct which is charged against. (when *memcg is NULL)
+ * @gfp_mask: gfp_mask for reclaim.
+ * @memcg: a pointer to memory cgroup which is charged against.
+ *
+ * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
+ * memory cgroup from @mm is got and stored in *memcg.
+ *
+ * Returns 0 if success. -ENOMEM at failure.
  */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-                               gfp_t gfp_mask, enum charge_type ctype,
-                               struct mem_cgroup *memcg)
+
+int mem_cgroup_try_charge(struct mm_struct *mm,
+                       gfp_t gfp_mask, struct mem_cgroup **memcg)
 {
        struct mem_cgroup *mem;
-       struct page_cgroup *pc;
-       unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-       struct mem_cgroup_per_zone *mz;
-       unsigned long flags;
-
-       pc = lookup_page_cgroup(page);
-       /* can happen at boot */
-       if (unlikely(!pc))
-               return 0;
-       prefetchw(pc);
+       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
-
-       if (likely(!memcg)) {
+       if (likely(!*memcg)) {
                rcu_read_lock();
                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (unlikely(!mem)) {
@@ -506,15 +502,17 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                 * For every charge from the cgroup, increment reference count
                 */
                css_get(&mem->css);
+               *memcg = mem;
                rcu_read_unlock();
        } else {
-               mem = memcg;
-               css_get(&memcg->css);
+               mem = *memcg;
+               css_get(&mem->css);
        }
 
+
        while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
                if (!(gfp_mask & __GFP_WAIT))
-                       goto out;
+                       goto nomem;
 
                if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
                        continue;
@@ -531,18 +529,37 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 
                if (!nr_retries--) {
                        mem_cgroup_out_of_memory(mem, gfp_mask);
-                       goto out;
+                       goto nomem;
                }
        }
+       return 0;
+nomem:
+       css_put(&mem->css);
+       return -ENOMEM;
+}
+
+/*
+ * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
+ * USED state. If already USED, uncharge and return.
+ */
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+                                    struct page_cgroup *pc,
+                                    enum charge_type ctype)
+{
+       struct mem_cgroup_per_zone *mz;
+       unsigned long flags;
 
+       /* try_charge() can return NULL to *memcg, taking care of it. */
+       if (!mem)
+               return;
 
        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
                res_counter_uncharge(&mem->res, PAGE_SIZE);
                css_put(&mem->css);
-
-               goto done;
+               return;
        }
        pc->mem_cgroup = mem;
        /*
@@ -557,15 +574,39 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
        __mem_cgroup_add_list(mz, pc);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
        unlock_page_cgroup(pc);
+}
 
-done:
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+                               gfp_t gfp_mask, enum charge_type ctype,
+                               struct mem_cgroup *memcg)
+{
+       struct mem_cgroup *mem;
+       struct page_cgroup *pc;
+       int ret;
+
+       pc = lookup_page_cgroup(page);
+       /* can happen at boot */
+       if (unlikely(!pc))
+               return 0;
+       prefetchw(pc);
+
+       mem = memcg;
+       ret = mem_cgroup_try_charge(mm, gfp_mask, &mem);
+       if (ret)
+               return ret;
+
+       __mem_cgroup_commit_charge(mem, pc, ctype);
        return 0;
-out:
-       css_put(&mem->css);
-       return -ENOMEM;
 }
 
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int mem_cgroup_newpage_charge(struct page *page,
+                             struct mm_struct *mm, gfp_t gfp_mask)
 {
        if (mem_cgroup_subsys.disabled)
                return 0;
@@ -586,6 +627,34 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
                                MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
 }
 
+/*
+ * same as mem_cgroup_newpage_charge(), now.
+ * But what we assume is different from newpage, and this is special case.
+ * treat this in special function. easy for maintenance.
+ */
+
+int mem_cgroup_charge_migrate_fixup(struct page *page,
+                               struct mm_struct *mm, gfp_t gfp_mask)
+{
+       if (mem_cgroup_subsys.disabled)
+               return 0;
+
+       if (PageCompound(page))
+               return 0;
+
+       if (page_mapped(page) || (page->mapping && !PageAnon(page)))
+               return 0;
+
+       if (unlikely(!mm))
+               mm = &init_mm;
+
+       return mem_cgroup_charge_common(page, mm, gfp_mask,
+                               MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+}
+
+
+
+
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                gfp_t gfp_mask)
 {
@@ -628,6 +697,30 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
+
+void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
+{
+       struct page_cgroup *pc;
+
+       if (mem_cgroup_subsys.disabled)
+               return;
+       if (!ptr)
+               return;
+       pc = lookup_page_cgroup(page);
+       __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+}
+
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
+{
+       if (mem_cgroup_subsys.disabled)
+               return;
+       if (!mem)
+               return;
+       res_counter_uncharge(&mem->res, PAGE_SIZE);
+       css_put(&mem->css);
+}
+
+
 /*
  * uncharge if !page_mapped(page)
  */
diff --git a/mm/memory.c b/mm/memory.c
index 3f8fa06b963b..7f210f160990 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2000,7 +2000,7 @@ gotten:
        cow_user_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
 
-       if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+       if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
 
        /*
@@ -2392,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *page;
        swp_entry_t entry;
        pte_t pte;
+       struct mem_cgroup *ptr = NULL;
        int ret = 0;
 
        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2430,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 
-       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+       if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM) {
                ret = VM_FAULT_OOM;
                unlock_page(page);
                goto out;
@@ -2460,6 +2461,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        flush_icache_page(vma, page);
        set_pte_at(mm, address, page_table, pte);
        page_add_anon_rmap(page, vma, address);
+       mem_cgroup_commit_charge_swapin(page, ptr);
 
        swap_free(entry);
        if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2480,7 +2482,7 @@ unlock:
 out:
        return ret;
 out_nomap:
-       mem_cgroup_uncharge_page(page);
+       mem_cgroup_cancel_charge_swapin(ptr);
        pte_unmap_unlock(page_table, ptl);
        unlock_page(page);
        page_cache_release(page);
@@ -2510,7 +2512,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto oom;
        __SetPageUptodate(page);
 
-       if (mem_cgroup_charge(page, mm, GFP_KERNEL))
+       if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
                goto oom_free_page;
 
        entry = mk_pte(page, vma->vm_page_prot);
@@ -2601,7 +2603,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        ret = VM_FAULT_OOM;
                        goto out;
                }
-               if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+               if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
                        ret = VM_FAULT_OOM;
                        page_cache_release(page);
                        goto out;
diff --git a/mm/migrate.c b/mm/migrate.c
index 55373983c9c6..246dcb973ae7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -133,7 +133,7 @@ static void remove_migration_pte(struct vm_area_struct *vma,
         * be reliable, and this charge can actually fail: oh well, we don't
         * make the situation any worse by proceeding as if it had succeeded.
         */
-       mem_cgroup_charge(new, mm, GFP_ATOMIC);
+       mem_cgroup_charge_migrate_fixup(new, mm, GFP_ATOMIC);
 
        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
diff --git a/mm/swapfile.c b/mm/swapfile.c
index eec5ca758a23..fb926efb5167 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -690,17 +690,18 @@ unsigned int count_swap_pages(int type, int free)
 static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
+       struct mem_cgroup *ptr = NULL;
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;
 
-       if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+       if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr))
                ret = -ENOMEM;
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
                if (ret > 0)
-                       mem_cgroup_uncharge_page(page);
+                       mem_cgroup_cancel_charge_swapin(ptr);
                ret = 0;
                goto out;
        }
@@ -710,6 +711,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
        set_pte_at(vma->vm_mm, addr, pte,
                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
        page_add_anon_rmap(page, vma, addr);
+       mem_cgroup_commit_charge_swapin(page, ptr);
        swap_free(entry);
        /*
         * Move the page to the active list so it is not