diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-01-07 21:07:48 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-08 11:31:04 -0500 |
commit | 7a81b88cb53e335ff7d019e6398c95792c817d93 (patch) | |
tree | 6ebca4d509a541ac707e10f9369916549e90c0ad /mm | |
parent | 0b82ac37b889ec881b645860da3775118effb3ca (diff) |
memcg: introduce charge-commit-cancel style of functions
There is a small race in do_swap_page(). When the swapped-in page is
charged, the mapcount can be greater than 0. But, at the same time, some
process which shares the page may call unmap and make mapcount 1->0, and the
page is uncharged.
        CPUA                            CPUB
        mapcount == 1.
    (1) charge if mapcount==0           zap_pte_range()
                                        (2) mapcount 1 => 0.
                                        (3) uncharge(). (success)
    (4) set page's rmap()
        mapcount 0=>1
Then, this swap page's account is leaked.
For fixing this, I added a new interface.
- charge
account to res_counter by PAGE_SIZE and try to free pages if necessary.
- commit
register page_cgroup and add to LRU if necessary.
- cancel
uncharge PAGE_SIZE because of do_swap_page failure.
CPUA
(1) charge (always)
(2) set page's rmap (mapcount > 0)
(3) commit (decides whether the charge was necessary or not) after set_pte().
This protocol uses the PCG_USED bit on page_cgroup to avoid over-accounting.
Usual mem_cgroup_charge_common() does charge -> commit at a time.
This patch also adds the following functions to clarify all charges.
- mem_cgroup_newpage_charge() ....replacement for mem_cgroup_charge()
called against newly allocated anon pages.
- mem_cgroup_charge_migrate_fixup()
called only from remove_migration_ptes().
we'll have to rewrite this later.(this patch just keeps old behavior)
This function will be removed by additional patch to make migration
clearer.
Good for clarifying "what we do"
Then, we have the following 4 charge points.
- newpage
- swap-in
- add-to-cache.
- migration.
[akpm@linux-foundation.org: add missing inline directives to stubs]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memcontrol.c | 155 | ||||
-rw-r--r-- | mm/memory.c | 12 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 6 |
4 files changed, 136 insertions, 39 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51ee96545579..f568b1964551 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -467,35 +467,31 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
467 | return nr_taken; | 467 | return nr_taken; |
468 | } | 468 | } |
469 | 469 | ||
470 | /* | 470 | |
471 | * Charge the memory controller for page usage. | 471 | /** |
472 | * Return | 472 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. |
473 | * 0 if the charge was successful | 473 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) |
474 | * < 0 if the cgroup is over its limit | 474 | * @gfp_mask: gfp_mask for reclaim. |
475 | * @memcg: a pointer to memory cgroup which is charged against. | ||
476 | * | ||
477 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | ||
478 | * memory cgroup from @mm is got and stored in *memcg. | ||
479 | * | ||
480 | * Returns 0 if success. -ENOMEM at failure. | ||
475 | */ | 481 | */ |
476 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 482 | |
477 | gfp_t gfp_mask, enum charge_type ctype, | 483 | int mem_cgroup_try_charge(struct mm_struct *mm, |
478 | struct mem_cgroup *memcg) | 484 | gfp_t gfp_mask, struct mem_cgroup **memcg) |
479 | { | 485 | { |
480 | struct mem_cgroup *mem; | 486 | struct mem_cgroup *mem; |
481 | struct page_cgroup *pc; | 487 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
482 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
483 | struct mem_cgroup_per_zone *mz; | ||
484 | unsigned long flags; | ||
485 | |||
486 | pc = lookup_page_cgroup(page); | ||
487 | /* can happen at boot */ | ||
488 | if (unlikely(!pc)) | ||
489 | return 0; | ||
490 | prefetchw(pc); | ||
491 | /* | 488 | /* |
492 | * We always charge the cgroup the mm_struct belongs to. | 489 | * We always charge the cgroup the mm_struct belongs to. |
493 | * The mm_struct's mem_cgroup changes on task migration if the | 490 | * The mm_struct's mem_cgroup changes on task migration if the |
494 | * thread group leader migrates. It's possible that mm is not | 491 | * thread group leader migrates. It's possible that mm is not |
495 | * set, if so charge the init_mm (happens for pagecache usage). | 492 | * set, if so charge the init_mm (happens for pagecache usage). |
496 | */ | 493 | */ |
497 | 494 | if (likely(!*memcg)) { | |
498 | if (likely(!memcg)) { | ||
499 | rcu_read_lock(); | 495 | rcu_read_lock(); |
500 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 496 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
501 | if (unlikely(!mem)) { | 497 | if (unlikely(!mem)) { |
@@ -506,15 +502,17 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
506 | * For every charge from the cgroup, increment reference count | 502 | * For every charge from the cgroup, increment reference count |
507 | */ | 503 | */ |
508 | css_get(&mem->css); | 504 | css_get(&mem->css); |
505 | *memcg = mem; | ||
509 | rcu_read_unlock(); | 506 | rcu_read_unlock(); |
510 | } else { | 507 | } else { |
511 | mem = memcg; | 508 | mem = *memcg; |
512 | css_get(&memcg->css); | 509 | css_get(&mem->css); |
513 | } | 510 | } |
514 | 511 | ||
512 | |||
515 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { | 513 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { |
516 | if (!(gfp_mask & __GFP_WAIT)) | 514 | if (!(gfp_mask & __GFP_WAIT)) |
517 | goto out; | 515 | goto nomem; |
518 | 516 | ||
519 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | 517 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) |
520 | continue; | 518 | continue; |
@@ -531,18 +529,37 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
531 | 529 | ||
532 | if (!nr_retries--) { | 530 | if (!nr_retries--) { |
533 | mem_cgroup_out_of_memory(mem, gfp_mask); | 531 | mem_cgroup_out_of_memory(mem, gfp_mask); |
534 | goto out; | 532 | goto nomem; |
535 | } | 533 | } |
536 | } | 534 | } |
535 | return 0; | ||
536 | nomem: | ||
537 | css_put(&mem->css); | ||
538 | return -ENOMEM; | ||
539 | } | ||
540 | |||
541 | /* | ||
542 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be | ||
543 | * USED state. If already USED, uncharge and return. | ||
544 | */ | ||
545 | |||
546 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | ||
547 | struct page_cgroup *pc, | ||
548 | enum charge_type ctype) | ||
549 | { | ||
550 | struct mem_cgroup_per_zone *mz; | ||
551 | unsigned long flags; | ||
537 | 552 | ||
553 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
554 | if (!mem) | ||
555 | return; | ||
538 | 556 | ||
539 | lock_page_cgroup(pc); | 557 | lock_page_cgroup(pc); |
540 | if (unlikely(PageCgroupUsed(pc))) { | 558 | if (unlikely(PageCgroupUsed(pc))) { |
541 | unlock_page_cgroup(pc); | 559 | unlock_page_cgroup(pc); |
542 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 560 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
543 | css_put(&mem->css); | 561 | css_put(&mem->css); |
544 | 562 | return; | |
545 | goto done; | ||
546 | } | 563 | } |
547 | pc->mem_cgroup = mem; | 564 | pc->mem_cgroup = mem; |
548 | /* | 565 | /* |
@@ -557,15 +574,39 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
557 | __mem_cgroup_add_list(mz, pc); | 574 | __mem_cgroup_add_list(mz, pc); |
558 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 575 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
559 | unlock_page_cgroup(pc); | 576 | unlock_page_cgroup(pc); |
577 | } | ||
560 | 578 | ||
561 | done: | 579 | /* |
580 | * Charge the memory controller for page usage. | ||
581 | * Return | ||
582 | * 0 if the charge was successful | ||
583 | * < 0 if the cgroup is over its limit | ||
584 | */ | ||
585 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
586 | gfp_t gfp_mask, enum charge_type ctype, | ||
587 | struct mem_cgroup *memcg) | ||
588 | { | ||
589 | struct mem_cgroup *mem; | ||
590 | struct page_cgroup *pc; | ||
591 | int ret; | ||
592 | |||
593 | pc = lookup_page_cgroup(page); | ||
594 | /* can happen at boot */ | ||
595 | if (unlikely(!pc)) | ||
596 | return 0; | ||
597 | prefetchw(pc); | ||
598 | |||
599 | mem = memcg; | ||
600 | ret = mem_cgroup_try_charge(mm, gfp_mask, &mem); | ||
601 | if (ret) | ||
602 | return ret; | ||
603 | |||
604 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
562 | return 0; | 605 | return 0; |
563 | out: | ||
564 | css_put(&mem->css); | ||
565 | return -ENOMEM; | ||
566 | } | 606 | } |
567 | 607 | ||
568 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | 608 | int mem_cgroup_newpage_charge(struct page *page, |
609 | struct mm_struct *mm, gfp_t gfp_mask) | ||
569 | { | 610 | { |
570 | if (mem_cgroup_subsys.disabled) | 611 | if (mem_cgroup_subsys.disabled) |
571 | return 0; | 612 | return 0; |
@@ -586,6 +627,34 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | |||
586 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | 627 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
587 | } | 628 | } |
588 | 629 | ||
630 | /* | ||
631 | * same as mem_cgroup_newpage_charge(), now. | ||
632 | * But what we assume is different from newpage, and this is special case. | ||
633 | * treat this in special function. easy for maintenance. | ||
634 | */ | ||
635 | |||
636 | int mem_cgroup_charge_migrate_fixup(struct page *page, | ||
637 | struct mm_struct *mm, gfp_t gfp_mask) | ||
638 | { | ||
639 | if (mem_cgroup_subsys.disabled) | ||
640 | return 0; | ||
641 | |||
642 | if (PageCompound(page)) | ||
643 | return 0; | ||
644 | |||
645 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | ||
646 | return 0; | ||
647 | |||
648 | if (unlikely(!mm)) | ||
649 | mm = &init_mm; | ||
650 | |||
651 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
652 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | ||
653 | } | ||
654 | |||
655 | |||
656 | |||
657 | |||
589 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 658 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
590 | gfp_t gfp_mask) | 659 | gfp_t gfp_mask) |
591 | { | 660 | { |
@@ -628,6 +697,30 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
628 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | 697 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); |
629 | } | 698 | } |
630 | 699 | ||
700 | |||
701 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | ||
702 | { | ||
703 | struct page_cgroup *pc; | ||
704 | |||
705 | if (mem_cgroup_subsys.disabled) | ||
706 | return; | ||
707 | if (!ptr) | ||
708 | return; | ||
709 | pc = lookup_page_cgroup(page); | ||
710 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
711 | } | ||
712 | |||
713 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | ||
714 | { | ||
715 | if (mem_cgroup_subsys.disabled) | ||
716 | return; | ||
717 | if (!mem) | ||
718 | return; | ||
719 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
720 | css_put(&mem->css); | ||
721 | } | ||
722 | |||
723 | |||
631 | /* | 724 | /* |
632 | * uncharge if !page_mapped(page) | 725 | * uncharge if !page_mapped(page) |
633 | */ | 726 | */ |
diff --git a/mm/memory.c b/mm/memory.c index 3f8fa06b963b..7f210f160990 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2000,7 +2000,7 @@ gotten: | |||
2000 | cow_user_page(new_page, old_page, address, vma); | 2000 | cow_user_page(new_page, old_page, address, vma); |
2001 | __SetPageUptodate(new_page); | 2001 | __SetPageUptodate(new_page); |
2002 | 2002 | ||
2003 | if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) | 2003 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2004 | goto oom_free_new; | 2004 | goto oom_free_new; |
2005 | 2005 | ||
2006 | /* | 2006 | /* |
@@ -2392,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2392 | struct page *page; | 2392 | struct page *page; |
2393 | swp_entry_t entry; | 2393 | swp_entry_t entry; |
2394 | pte_t pte; | 2394 | pte_t pte; |
2395 | struct mem_cgroup *ptr = NULL; | ||
2395 | int ret = 0; | 2396 | int ret = 0; |
2396 | 2397 | ||
2397 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2398 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
@@ -2430,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2430 | lock_page(page); | 2431 | lock_page(page); |
2431 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2432 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2432 | 2433 | ||
2433 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2434 | if (mem_cgroup_try_charge(mm, GFP_KERNEL, &ptr) == -ENOMEM) { |
2434 | ret = VM_FAULT_OOM; | 2435 | ret = VM_FAULT_OOM; |
2435 | unlock_page(page); | 2436 | unlock_page(page); |
2436 | goto out; | 2437 | goto out; |
@@ -2460,6 +2461,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2460 | flush_icache_page(vma, page); | 2461 | flush_icache_page(vma, page); |
2461 | set_pte_at(mm, address, page_table, pte); | 2462 | set_pte_at(mm, address, page_table, pte); |
2462 | page_add_anon_rmap(page, vma, address); | 2463 | page_add_anon_rmap(page, vma, address); |
2464 | mem_cgroup_commit_charge_swapin(page, ptr); | ||
2463 | 2465 | ||
2464 | swap_free(entry); | 2466 | swap_free(entry); |
2465 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 2467 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
@@ -2480,7 +2482,7 @@ unlock: | |||
2480 | out: | 2482 | out: |
2481 | return ret; | 2483 | return ret; |
2482 | out_nomap: | 2484 | out_nomap: |
2483 | mem_cgroup_uncharge_page(page); | 2485 | mem_cgroup_cancel_charge_swapin(ptr); |
2484 | pte_unmap_unlock(page_table, ptl); | 2486 | pte_unmap_unlock(page_table, ptl); |
2485 | unlock_page(page); | 2487 | unlock_page(page); |
2486 | page_cache_release(page); | 2488 | page_cache_release(page); |
@@ -2510,7 +2512,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2510 | goto oom; | 2512 | goto oom; |
2511 | __SetPageUptodate(page); | 2513 | __SetPageUptodate(page); |
2512 | 2514 | ||
2513 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) | 2515 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) |
2514 | goto oom_free_page; | 2516 | goto oom_free_page; |
2515 | 2517 | ||
2516 | entry = mk_pte(page, vma->vm_page_prot); | 2518 | entry = mk_pte(page, vma->vm_page_prot); |
@@ -2601,7 +2603,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2601 | ret = VM_FAULT_OOM; | 2603 | ret = VM_FAULT_OOM; |
2602 | goto out; | 2604 | goto out; |
2603 | } | 2605 | } |
2604 | if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { | 2606 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { |
2605 | ret = VM_FAULT_OOM; | 2607 | ret = VM_FAULT_OOM; |
2606 | page_cache_release(page); | 2608 | page_cache_release(page); |
2607 | goto out; | 2609 | goto out; |
diff --git a/mm/migrate.c b/mm/migrate.c index 55373983c9c6..246dcb973ae7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -133,7 +133,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
133 | * be reliable, and this charge can actually fail: oh well, we don't | 133 | * be reliable, and this charge can actually fail: oh well, we don't |
134 | * make the situation any worse by proceeding as if it had succeeded. | 134 | * make the situation any worse by proceeding as if it had succeeded. |
135 | */ | 135 | */ |
136 | mem_cgroup_charge(new, mm, GFP_ATOMIC); | 136 | mem_cgroup_charge_migrate_fixup(new, mm, GFP_ATOMIC); |
137 | 137 | ||
138 | get_page(new); | 138 | get_page(new); |
139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 139 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
diff --git a/mm/swapfile.c b/mm/swapfile.c index eec5ca758a23..fb926efb5167 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -690,17 +690,18 @@ unsigned int count_swap_pages(int type, int free) | |||
690 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 690 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
691 | unsigned long addr, swp_entry_t entry, struct page *page) | 691 | unsigned long addr, swp_entry_t entry, struct page *page) |
692 | { | 692 | { |
693 | struct mem_cgroup *ptr = NULL; | ||
693 | spinlock_t *ptl; | 694 | spinlock_t *ptl; |
694 | pte_t *pte; | 695 | pte_t *pte; |
695 | int ret = 1; | 696 | int ret = 1; |
696 | 697 | ||
697 | if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) | 698 | if (mem_cgroup_try_charge(vma->vm_mm, GFP_KERNEL, &ptr)) |
698 | ret = -ENOMEM; | 699 | ret = -ENOMEM; |
699 | 700 | ||
700 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 701 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
701 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 702 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
702 | if (ret > 0) | 703 | if (ret > 0) |
703 | mem_cgroup_uncharge_page(page); | 704 | mem_cgroup_cancel_charge_swapin(ptr); |
704 | ret = 0; | 705 | ret = 0; |
705 | goto out; | 706 | goto out; |
706 | } | 707 | } |
@@ -710,6 +711,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
710 | set_pte_at(vma->vm_mm, addr, pte, | 711 | set_pte_at(vma->vm_mm, addr, pte, |
711 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 712 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
712 | page_add_anon_rmap(page, vma, addr); | 713 | page_add_anon_rmap(page, vma, addr); |
714 | mem_cgroup_commit_charge_swapin(page, ptr); | ||
713 | swap_free(entry); | 715 | swap_free(entry); |
714 | /* | 716 | /* |
715 | * Move the page to the active list so it is not | 717 | * Move the page to the active list so it is not |