author     KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>    2008-07-25 04:47:10 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>        2008-07-25 13:53:37 -0400
commit     e8589cc189f96b87348ae83ea4db38eaac624135
tree       6693422dc81e6da78c4ad892b0d326fb7f946dda
parent     508b7be0a5b06b64203512ed9b34191cddc83f56
memcg: better migration handling
This patch changes page migration under the memory controller to use a
different algorithm. (Thanks to Christoph for the new idea.)

Before:
- the page_cgroup is migrated from the old page to the new page.
After:
- the new page is accounted; the page_cgroup is not reused.

Pros:
- We can avoid complicated lock dependencies and races in migration.
Cons:
- a new parameter is added to mem_cgroup_charge_common().
- mem_cgroup_getref() is added to handle the ref_cnt ping-pong
  (see the interface summary below).
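For reference, the interface changes the Cons refer to, summarized from the
diff below (mem_cgroup_charge_common() is static, so its prototype is shown
here only as a summary):

	/* include/linux/memcontrol.h, after this patch */
	extern int
	mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
	extern void mem_cgroup_end_migration(struct page *page);
	extern int mem_cgroup_getref(struct page *page);

	/* mm/memcontrol.c: the charge helper gains an explicit memcg argument */
	static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
					gfp_t gfp_mask, enum charge_type ctype,
					struct mem_cgroup *memcg);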
This version simplifies the complicated lock dependencies of page migration
under the memory resource controller.

The new refcnt sequence is as follows (a standalone sketch modelling it
appears after the list):

a mapped page:
    prepare_migration()  ..... +1 to the NEW page
    try_to_unmap()       ..... all refs to the OLD page are gone
    move_pages()         ..... +1 to the NEW page if it is a page-cache page
    remap                ..... all refs from *map* are added to the NEW one
    end_migration()      ..... -1 to the NEW page

In the end, the NEW page holds mapcount + (page_is_cache) references.
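To make the accounting arithmetic concrete, here is a small userspace toy
model of the sequence above. It is illustration only, not kernel code: the
struct, field names, and the main() scenario are invented, and "refs" merely
stands in for page_cgroup's ref_cnt bookkeeping.

	/* Toy model of the new migration refcnt sequence -- illustration only. */
	#include <assert.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct toy_page {
		int refs;	/* models pc->ref_cnt */
		int mapcount;	/* number of mappings of the page */
		bool is_cache;	/* models PAGE_CGROUP_FLAG_CACHE */
	};

	int main(void)
	{
		/* OLD page: mapped twice and on the page cache -> 3 refs. */
		struct toy_page oldp = { .refs = 3, .mapcount = 2, .is_cache = true };
		struct toy_page newp = { 0 };

		newp.is_cache = oldp.is_cache;

		newp.refs += 1;			/* prepare_migration(): +1 to NEW */
		oldp.refs -= oldp.mapcount;	/* try_to_unmap(): mapped refs on OLD gone */
		if (newp.is_cache) {
			oldp.refs -= 1;		/* move_pages(): cache ref leaves OLD... */
			newp.refs += 1;		/* ...and NEW gets +1 (page cache case) */
		}
		newp.refs += oldp.mapcount;	/* remap: refs from the mappings go to NEW */
		newp.mapcount = oldp.mapcount;
		newp.refs -= 1;			/* end_migration(): drop the extra ref */

		/* NEW ends with mapcount + (page_is_cache) refs, OLD with none. */
		assert(newp.refs == newp.mapcount + (newp.is_cache ? 1 : 0));
		assert(oldp.refs == 0);
		printf("new refs = %d, old refs = %d\n", newp.refs, oldp.refs);
		return 0;
	}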
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--   include/linux/memcontrol.h |  11
-rw-r--r--   mm/memcontrol.c            | 128
-rw-r--r--   mm/migrate.c               |  22
3 files changed, 86 insertions, 75 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e6608776bc96..84ead2aa6f18 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -50,9 +50,10 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 #define mm_match_cgroup(mm, cgroup)	\
 	((cgroup) == mem_cgroup_from_task((mm)->owner))
 
-extern int mem_cgroup_prepare_migration(struct page *page);
+extern int
+mem_cgroup_prepare_migration(struct page *page, struct page *newpage);
 extern void mem_cgroup_end_migration(struct page *page);
-extern void mem_cgroup_page_migration(struct page *page, struct page *newpage);
+extern int mem_cgroup_getref(struct page *page);
 
 /*
  * For memory reclaim.
@@ -112,7 +113,8 @@ static inline int task_in_mem_cgroup(struct task_struct *task,
 	return 1;
 }
 
-static inline int mem_cgroup_prepare_migration(struct page *page)
+static inline int
+mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 {
 	return 0;
 }
@@ -121,8 +123,7 @@ static inline void mem_cgroup_end_migration(struct page *page)
 {
 }
 
-static inline void
-mem_cgroup_page_migration(struct page *page, struct page *newpage)
+static inline void mem_cgroup_getref(struct page *page)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90ccc1326356..da5912b84551 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -524,7 +524,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype)
+				gfp_t gfp_mask, enum charge_type ctype,
+				struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
@@ -569,16 +570,21 @@ retry:
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (!mm)
-		mm = &init_mm;
-
-	rcu_read_lock();
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	/*
-	 * For every charge from the cgroup, increment reference count
-	 */
-	css_get(&mem->css);
-	rcu_read_unlock();
+	if (!memcg) {
+		if (!mm)
+			mm = &init_mm;
+
+		rcu_read_lock();
+		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		/*
+		 * For every charge from the cgroup, increment reference count
+		 */
+		css_get(&mem->css);
+		rcu_read_unlock();
+	} else {
+		mem = memcg;
+		css_get(&memcg->css);
+	}
 
 	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (!(gfp_mask & __GFP_WAIT))
@@ -648,7 +654,7 @@ err:
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
 }
 
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -657,7 +663,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!mm)
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE);
+				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+}
+
+int mem_cgroup_getref(struct page *page)
+{
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_subsys.disabled)
+		return 0;
+
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	VM_BUG_ON(!pc);
+	pc->ref_cnt++;
+	unlock_page_cgroup(page);
+	return 0;
 }
 
 /*
@@ -707,65 +728,39 @@ unlock:
 }
 
 /*
- * Returns non-zero if a page (under migration) has valid page_cgroup member.
- * Refcnt of page_cgroup is incremented.
+ * Before starting migration, account against new page.
  */
-int mem_cgroup_prepare_migration(struct page *page)
+int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
+	struct mem_cgroup *mem = NULL;
+	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+	int ret = 0;
 
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
 	lock_page_cgroup(page);
 	pc = page_get_page_cgroup(page);
-	if (pc)
-		pc->ref_cnt++;
+	if (pc) {
+		mem = pc->mem_cgroup;
+		css_get(&mem->css);
+		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
+			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+	}
 	unlock_page_cgroup(page);
-	return pc != NULL;
-}
-
-void mem_cgroup_end_migration(struct page *page)
-{
-	mem_cgroup_uncharge_page(page);
+	if (mem) {
+		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
+			ctype, mem);
+		css_put(&mem->css);
+	}
+	return ret;
 }
 
-/*
- * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
- * And no race with uncharge() routines because page_cgroup for *page*
- * has extra one reference by mem_cgroup_prepare_migration.
- */
-void mem_cgroup_page_migration(struct page *page, struct page *newpage)
+/* remove redundant charge */
+void mem_cgroup_end_migration(struct page *newpage)
 {
-	struct page_cgroup *pc;
-	struct mem_cgroup_per_zone *mz;
-	unsigned long flags;
-
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (!pc) {
-		unlock_page_cgroup(page);
-		return;
-	}
-
-	mz = page_cgroup_zoneinfo(pc);
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_remove_list(mz, pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-
-	pc->page = newpage;
-	lock_page_cgroup(newpage);
-	page_assign_page_cgroup(newpage, pc);
-
-	mz = page_cgroup_zoneinfo(pc);
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_add_list(mz, pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	unlock_page_cgroup(newpage);
+	mem_cgroup_uncharge_page(newpage);
 }
 
 /*
@@ -795,12 +790,19 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 		page = pc->page;
 		get_page(page);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
-		mem_cgroup_uncharge_page(page);
-		put_page(page);
-		if (--count <= 0) {
-			count = FORCE_UNCHARGE_BATCH;
+		/*
+		 * Check if this page is on LRU. !LRU page can be found
+		 * if it's under page migration.
+		 */
+		if (PageLRU(page)) {
+			mem_cgroup_uncharge_page(page);
+			put_page(page);
+			if (--count <= 0) {
+				count = FORCE_UNCHARGE_BATCH;
+				cond_resched();
+			}
+		} else
 			cond_resched();
-		}
 		spin_lock_irqsave(&mz->lru_lock, flags);
 	}
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
diff --git a/mm/migrate.c b/mm/migrate.c
index 376cceba82f9..f6d7f8efd1a8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -358,6 +358,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 
 	write_unlock_irq(&mapping->tree_lock);
+	if (!PageSwapCache(newpage)) {
+		mem_cgroup_uncharge_page(page);
+		mem_cgroup_getref(newpage);
+	}
 
 	return 0;
 }
@@ -611,7 +615,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
 		rc = fallback_migrate_page(mapping, newpage, page);
 
 	if (!rc) {
-		mem_cgroup_page_migration(page, newpage);
 		remove_migration_ptes(page, newpage);
 	} else
 		newpage->mapping = NULL;
@@ -641,6 +644,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		/* page was freed from under us. So we are done. */
 		goto move_newpage;
 
+	charge = mem_cgroup_prepare_migration(page, newpage);
+	if (charge == -ENOMEM) {
+		rc = -ENOMEM;
+		goto move_newpage;
+	}
+	/* prepare cgroup just returns 0 or -ENOMEM */
+	BUG_ON(charge);
+
 	rc = -EAGAIN;
 	if (TestSetPageLocked(page)) {
 		if (!force)
@@ -692,19 +703,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 		goto rcu_unlock;
 	}
 
-	charge = mem_cgroup_prepare_migration(page);
 	/* Establish migration ptes or remove ptes */
 	try_to_unmap(page, 1);
 
 	if (!page_mapped(page))
 		rc = move_to_new_page(newpage, page);
 
-	if (rc) {
+	if (rc)
 		remove_migration_ptes(page, page);
-		if (charge)
-			mem_cgroup_end_migration(page);
-	} else if (charge)
-		mem_cgroup_end_migration(newpage);
 rcu_unlock:
 	if (rcu_locked)
 		rcu_read_unlock();
@@ -725,6 +731,8 @@ unlock:
 	}
 
 move_newpage:
+	if (!charge)
+		mem_cgroup_end_migration(newpage);
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.