author      KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>   2009-04-02 19:57:33 -0400
committer   Linus Torvalds <torvalds@linux-foundation.org>       2009-04-02 22:04:55 -0400
commit      04046e1a0a34286382e913f8fc461440c21d88e8 (patch)
tree        cab2b8a61e7474d509fbd3ea02e38b7c4137ce4b
parent      b4046f00ee7c1e5615261b496cf7309683275b29 (diff)
memcg: use CSS ID
Assign a CSS ID to each memcg and use css_get_next() for scanning the hierarchy.

Assume the following tree:
    group_A (ID=3)
        /01 (ID=4)
            /0A (ID=7)
        /02 (ID=10)
    group_B (ID=5)
and a task in group_A/01/0A hits the limit at group_A.
Reclaim will be done in the following order (round-robin):
    group_A (3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02 (10)
    -> group_A -> .....
Round-robin by ID. The last visited cgroup is recorded, and the scan
restarts from it when reclaim starts again.
(A smarter algorithm could be implemented...)
No cgroup_mutex or hierarchy_mutex is required.
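
For illustration, a minimal user-space C sketch of that round-robin walk
(not kernel code: the "ids" array and get_next_id() are hypothetical
stand-ins for the CSS IDs that css_get_next() reports for group_A's
subtree; only the caching of last_scanned_child mirrors the patch):

/*
 * User-space sketch only -- NOT kernel code.  "ids" stands in for the
 * CSS IDs css_get_next() would report for group_A's subtree, and
 * get_next_id() is a hypothetical stand-in for that lookup.
 */
#include <stdio.h>

static const int ids[] = {3, 4, 7, 10};   /* group_A and descendants */
static const int nr_ids = sizeof(ids) / sizeof(ids[0]);
static int last_scanned_child;            /* 0 means "scan from ID:1" */

/* smallest ID >= nextid, or 0 when the subtree is exhausted */
static int get_next_id(int nextid)
{
        int i;

        for (i = 0; i < nr_ids; i++)
                if (ids[i] >= nextid)
                        return ids[i];
        return 0;
}

static int select_victim(void)
{
        int found;

        for (;;) {
                found = get_next_id(last_scanned_child + 1);
                last_scanned_child = found;   /* 0 restarts the scan */
                if (found)
                        return found;
        }
}

int main(void)
{
        int i;

        /* prints IDs 3 4 7 10 3 4 7 10: round-robin over the subtree */
        for (i = 0; i < 8; i++)
                printf("reclaim from ID %d\n", select_victim());
        return 0;
}

mem_cgroup_select_victim() in the diff below does the same against the
live cgroup tree, with RCU and reclaim_param_lock protecting the walk.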
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--   mm/memcontrol.c   220
1 file changed, 82 insertions(+), 138 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ffec674c5ac..61fd9590c135 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 	return ret;
 }
 
+static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
+{
+	s64 ret;
+
+	ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
+	ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
+	return ret;
+}
+
 /*
  * per-zone information in memory controller.
  */
@@ -154,9 +163,9 @@ struct mem_cgroup {
 
 	/*
 	 * While reclaiming in a hiearchy, we cache the last child we
-	 * reclaimed from. Protected by hierarchy_mutex
+	 * reclaimed from.
 	 */
-	struct mem_cgroup *last_scanned_child;
+	int last_scanned_child;
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -629,103 +638,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
-/*
- * This routine finds the DFS walk successor. This routine should be
- * called with hierarchy_mutex held
- */
-static struct mem_cgroup *
-__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
-{
-	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
-
-	curr_cgroup = curr->css.cgroup;
-	root_cgroup = root_mem->css.cgroup;
-
-	if (!list_empty(&curr_cgroup->children)) {
-		/*
-		 * Walk down to children
-		 */
-		cgroup = list_entry(curr_cgroup->children.next,
-				struct cgroup, sibling);
-		curr = mem_cgroup_from_cont(cgroup);
-		goto done;
-	}
-
-visit_parent:
-	if (curr_cgroup == root_cgroup) {
-		/* caller handles NULL case */
-		curr = NULL;
-		goto done;
-	}
-
-	/*
-	 * Goto next sibling
-	 */
-	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
-		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
-				sibling);
-		curr = mem_cgroup_from_cont(cgroup);
-		goto done;
-	}
-
-	/*
-	 * Go up to next parent and next parent's sibling if need be
-	 */
-	curr_cgroup = curr_cgroup->parent;
-	goto visit_parent;
-
-done:
-	return curr;
-}
-
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
-{
-	struct cgroup *cgroup;
-	struct mem_cgroup *orig, *next;
-	bool obsolete;
-
-	/*
-	 * Scan all children under the mem_cgroup mem
-	 */
-	mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
-
-	orig = root_mem->last_scanned_child;
-	obsolete = mem_cgroup_is_obsolete(orig);
-
-	if (list_empty(&root_mem->css.cgroup->children)) {
-		/*
-		 * root_mem might have children before and last_scanned_child
-		 * may point to one of them. We put it later.
-		 */
-		if (orig)
-			VM_BUG_ON(!obsolete);
-		next = NULL;
-		goto done;
-	}
-
-	if (!orig || obsolete) {
-		cgroup = list_first_entry(&root_mem->css.cgroup->children,
-				struct cgroup, sibling);
-		next = mem_cgroup_from_cont(cgroup);
-	} else
-		next = __mem_cgroup_get_next_node(orig, root_mem);
-
-done:
-	if (next)
-		mem_cgroup_get(next);
-	root_mem->last_scanned_child = next;
-	if (orig)
-		mem_cgroup_put(orig);
-	mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
-	return (next) ? next : root_mem;
-}
-
 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 {
 	if (do_swap_account) {
@@ -755,46 +667,79 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 }
 
 /*
- * Dance down the hierarchy if needed to reclaim memory. We remember the
- * last child we reclaimed from, so that we don't end up penalizing
- * one child extensively based on its position in the children list.
+ * Visit the first child (need not be the first child as per the ordering
+ * of the cgroup list, since we track last_scanned_child) of @mem and use
+ * that to reclaim free pages from.
+ */
+static struct mem_cgroup *
+mem_cgroup_select_victim(struct mem_cgroup *root_mem)
+{
+	struct mem_cgroup *ret = NULL;
+	struct cgroup_subsys_state *css;
+	int nextid, found;
+
+	if (!root_mem->use_hierarchy) {
+		css_get(&root_mem->css);
+		ret = root_mem;
+	}
+
+	while (!ret) {
+		rcu_read_lock();
+		nextid = root_mem->last_scanned_child + 1;
+		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
+				   &found);
+		if (css && css_tryget(css))
+			ret = container_of(css, struct mem_cgroup, css);
+
+		rcu_read_unlock();
+		/* Updates scanning parameter */
+		spin_lock(&root_mem->reclaim_param_lock);
+		if (!css) {
+			/* this means start scan from ID:1 */
+			root_mem->last_scanned_child = 0;
+		} else
+			root_mem->last_scanned_child = found;
+		spin_unlock(&root_mem->reclaim_param_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Scan the hierarchy if needed to reclaim memory. We remember the last child
+ * we reclaimed from, so that we don't end up penalizing one child extensively
+ * based on its position in the children list.
  *
  * root_mem is the original ancestor that we've been reclaim from.
+ *
+ * We give up and return to the caller when we visit root_mem twice.
+ * (other groups can be removed while we're walking....)
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 					   gfp_t gfp_mask, bool noswap)
 {
-	struct mem_cgroup *next_mem;
-	int ret = 0;
-
-	/*
-	 * Reclaim unconditionally and don't check for return value.
-	 * We need to reclaim in the current group and down the tree.
-	 * One might think about checking for children before reclaiming,
-	 * but there might be left over accounting, even after children
-	 * have left.
-	 */
-	ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
-					   get_swappiness(root_mem));
-	if (mem_cgroup_check_under_limit(root_mem))
-		return 1;	/* indicate reclaim has succeeded */
-	if (!root_mem->use_hierarchy)
-		return ret;
-
-	next_mem = mem_cgroup_get_next_node(root_mem);
-
-	while (next_mem != root_mem) {
-		if (mem_cgroup_is_obsolete(next_mem)) {
-			next_mem = mem_cgroup_get_next_node(root_mem);
+	struct mem_cgroup *victim;
+	int ret, total = 0;
+	int loop = 0;
+
+	while (loop < 2) {
+		victim = mem_cgroup_select_victim(root_mem);
+		if (victim == root_mem)
+			loop++;
+		if (!mem_cgroup_local_usage(&victim->stat)) {
+			/* this cgroup's local usage == 0 */
+			css_put(&victim->css);
 			continue;
 		}
-		ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
-						   get_swappiness(next_mem));
+		/* we use swappiness of local cgroup */
+		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
+						   get_swappiness(victim));
+		css_put(&victim->css);
+		total += ret;
 		if (mem_cgroup_check_under_limit(root_mem))
-			return 1;	/* indicate reclaim has succeeded */
-		next_mem = mem_cgroup_get_next_node(root_mem);
+			return 1 + total;
 	}
-	return ret;
+	return total;
 }
 
 bool mem_cgroup_oom_called(struct task_struct *task)
@@ -1324,8 +1269,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-
 	mem_cgroup_charge_statistics(mem, pc, false);
+
 	ClearPageCgroupUsed(pc);
 	/*
 	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
@@ -2178,6 +2123,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
 	int node;
 
+	free_css_id(&mem_cgroup_subsys, &mem->css);
+
 	for_each_node_state(node, N_POSSIBLE)
 		free_mem_cgroup_per_zone_info(mem, node);
 
@@ -2228,11 +2175,12 @@ static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct mem_cgroup *mem, *parent;
+	long error = -ENOMEM;
 	int node;
 
 	mem = mem_cgroup_alloc();
 	if (!mem)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(error);
 
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -2260,7 +2208,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->res, NULL);
 		res_counter_init(&mem->memsw, NULL);
 	}
-	mem->last_scanned_child = NULL;
+	mem->last_scanned_child = 0;
 	spin_lock_init(&mem->reclaim_param_lock);
 
 	if (parent)
@@ -2269,7 +2217,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(error);
 }
 
 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
@@ -2284,12 +2232,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 					struct cgroup *cont)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-	struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
 
-	if (last_scanned_child) {
-		VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
-		mem_cgroup_put(last_scanned_child);
-	}
 	mem_cgroup_put(mem);
 }
 
@@ -2328,6 +2271,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.populate = mem_cgroup_populate,
 	.attach = mem_cgroup_move_task,
 	.early_init = 0,
+	.use_id = 1,
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP