aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-04-02 19:57:33 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-02 22:04:55 -0400
commit04046e1a0a34286382e913f8fc461440c21d88e8 (patch)
treecab2b8a61e7474d509fbd3ea02e38b7c4137ce4b /mm/memcontrol.c
parentb4046f00ee7c1e5615261b496cf7309683275b29 (diff)
memcg: use CSS ID
Assign a CSS ID to each memcg and use css_get_next() for scanning the hierarchy. Assume the following tree: group_A (ID=3), group_A/01 (ID=4), group_A/01/0A (ID=7), group_A/02 (ID=10), group_B (ID=5), and a task in group_A/01/0A hits the limit at group_A. Reclaim will be done in the following order (round-robin): group_A(3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02(10) -> group_A -> ..... Round robin by ID. The last visited cgroup is recorded, and the scan restarts from it when reclaim starts again. (A smarter algorithm could be implemented.) No cgroup_mutex or hierarchy_mutex is required. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c220
1 files changed, 82 insertions, 138 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ffec674c5ac..61fd9590c135 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
95 return ret; 95 return ret;
96} 96}
97 97
98static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
99{
100 s64 ret;
101
102 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
103 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
104 return ret;
105}
106
98/* 107/*
99 * per-zone information in memory controller. 108 * per-zone information in memory controller.
100 */ 109 */
@@ -154,9 +163,9 @@ struct mem_cgroup {
154 163
155 /* 164 /*
156 * While reclaiming in a hiearchy, we cache the last child we 165 * While reclaiming in a hiearchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex 166 * reclaimed from.
158 */ 167 */
159 struct mem_cgroup *last_scanned_child; 168 int last_scanned_child;
160 /* 169 /*
161 * Should the accounting and control be hierarchical, per subtree? 170 * Should the accounting and control be hierarchical, per subtree?
162 */ 171 */
@@ -629,103 +638,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
629#define mem_cgroup_from_res_counter(counter, member) \ 638#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member) 639 container_of(counter, struct mem_cgroup, member)
631 640
632/*
633 * This routine finds the DFS walk successor. This routine should be
634 * called with hierarchy_mutex held
635 */
636static struct mem_cgroup *
637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
640
641 curr_cgroup = curr->css.cgroup;
642 root_cgroup = root_mem->css.cgroup;
643
644 if (!list_empty(&curr_cgroup->children)) {
645 /*
646 * Walk down to children
647 */
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653
654visit_parent:
655 if (curr_cgroup == root_cgroup) {
656 /* caller handles NULL case */
657 curr = NULL;
658 goto done;
659 }
660
661 /*
662 * Goto next sibling
663 */
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670
671 /*
672 * Go up to next parent and next parent's sibling if need be
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676
677done:
678 return curr;
679}
680
681/*
682 * Visit the first child (need not be the first child as per the ordering
683 * of the cgroup list, since we track last_scanned_child) of @mem and use
684 * that to reclaim free pages from.
685 */
686static struct mem_cgroup *
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{
689 struct cgroup *cgroup;
690 struct mem_cgroup *orig, *next;
691 bool obsolete;
692
693 /*
694 * Scan all children under the mem_cgroup mem
695 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
697
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700
701 if (list_empty(&root_mem->css.cgroup->children)) {
702 /*
703 * root_mem might have children before and last_scanned_child
704 * may point to one of them. We put it later.
705 */
706 if (orig)
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done;
710 }
711
712 if (!orig || obsolete) {
713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 struct cgroup, sibling);
715 next = mem_cgroup_from_cont(cgroup);
716 } else
717 next = __mem_cgroup_get_next_node(orig, root_mem);
718
719done:
720 if (next)
721 mem_cgroup_get(next);
722 root_mem->last_scanned_child = next;
723 if (orig)
724 mem_cgroup_put(orig);
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
726 return (next) ? next : root_mem;
727}
728
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 641static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
730{ 642{
731 if (do_swap_account) { 643 if (do_swap_account) {
@@ -755,46 +667,79 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
755} 667}
756 668
757/* 669/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the 670 * Visit the first child (need not be the first child as per the ordering
759 * last child we reclaimed from, so that we don't end up penalizing 671 * of the cgroup list, since we track last_scanned_child) of @mem and use
760 * one child extensively based on its position in the children list. 672 * that to reclaim free pages from.
673 */
674static struct mem_cgroup *
675mem_cgroup_select_victim(struct mem_cgroup *root_mem)
676{
677 struct mem_cgroup *ret = NULL;
678 struct cgroup_subsys_state *css;
679 int nextid, found;
680
681 if (!root_mem->use_hierarchy) {
682 css_get(&root_mem->css);
683 ret = root_mem;
684 }
685
686 while (!ret) {
687 rcu_read_lock();
688 nextid = root_mem->last_scanned_child + 1;
689 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
690 &found);
691 if (css && css_tryget(css))
692 ret = container_of(css, struct mem_cgroup, css);
693
694 rcu_read_unlock();
695 /* Updates scanning parameter */
696 spin_lock(&root_mem->reclaim_param_lock);
697 if (!css) {
698 /* this means start scan from ID:1 */
699 root_mem->last_scanned_child = 0;
700 } else
701 root_mem->last_scanned_child = found;
702 spin_unlock(&root_mem->reclaim_param_lock);
703 }
704
705 return ret;
706}
707
708/*
709 * Scan the hierarchy if needed to reclaim memory. We remember the last child
710 * we reclaimed from, so that we don't end up penalizing one child extensively
711 * based on its position in the children list.
761 * 712 *
762 * root_mem is the original ancestor that we've been reclaim from. 713 * root_mem is the original ancestor that we've been reclaim from.
714 *
715 * We give up and return to the caller when we visit root_mem twice.
716 * (other groups can be removed while we're walking....)
763 */ 717 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 718static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap) 719 gfp_t gfp_mask, bool noswap)
766{ 720{
767 struct mem_cgroup *next_mem; 721 struct mem_cgroup *victim;
768 int ret = 0; 722 int ret, total = 0;
769 723 int loop = 0;
770 /* 724
771 * Reclaim unconditionally and don't check for return value. 725 while (loop < 2) {
772 * We need to reclaim in the current group and down the tree. 726 victim = mem_cgroup_select_victim(root_mem);
773 * One might think about checking for children before reclaiming, 727 if (victim == root_mem)
774 * but there might be left over accounting, even after children 728 loop++;
775 * have left. 729 if (!mem_cgroup_local_usage(&victim->stat)) {
776 */ 730 /* this cgroup's local usage == 0 */
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 731 css_put(&victim->css);
778 get_swappiness(root_mem));
779 if (mem_cgroup_check_under_limit(root_mem))
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783
784 next_mem = mem_cgroup_get_next_node(root_mem);
785
786 while (next_mem != root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) {
788 next_mem = mem_cgroup_get_next_node(root_mem);
789 continue; 732 continue;
790 } 733 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 734 /* we use swappiness of local cgroup */
792 get_swappiness(next_mem)); 735 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
736 get_swappiness(victim));
737 css_put(&victim->css);
738 total += ret;
793 if (mem_cgroup_check_under_limit(root_mem)) 739 if (mem_cgroup_check_under_limit(root_mem))
794 return 1; /* indicate reclaim has succeeded */ 740 return 1 + total;
795 next_mem = mem_cgroup_get_next_node(root_mem);
796 } 741 }
797 return ret; 742 return total;
798} 743}
799 744
800bool mem_cgroup_oom_called(struct task_struct *task) 745bool mem_cgroup_oom_called(struct task_struct *task)
@@ -1324,8 +1269,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1324 res_counter_uncharge(&mem->res, PAGE_SIZE); 1269 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1270 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1271 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327
1328 mem_cgroup_charge_statistics(mem, pc, false); 1272 mem_cgroup_charge_statistics(mem, pc, false);
1273
1329 ClearPageCgroupUsed(pc); 1274 ClearPageCgroupUsed(pc);
1330 /* 1275 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1276 * pc->mem_cgroup is not cleared here. It will be accessed when it's
@@ -2178,6 +2123,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2178{ 2123{
2179 int node; 2124 int node;
2180 2125
2126 free_css_id(&mem_cgroup_subsys, &mem->css);
2127
2181 for_each_node_state(node, N_POSSIBLE) 2128 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node); 2129 free_mem_cgroup_per_zone_info(mem, node);
2183 2130
@@ -2228,11 +2175,12 @@ static struct cgroup_subsys_state * __ref
2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2175mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229{ 2176{
2230 struct mem_cgroup *mem, *parent; 2177 struct mem_cgroup *mem, *parent;
2178 long error = -ENOMEM;
2231 int node; 2179 int node;
2232 2180
2233 mem = mem_cgroup_alloc(); 2181 mem = mem_cgroup_alloc();
2234 if (!mem) 2182 if (!mem)
2235 return ERR_PTR(-ENOMEM); 2183 return ERR_PTR(error);
2236 2184
2237 for_each_node_state(node, N_POSSIBLE) 2185 for_each_node_state(node, N_POSSIBLE)
2238 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2186 if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -2260,7 +2208,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2260 res_counter_init(&mem->res, NULL); 2208 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL); 2209 res_counter_init(&mem->memsw, NULL);
2262 } 2210 }
2263 mem->last_scanned_child = NULL; 2211 mem->last_scanned_child = 0;
2264 spin_lock_init(&mem->reclaim_param_lock); 2212 spin_lock_init(&mem->reclaim_param_lock);
2265 2213
2266 if (parent) 2214 if (parent)
@@ -2269,7 +2217,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2269 return &mem->css; 2217 return &mem->css;
2270free_out: 2218free_out:
2271 __mem_cgroup_free(mem); 2219 __mem_cgroup_free(mem);
2272 return ERR_PTR(-ENOMEM); 2220 return ERR_PTR(error);
2273} 2221}
2274 2222
2275static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2223static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
@@ -2284,12 +2232,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2284 struct cgroup *cont) 2232 struct cgroup *cont)
2285{ 2233{
2286 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2234 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2287 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2288 2235
2289 if (last_scanned_child) {
2290 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2291 mem_cgroup_put(last_scanned_child);
2292 }
2293 mem_cgroup_put(mem); 2236 mem_cgroup_put(mem);
2294} 2237}
2295 2238
@@ -2328,6 +2271,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
2328 .populate = mem_cgroup_populate, 2271 .populate = mem_cgroup_populate,
2329 .attach = mem_cgroup_move_task, 2272 .attach = mem_cgroup_move_task,
2330 .early_init = 0, 2273 .early_init = 0,
2274 .use_id = 1,
2331}; 2275};
2332 2276
2333#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2277#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP