author	Michal Hocko <mhocko@suse.cz>	2013-09-12 18:13:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-12 18:38:00 -0400
commit	de57780dc659f95b17ccb649f003278dde0b5b86 (patch)
tree	d2493cc412c16946f3ead9158a61b26dd1f0c45a
parent	a5b7c87f92076352dbff2fe0423ec255e1c9a71b (diff)
memcg: enhance memcg iterator to support predicates
The caller of the iterator might know that some nodes or even subtrees
should be skipped, but there is no way to tell iterators about that, so
the only choice left is to let iterators visit each node and do the
selection outside of the iterating code.  This, however, doesn't scale
well with hierarchies with many groups where only a few groups are
interesting.

This patch adds a mem_cgroup_iter_cond variant of the iterator with a
callback which gets called for every visited node.  There are three
possible ways the callback can influence the walk: the node is visited,
it is skipped but the tree walk continues down the tree, or the whole
subtree of the current group is skipped.

[hughd@google.com: fix memcg-less page reclaim]
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Glauber Costa <glommer@openvz.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/memcontrol.h	49
-rw-r--r--	mm/memcontrol.c	70
-rw-r--r--	mm/vmscan.c	16
3 files changed, 103 insertions, 32 deletions
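To illustrate the new interface, here is a minimal, hypothetical sketch (not part of this patch) of how a caller could plug a predicate into mem_cgroup_iter_cond. Only mem_cgroup_iter_cond, mem_cgroup_iter_filter and the VISIT/SKIP values come from this change; example_filter and example_walk are invented for illustration:

	/* Hypothetical filter: visit only groups over their soft limit. */
	static enum mem_cgroup_filter_t
	example_filter(struct mem_cgroup *memcg, struct mem_cgroup *root)
	{
		if (res_counter_soft_limit_excess(&memcg->res))
			return VISIT;	/* return this group to the caller */
		return SKIP;		/* keep walking, but don't visit it */
	}

	/* Hypothetical walk over root's hierarchy using the filter above. */
	static void example_walk(struct mem_cgroup *root)
	{
		struct mem_cgroup *memcg = NULL;

		/*
		 * A NULL reclaim cookie means a full walk; the filter is
		 * consulted for every node and decides whether that node is
		 * returned to us.
		 */
		while ((memcg = mem_cgroup_iter_cond(root, memcg, NULL,
						     example_filter))) {
			/* ... operate on the visited group ... */
		}
	}

The loop ends when the iterator returns NULL after the whole (filtered) subtree of root has been walked; passing a NULL filter degenerates to the plain mem_cgroup_iter behaviour.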
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d8dd6560621b..d4d1f9b0dbba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -41,6 +41,23 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
+enum mem_cgroup_filter_t {
+	VISIT,		/* visit current node */
+	SKIP,		/* skip the current node and continue traversal */
+	SKIP_TREE,	/* skip the whole subtree and continue traversal */
+};
+
+/*
+ * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
+ * iterate through the hierarchy tree. Each tree element is checked by the
+ * predicate before it is returned by the iterator. If a filter returns
+ * SKIP or SKIP_TREE then the iterator code continues traversal (with the
+ * next node down the hierarchy or the next node that doesn't belong under the
+ * memcg's subtree).
+ */
+typedef enum mem_cgroup_filter_t
+(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
+
 #ifdef CONFIG_MEMCG
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -108,9 +125,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);
 
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-				   struct mem_cgroup *,
-				   struct mem_cgroup_reclaim_cookie *);
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond);
+
+static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim)
+{
+	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
+}
+
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
 /*
@@ -180,7 +206,8 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
@@ -295,6 +322,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 		struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
+static inline struct mem_cgroup *
+mem_cgroup_iter_cond(struct mem_cgroup *root,
+		struct mem_cgroup *prev,
+		struct mem_cgroup_reclaim_cookie *reclaim,
+		mem_cgroup_iter_filter cond)
+{
+	/* first call must return non-NULL, second return NULL */
+	return (struct mem_cgroup *)(unsigned long)!prev;
+}
 
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -358,10 +394,11 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 static inline
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root)
 {
-	return false;
+	return VISIT;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c016e001c5b2..a4bb857d902c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -875,6 +875,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+		mem_cgroup_iter_filter cond)
+{
+	if (!cond)
+		return VISIT;
+	return cond(memcg, root);
+}
+
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -882,7 +891,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
+		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -900,11 +909,31 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		if (css_tryget(&mem->css))
-			return mem;
-		else {
+		switch (mem_cgroup_filter(mem, root, cond)) {
+		case SKIP:
 			prev_css = next_css;
 			goto skip_node;
+		case SKIP_TREE:
+			if (mem == root)
+				return NULL;
+			/*
+			 * css_rightmost_descendant is not an optimal way to
+			 * skip through a subtree (especially for imbalanced
+			 * trees leaning to right) but that's what we have right
+			 * now. More effective solution would be traversing
+			 * right-up for first non-NULL without calling
+			 * css_next_descendant_pre afterwards.
+			 */
+			prev_css = css_rightmost_descendant(next_css);
+			goto skip_node;
+		case VISIT:
+			if (css_tryget(&mem->css))
+				return mem;
+			else {
+				prev_css = next_css;
+				goto skip_node;
+			}
+			break;
 		}
 	}
 
@@ -968,6 +997,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -980,15 +1010,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim)
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled())
-		return NULL;
+	if (mem_cgroup_disabled()) {
+		/* first call must return non-NULL, second return NULL */
+		return (struct mem_cgroup *)(unsigned long)!prev;
+	}
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -999,7 +1032,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		return root;
+		if (mem_cgroup_filter(root, root, cond) == VISIT)
+			return root;
+		return NULL;
 	}
 
 	rcu_read_lock();
@@ -1022,7 +1057,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		last_visited = mem_cgroup_iter_load(iter, root, &seq);
 	}
 
-	memcg = __mem_cgroup_iter_next(root, last_visited);
+	memcg = __mem_cgroup_iter_next(root, last_visited, cond);
 
 	if (reclaim) {
 		mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1033,7 +1068,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 			reclaim->generation = iter->generation;
 		}
 
-		if (prev && !memcg)
+		/*
+		 * We have finished the whole tree walk or no group has been
+		 * visited because filter told us to skip the root node.
+		 */
+		if (!memcg && (prev || (cond && !last_visited)))
 			goto out_unlock;
 	}
 out_unlock:
@@ -1778,13 +1817,14 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
  * a) it is over its soft limit
  * b) any parent up the hierarchy is over its soft limit
  */
-bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 		struct mem_cgroup *root)
 {
 	struct mem_cgroup *parent = memcg;
 
 	if (res_counter_soft_limit_excess(&memcg->res))
-		return true;
+		return VISIT;
 
 	/*
 	 * If any parent up to the root in the hierarchy is over its soft limit
@@ -1792,12 +1832,12 @@ bool mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
 	 */
 	while((parent = parent_mem_cgroup(parent))) {
 		if (res_counter_soft_limit_excess(&parent->res))
-			return true;
+			return VISIT;
 		if (parent == root)
 			break;
 	}
 
-	return false;
+	return SKIP;
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1896e7ca494b..f2e35099508b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2151,21 +2151,16 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 		.zone = zone,
 		.priority = sc->priority,
 	};
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg = NULL;
+	mem_cgroup_iter_filter filter = (soft_reclaim) ?
+			mem_cgroup_soft_reclaim_eligible : NULL;
 
 	nr_reclaimed = sc->nr_reclaimed;
 	nr_scanned = sc->nr_scanned;
 
-	memcg = mem_cgroup_iter(root, NULL, &reclaim);
-	do {
+	while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 		struct lruvec *lruvec;
 
-		if (soft_reclaim &&
-		    !mem_cgroup_soft_reclaim_eligible(memcg, root)) {
-			memcg = mem_cgroup_iter(root, memcg, &reclaim);
-			continue;
-		}
-
 		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 		shrink_lruvec(lruvec, sc);
@@ -2185,8 +2180,7 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 			mem_cgroup_iter_break(root, memcg);
 			break;
 		}
-		memcg = mem_cgroup_iter(root, memcg, &reclaim);
-	} while (memcg);
+	}
 
 	vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 		   sc->nr_scanned - nr_scanned,