aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.cz>2013-04-29 18:07:15 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-04-29 18:54:32 -0400
commit542f85f9ae4acd17dca06f4390f54d411a387efd (patch)
tree0e9d90cec45efc1081498bc2f0f3cee9844e76e6 /mm/memcontrol.c
parentc40046f3ad5e877b18cc721aaa7906b98077bc2d (diff)
memcg: rework mem_cgroup_iter to use cgroup iterators
mem_cgroup_iter currently relies on css->id when walking down a group hierarchy tree. This is really awkward because the tree walk depends on the order in which the groups were created. The only guarantee is that a parent node is visited before its children. Example: 1) mkdir -p a a/d a/b/c 2) mkdir -p a/b/c a/d Both will create the same tree but the tree walks will be different: 1) a, d, b, c 2) a, b, c, d Commit 574bd9f7c7c1 ("cgroup: implement generic child / descendant walk macros") has introduced generic cgroup tree walkers which provide either pre-order or post-order tree walk. This patch converts the css->id based iteration to a pre-order tree walk to keep the semantics of the original iterator, where a parent is always visited before its subtree. cgroup_for_each_descendant_pre suggests using post_create and pre_destroy for proper synchronization with group addition resp. removal. This implementation doesn't use those because a new memory cgroup is initialized sufficiently for iteration in mem_cgroup_css_alloc already, and css reference counting enforces that the group is alive for both the last seen cgroup and the found one, resp. it signals that the group is dead and should be skipped. If the reclaim cookie is used we need to store the last visited group into the iterator, so we have to be careful that it doesn't disappear in the meantime. An elevated reference count on the css keeps it alive even though the group has been removed (parked waiting for the last dput so that it can be freed). A per node-zone-prio iter_lock has been introduced to ensure that css_tryget and the update of iter->last_visited happen atomically. Otherwise two racing walkers could both take a reference and only one of them release it, leading to a css leak (which pins the cgroup dentry).
Signed-off-by: Michal Hocko <mhocko@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Li Zefan <lizefan@huawei.com> Cc: Ying Han <yinghan@google.com> Cc: Tejun Heo <htejun@gmail.com> Cc: Glauber Costa <glommer@parallels.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c86
1 files changed, 68 insertions, 18 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661a2c679f64..26a38b7c7739 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -152,10 +152,12 @@ struct mem_cgroup_stat_cpu {
152}; 152};
153 153
154struct mem_cgroup_reclaim_iter { 154struct mem_cgroup_reclaim_iter {
155 /* css_id of the last scanned hierarchy member */ 155 /* last scanned hierarchy member with elevated css ref count */
156 int position; 156 struct mem_cgroup *last_visited;
157 /* scan generation, increased every round-trip */ 157 /* scan generation, increased every round-trip */
158 unsigned int generation; 158 unsigned int generation;
159 /* lock to protect the position and generation */
160 spinlock_t iter_lock;
159}; 161};
160 162
161/* 163/*
@@ -1089,7 +1091,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1089 struct mem_cgroup_reclaim_cookie *reclaim) 1091 struct mem_cgroup_reclaim_cookie *reclaim)
1090{ 1092{
1091 struct mem_cgroup *memcg = NULL; 1093 struct mem_cgroup *memcg = NULL;
1092 int id = 0; 1094 struct mem_cgroup *last_visited = NULL;
1093 1095
1094 if (mem_cgroup_disabled()) 1096 if (mem_cgroup_disabled())
1095 return NULL; 1097 return NULL;
@@ -1098,7 +1100,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1098 root = root_mem_cgroup; 1100 root = root_mem_cgroup;
1099 1101
1100 if (prev && !reclaim) 1102 if (prev && !reclaim)
1101 id = css_id(&prev->css); 1103 last_visited = prev;
1102 1104
1103 if (!root->use_hierarchy && root != root_mem_cgroup) { 1105 if (!root->use_hierarchy && root != root_mem_cgroup) {
1104 if (prev) 1106 if (prev)
@@ -1106,9 +1108,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1106 return root; 1108 return root;
1107 } 1109 }
1108 1110
1111 rcu_read_lock();
1109 while (!memcg) { 1112 while (!memcg) {
1110 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1113 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1111 struct cgroup_subsys_state *css; 1114 struct cgroup_subsys_state *css = NULL;
1112 1115
1113 if (reclaim) { 1116 if (reclaim) {
1114 int nid = zone_to_nid(reclaim->zone); 1117 int nid = zone_to_nid(reclaim->zone);
@@ -1117,31 +1120,74 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1117 1120
1118 mz = mem_cgroup_zoneinfo(root, nid, zid); 1121 mz = mem_cgroup_zoneinfo(root, nid, zid);
1119 iter = &mz->reclaim_iter[reclaim->priority]; 1122 iter = &mz->reclaim_iter[reclaim->priority];
1120 if (prev && reclaim->generation != iter->generation) 1123 spin_lock(&iter->iter_lock);
1121 goto out_css_put; 1124 last_visited = iter->last_visited;
1122 id = iter->position; 1125 if (prev && reclaim->generation != iter->generation) {
1126 if (last_visited) {
1127 css_put(&last_visited->css);
1128 iter->last_visited = NULL;
1129 }
1130 spin_unlock(&iter->iter_lock);
1131 goto out_unlock;
1132 }
1123 } 1133 }
1124 1134
1125 rcu_read_lock(); 1135 /*
1126 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 1136 * Root is not visited by cgroup iterators so it needs an
1127 if (css) { 1137 * explicit visit.
1128 if (css == &root->css || css_tryget(css)) 1138 */
1129 memcg = mem_cgroup_from_css(css); 1139 if (!last_visited) {
1130 } else 1140 css = &root->css;
1131 id = 0; 1141 } else {
1132 rcu_read_unlock(); 1142 struct cgroup *prev_cgroup, *next_cgroup;
1143
1144 prev_cgroup = (last_visited == root) ? NULL
1145 : last_visited->css.cgroup;
1146 next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
1147 root->css.cgroup);
1148 if (next_cgroup)
1149 css = cgroup_subsys_state(next_cgroup,
1150 mem_cgroup_subsys_id);
1151 }
1152
1153 /*
1154 * Even if we found a group we have to make sure it is alive.
1155 * css && !memcg means that the groups should be skipped and
1156 * we should continue the tree walk.
1157 * last_visited css is safe to use because it is protected by
1158 * css_get and the tree walk is rcu safe.
1159 */
1160 if (css == &root->css || (css && css_tryget(css)))
1161 memcg = mem_cgroup_from_css(css);
1133 1162
1134 if (reclaim) { 1163 if (reclaim) {
1135 iter->position = id; 1164 struct mem_cgroup *curr = memcg;
1165
1166 if (last_visited)
1167 css_put(&last_visited->css);
1168
1169 if (css && !memcg)
1170 curr = mem_cgroup_from_css(css);
1171
1172 /* make sure that the cached memcg is not removed */
1173 if (curr)
1174 css_get(&curr->css);
1175 iter->last_visited = curr;
1176
1136 if (!css) 1177 if (!css)
1137 iter->generation++; 1178 iter->generation++;
1138 else if (!prev && memcg) 1179 else if (!prev && memcg)
1139 reclaim->generation = iter->generation; 1180 reclaim->generation = iter->generation;
1181 spin_unlock(&iter->iter_lock);
1182 } else if (css && !memcg) {
1183 last_visited = mem_cgroup_from_css(css);
1140 } 1184 }
1141 1185
1142 if (prev && !css) 1186 if (prev && !css)
1143 goto out_css_put; 1187 goto out_unlock;
1144 } 1188 }
1189out_unlock:
1190 rcu_read_unlock();
1145out_css_put: 1191out_css_put:
1146 if (prev && prev != root) 1192 if (prev && prev != root)
1147 css_put(&prev->css); 1193 css_put(&prev->css);
@@ -5929,8 +5975,12 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5929 return 1; 5975 return 1;
5930 5976
5931 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5977 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5978 int prio;
5979
5932 mz = &pn->zoneinfo[zone]; 5980 mz = &pn->zoneinfo[zone];
5933 lruvec_init(&mz->lruvec); 5981 lruvec_init(&mz->lruvec);
5982 for (prio = 0; prio < DEF_PRIORITY + 1; prio++)
5983 spin_lock_init(&mz->reclaim_iter[prio].iter_lock);
5934 mz->usage_in_excess = 0; 5984 mz->usage_in_excess = 0;
5935 mz->on_tree = false; 5985 mz->on_tree = false;
5936 mz->memcg = memcg; 5986 mz->memcg = memcg;