author     Michal Hocko <mhocko@suse.cz>                        2013-04-29 18:07:17 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>       2013-04-29 18:54:32 -0400
commit     5f57816197186378449b848e99ca9cf41a0055f1 (patch)
tree       70a5d1b5144f180c4d56861b6768e60d34d873d1 /mm
parent     542f85f9ae4acd17dca06f4390f54d411a387efd (diff)
memcg: relax memcg iter caching
Now that the per-node-zone-priority iterator caches memory cgroups rather
than their css ids, we have to be careful and remove them from the iterator
when they are on the way out, otherwise they might live for an unbounded
amount of time even though their group is already gone (until the
global/targeted reclaim triggers the zone under that priority, finds out
the group is dead and lets it find its final rest).

We can fix this issue by relaxing the rules for the last_visited memcg.
Instead of taking a reference to the css before it is stored into
iter->last_visited, we can just store its pointer and track the number of
removed groups in each memcg's subhierarchy. This number is stored into the
iterator every time a memcg is cached. If the count doesn't match the
current walker root's one, we start from the root again. The group counter
is incremented upwards the hierarchy every time a group is removed.

The iter_lock can be dropped because racing iterators cannot leak the
reference anymore, as the reference count is no longer elevated for
last_visited while it is cached.

Locking rules got a bit more complicated by this change, though. The
iterator primarily relies on the RCU read lock, which makes sure that once
we see a valid last_visited pointer it stays valid for the whole RCU walk.
smp_rmb makes sure that dead_count is read before last_visited and
last_dead_count, while smp_wmb makes sure that last_visited is updated
before last_dead_count, so an up-to-date last_dead_count cannot point to an
outdated last_visited. css_tryget then makes sure that last_visited is
still alive in case the iteration races with the cached group's removal
(the css is invalidated before mem_cgroup_css_offline increments
dead_count).

In short:

mem_cgroup_iter
 rcu_read_lock()
 dead_count = atomic_read(parent->dead_count)
 smp_rmb()
 if (dead_count != iter->last_dead_count)
         last_visited POSSIBLY INVALID -> last_visited = NULL
 if (!css_tryget(iter->last_visited))
         last_visited DEAD -> last_visited = NULL
 next = find_next(last_visited)
 css_tryget(next)
 css_put(last_visited)   // css would be invalidated and parent->dead_count
                         // incremented if this was the last reference
 iter->last_visited = next
 smp_wmb()
 iter->last_dead_count = dead_count
 rcu_read_unlock()

cgroup_rmdir
 cgroup_destroy_locked
  atomic_add(CSS_DEACT_BIAS, &css->refcnt)
  // subsequent css_tryget fail
   mem_cgroup_css_offline
    mem_cgroup_invalidate_reclaim_iterators
     while (parent = parent_mem_cgroup)
      atomic_inc(parent->dead_count)
    css_put(css)   // last reference held by cgroup core

Spotted by Ying Han. Original idea from Johannes Weiner.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ying Han <yinghan@google.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Glauber Costa <glommer@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
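Editorial note: for readers unfamiliar with the barrier pairing described above, the following is a minimal userspace sketch of the same publish/validate protocol, written with C11 release/acquire atomics in place of the kernel's smp_wmb()/smp_rmb() and css_tryget(). All identifiers (struct group, struct cache_slot, try_get, snapshot_position, publish_position) are illustrative stand-ins and do not appear in mm/memcontrol.c; this is an analogy to the patch's scheme, not the patch itself.

#include <stdatomic.h>
#include <stddef.h>

struct group {
	atomic_int refcnt;		/* stands in for the css reference count */
	atomic_ulong dead_count;	/* removed groups in this subhierarchy */
};

struct cache_slot {
	struct group *_Atomic last_visited;	/* cached without holding a reference */
	atomic_ulong last_dead_count;		/* root dead_count seen at caching time */
};

/* Take a reference unless the group is already on its way out (cf. css_tryget). */
static int try_get(struct group *g)
{
	int ref = atomic_load_explicit(&g->refcnt, memory_order_relaxed);

	while (ref > 0)
		if (atomic_compare_exchange_weak_explicit(&g->refcnt, &ref, ref + 1,
				memory_order_acquire, memory_order_relaxed))
			return 1;
	return 0;	/* refcount already dropped to zero, group is dead */
}

/*
 * Reader: reuse the cached position only if no group under @root died since
 * it was cached.  The acquire loads play the role of smp_rmb(): dead_count
 * and last_dead_count are read before last_visited.
 */
static struct group *snapshot_position(struct cache_slot *slot,
				       struct group *root, unsigned long *dead)
{
	struct group *pos;

	*dead = atomic_load_explicit(&root->dead_count, memory_order_acquire);
	if (*dead != atomic_load_explicit(&slot->last_dead_count,
					  memory_order_acquire))
		return NULL;		/* possibly stale, restart from the root */

	pos = atomic_load_explicit(&slot->last_visited, memory_order_relaxed);
	if (pos && !try_get(pos))
		return NULL;		/* cached group is already dead */
	return pos;
}

/*
 * Writer: publish a new position.  The release store of last_dead_count
 * plays the role of smp_wmb(): last_visited is made visible before the
 * count, so a matching count never exposes an older pointer.
 */
static void publish_position(struct cache_slot *slot, struct group *pos,
			     unsigned long dead)
{
	atomic_store_explicit(&slot->last_visited, pos, memory_order_relaxed);
	atomic_store_explicit(&slot->last_dead_count, dead, memory_order_release);
}

Note that a reference is only taken on the group handed back to the caller, never on the cached pointer itself; that is precisely what allows a removed group to be detected through the counter instead of being pinned for an unbounded time.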
Diffstat (limited to 'mm')
-rw-r--r--   mm/memcontrol.c   69
1 file changed, 52 insertions(+), 17 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26a38b7c7739..408a5c75d77d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -152,12 +152,15 @@ struct mem_cgroup_stat_cpu {
 };
 
 struct mem_cgroup_reclaim_iter {
-	/* last scanned hierarchy member with elevated css ref count */
+	/*
+	 * last scanned hierarchy member. Valid only if last_dead_count
+	 * matches memcg->dead_count of the hierarchy root group.
+	 */
 	struct mem_cgroup *last_visited;
+	unsigned long last_dead_count;
+
 	/* scan generation, increased every round-trip */
 	unsigned int generation;
-	/* lock to protect the position and generation */
-	spinlock_t iter_lock;
 };
 
 /*
@@ -337,6 +340,7 @@ struct mem_cgroup {
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
 
+	atomic_t	dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct tcp_memcontrol tcp_mem;
 #endif
@@ -1092,6 +1096,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
+	unsigned long uninitialized_var(dead_count);
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1120,16 +1125,33 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 
 		mz = mem_cgroup_zoneinfo(root, nid, zid);
 		iter = &mz->reclaim_iter[reclaim->priority];
-		spin_lock(&iter->iter_lock);
 		last_visited = iter->last_visited;
 		if (prev && reclaim->generation != iter->generation) {
-			if (last_visited) {
-				css_put(&last_visited->css);
-				iter->last_visited = NULL;
-			}
-			spin_unlock(&iter->iter_lock);
+			iter->last_visited = NULL;
 			goto out_unlock;
 		}
+
+		/*
+		 * If the dead_count mismatches, a destruction
+		 * has happened or is happening concurrently.
+		 * If the dead_count matches, a destruction
+		 * might still happen concurrently, but since
+		 * we checked under RCU, that destruction
+		 * won't free the object until we release the
+		 * RCU reader lock.  Thus, the dead_count
+		 * check verifies the pointer is still valid,
+		 * css_tryget() verifies the cgroup pointed to
+		 * is alive.
+		 */
+		dead_count = atomic_read(&root->dead_count);
+		smp_rmb();
+		last_visited = iter->last_visited;
+		if (last_visited) {
+			if ((dead_count != iter->last_dead_count) ||
+				!css_tryget(&last_visited->css)) {
+				last_visited = NULL;
+			}
+		}
 	}
 
 	/*
@@ -1169,16 +1191,14 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		if (css && !memcg)
 			curr = mem_cgroup_from_css(css);
 
-		/* make sure that the cached memcg is not removed */
-		if (curr)
-			css_get(&curr->css);
 		iter->last_visited = curr;
+		smp_wmb();
+		iter->last_dead_count = dead_count;
 
 		if (!css)
 			iter->generation++;
 		else if (!prev && memcg)
 			reclaim->generation = iter->generation;
-		spin_unlock(&iter->iter_lock);
 	} else if (css && !memcg) {
 		last_visited = mem_cgroup_from_css(css);
 	}
@@ -5975,12 +5995,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 		return 1;
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-		int prio;
-
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
-		for (prio = 0; prio < DEF_PRIORITY + 1; prio++)
-			spin_lock_init(&mz->reclaim_iter[prio].iter_lock);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
@@ -6235,10 +6251,29 @@ mem_cgroup_css_online(struct cgroup *cont)
 	return error;
 }
 
+/*
+ * Announce all parents that a group from their hierarchy is gone.
+ */
+static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *parent = memcg;
+
+	while ((parent = parent_mem_cgroup(parent)))
+		atomic_inc(&parent->dead_count);
+
+	/*
+	 * if the root memcg is not hierarchical we have to check it
+	 * explicitely.
+	 */
+	if (!root_mem_cgroup->use_hierarchy)
+		atomic_inc(&root_mem_cgroup->dead_count);
+}
+
 static void mem_cgroup_css_offline(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
 	mem_cgroup_destroy_all_caches(memcg);
 }
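Editorial note: the removal side of the earlier userspace sketch, for completeness. A reclaim iterator may be rooted at any ancestor of the group being removed, which is why mem_cgroup_invalidate_reclaim_iterators above walks the whole chain of parents: bumping every ancestor's counter makes every cache that could hold the dying group stale at once. The names below (invalidate_cached_positions, root_is_flat, the parent link) are illustrative stand-ins, not the kernel's.

#include <stdatomic.h>
#include <stddef.h>

/* Same illustrative group as in the earlier sketch, plus a parent link. */
struct group {
	struct group *parent;		/* NULL above the hierarchy root */
	atomic_ulong dead_count;	/* bumped once per dead descendant */
};

/*
 * Called when @victim goes away.  Any cache slot whose root is an ancestor
 * of @victim might hold a pointer to it, so every ancestor's dead_count is
 * incremented and all of those cached positions become invalid together.
 */
static void invalidate_cached_positions(struct group *root,
					struct group *victim, int root_is_flat)
{
	struct group *parent = victim;

	while ((parent = parent->parent) != NULL)
		atomic_fetch_add_explicit(&parent->dead_count, 1,
					  memory_order_release);

	/*
	 * Counterpart of the !use_hierarchy case in the hunk above: a flat
	 * root is not reached by the parent walk but its iterators can still
	 * have cached the victim, so its counter is bumped explicitly.
	 */
	if (root_is_flat)
		atomic_fetch_add_explicit(&root->dead_count, 1,
					  memory_order_release);
}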