aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author	Hugh Dickins <hughd@google.com>	2012-03-15 18:17:07 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-15 20:03:03 -0400
commit	59927fb984de1703c67bc640c3e522d8b5276c73 (patch)
tree	bbb2f9d93cebcd04e97f0f0b7d26c12e56ce831e /mm
parent	f1cbd03f5eabb75ea8ace23b47d2209f10871c16 (diff)
memcg: free mem_cgroup by RCU to fix oops
After fixing the GPF in mem_cgroup_lru_del_list(), three times one machine running a similar load (moving and removing memcgs while swapping) has oopsed in mem_cgroup_zone_nr_lru_pages(), when retrieving memcg zone numbers for get_scan_count() for shrink_mem_cgroup_zone(): this is where a struct mem_cgroup is first accessed after being chosen by mem_cgroup_iter().

Just what protects a struct mem_cgroup from being freed, in between mem_cgroup_iter()'s css_get_next() and its css_tryget()? css_tryget() fails once css->refcnt is zero with CSS_REMOVED set in flags, yes: but what if that memory is freed and reused for something else, which sets "refcnt" non-zero? Hmm, and scope for an indefinite freeze if refcnt is left at zero but flags are cleared.

It's tempting to move the css_tryget() into css_get_next(), to make it really "get" the css, but I don't think that actually solves anything: the same difficulty in moving from css_id found to stable css remains.

But we already have rcu_read_lock() around the two, so it's easily fixed if __mem_cgroup_free() just uses kfree_rcu() to free mem_cgroup.

However, a big struct mem_cgroup is allocated with vzalloc() instead of kzalloc(), and we're not allowed to vfree() at interrupt time: there doesn't appear to be a general vfree_rcu() to help with this, so roll our own using schedule_work().  The compiler decently removes vfree_work() and vfree_rcu() when the config doesn't need them.

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	53
1 file changed, 47 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d0e57a3cda18..58a08fc7414a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -230,10 +230,30 @@ struct mem_cgroup {
230 * the counter to account for memory usage 230 * the counter to account for memory usage
231 */ 231 */
232 struct res_counter res; 232 struct res_counter res;
233 /* 233
234 * the counter to account for mem+swap usage. 234 union {
235 */ 235 /*
236 struct res_counter memsw; 236 * the counter to account for mem+swap usage.
237 */
238 struct res_counter memsw;
239
240 /*
241 * rcu_freeing is used only when freeing struct mem_cgroup,
242 * so put it into a union to avoid wasting more memory.
243 * It must be disjoint from the css field. It could be
244 * in a union with the res field, but res plays a much
245 * larger part in mem_cgroup life than memsw, and might
246 * be of interest, even at time of free, when debugging.
247 * So share rcu_head with the less interesting memsw.
248 */
249 struct rcu_head rcu_freeing;
250 /*
251 * But when using vfree(), that cannot be done at
252 * interrupt time, so we must then queue the work.
253 */
254 struct work_struct work_freeing;
255 };
256
237 /* 257 /*
238 * Per cgroup active and inactive list, similar to the 258 * Per cgroup active and inactive list, similar to the
239 * per zone LRU lists. 259 * per zone LRU lists.
@@ -4780,6 +4800,27 @@ out_free:
4780} 4800}
4781 4801
4782/* 4802/*
4803 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
4804 * but in process context. The work_freeing structure is overlaid
4805 * on the rcu_freeing structure, which itself is overlaid on memsw.
4806 */
4807static void vfree_work(struct work_struct *work)
4808{
4809 struct mem_cgroup *memcg;
4810
4811 memcg = container_of(work, struct mem_cgroup, work_freeing);
4812 vfree(memcg);
4813}
4814static void vfree_rcu(struct rcu_head *rcu_head)
4815{
4816 struct mem_cgroup *memcg;
4817
4818 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4819 INIT_WORK(&memcg->work_freeing, vfree_work);
4820 schedule_work(&memcg->work_freeing);
4821}
4822
4823/*
4783 * At destroying mem_cgroup, references from swap_cgroup can remain. 4824 * At destroying mem_cgroup, references from swap_cgroup can remain.
4784 * (scanning all at force_empty is too costly...) 4825 * (scanning all at force_empty is too costly...)
4785 * 4826 *
@@ -4802,9 +4843,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4802 4843
4803 free_percpu(memcg->stat); 4844 free_percpu(memcg->stat);
4804 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4845 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4805 kfree(memcg); 4846 kfree_rcu(memcg, rcu_freeing);
4806 else 4847 else
4807 vfree(memcg); 4848 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4808} 4849}
4809 4850
4810static void mem_cgroup_get(struct mem_cgroup *memcg) 4851static void mem_cgroup_get(struct mem_cgroup *memcg)