author		Hugh Dickins <hughd@google.com>		2012-03-15 18:17:07 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-15 20:03:03 -0400
commit		59927fb984de1703c67bc640c3e522d8b5276c73 (patch)
tree		bbb2f9d93cebcd04e97f0f0b7d26c12e56ce831e /mm
parent		f1cbd03f5eabb75ea8ace23b47d2209f10871c16 (diff)
memcg: free mem_cgroup by RCU to fix oops
After fixing the GPF in mem_cgroup_lru_del_list(), one machine running a
similar load (moving and removing memcgs while swapping) has oopsed three
times in mem_cgroup_zone_nr_lru_pages(), when retrieving
memcg zone numbers for get_scan_count() for shrink_mem_cgroup_zone():
this is where a struct mem_cgroup is first accessed after being chosen
by mem_cgroup_iter().
Just what protects a struct mem_cgroup from being freed, in between
mem_cgroup_iter()'s css_get_next() and its css_tryget()? css_tryget()
fails once css->refcnt is zero with CSS_REMOVED set in flags, yes: but
what if that memory is freed and reused for something else, which sets
"refcnt" non-zero? Hmm, and scope for an indefinite freeze if refcnt is
left at zero but flags are cleared.
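For orientation, the window in question sits between those two calls inside
mem_cgroup_iter(). The fragment below is a paraphrase of the 3.3-era lookup,
with the reclaim/iterator bookkeeping elided, so it is a sketch of the
pattern rather than the verbatim source:

	/* Paraphrased from mem_cgroup_iter(); bookkeeping elided. */
	rcu_read_lock();
	css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
	/*
	 * Window: css was located via its css_id, but nothing holds a
	 * reference to it yet.  If the mem_cgroup is freed here and its
	 * memory reused, the css_tryget() below may act on garbage.
	 */
	if (css && (css == &root->css || css_tryget(css)))
		memcg = container_of(css, struct mem_cgroup, css);
	rcu_read_unlock();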
It's tempting to move the css_tryget() into css_get_next(), to make it
really "get" the css, but I don't think that actually solves anything:
the same difficulty remains: getting from the css_id that was found to a
stable css.
But we already have rcu_read_lock() around the two, so it's easily fixed
if __mem_cgroup_free() just uses kfree_rcu() to free mem_cgroup.
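As a minimal sketch of the pattern the fix relies on (illustrative names
only, not the memcg code): the object embeds an rcu_head, readers look it up
and try to take a reference under rcu_read_lock(), and the last put defers
the actual kfree() by one grace period with kfree_rcu(), so a reader can
never see the memory reused inside its read-side critical section.

	/* Illustrative only: hypothetical struct foo and lookup_foo(). */
	struct foo {
		atomic_t refcnt;
		struct rcu_head rcu;
	};

	/* Reader: find and try to pin the object under RCU. */
	rcu_read_lock();
	foo = lookup_foo(id);
	if (foo && !atomic_inc_not_zero(&foo->refcnt))
		foo = NULL;		/* lost the race with the last put */
	rcu_read_unlock();

	/* Writer: the last put frees, but only after a grace period. */
	if (atomic_dec_and_test(&foo->refcnt))
		kfree_rcu(foo, rcu);	/* "rcu" names the rcu_head member */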
However, a big struct mem_cgroup is allocated with vzalloc() instead of
kzalloc(), and we're not allowed to vfree() at interrupt time: there
doesn't appear to be a general vfree_rcu() to help with this, so we roll
our own using schedule_work(). The compiler decently removes
vfree_work() and vfree_rcu() when the config doesn't need them.
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/memcontrol.c	53
1 file changed, 47 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d0e57a3cda18..58a08fc7414a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -230,10 +230,30 @@ struct mem_cgroup {
 	 * the counter to account for memory usage
 	 */
 	struct res_counter res;
-	/*
-	 * the counter to account for mem+swap usage.
-	 */
-	struct res_counter memsw;
+
+	union {
+		/*
+		 * the counter to account for mem+swap usage.
+		 */
+		struct res_counter memsw;
+
+		/*
+		 * rcu_freeing is used only when freeing struct mem_cgroup,
+		 * so put it into a union to avoid wasting more memory.
+		 * It must be disjoint from the css field. It could be
+		 * in a union with the res field, but res plays a much
+		 * larger part in mem_cgroup life than memsw, and might
+		 * be of interest, even at time of free, when debugging.
+		 * So share rcu_head with the less interesting memsw.
+		 */
+		struct rcu_head rcu_freeing;
+		/*
+		 * But when using vfree(), that cannot be done at
+		 * interrupt time, so we must then queue the work.
+		 */
+		struct work_struct work_freeing;
+	};
+
 	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
@@ -4780,6 +4800,27 @@ out_free:
 }
 
 /*
+ * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * but in process context. The work_freeing structure is overlaid
+ * on the rcu_freeing structure, which itself is overlaid on memsw.
+ */
+static void vfree_work(struct work_struct *work)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = container_of(work, struct mem_cgroup, work_freeing);
+	vfree(memcg);
+}
+static void vfree_rcu(struct rcu_head *rcu_head)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
+	INIT_WORK(&memcg->work_freeing, vfree_work);
+	schedule_work(&memcg->work_freeing);
+}
+
+/*
  * At destroying mem_cgroup, references from swap_cgroup can remain.
  * (scanning all at force_empty is too costly...)
  *
@@ -4802,9 +4843,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	free_percpu(memcg->stat);
 	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
-		kfree(memcg);
+		kfree_rcu(memcg, rcu_freeing);
 	else
-		vfree(memcg);
+		call_rcu(&memcg->rcu_freeing, vfree_rcu);
 }
 
 static void mem_cgroup_get(struct mem_cgroup *memcg)