diff options
author | Hugh Dickins <hugh@veritas.com> | 2008-03-04 17:29:13 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2008-03-04 19:35:15 -0500 |
commit | 2680eed723b664d83e6181ae275fac0ec8fa05ff (patch) | |
tree | f4c137e43c3bdf78e6923bb3aafaf38680c4c301 | |
parent | 6d48ff8bcfd403ec8d3ef7a56538ea9e6f773b9c (diff) |
memcg: fix mem_cgroup_move_lists locking
Ever since the VM_BUG_ON(page_get_page_cgroup(page)) (now Bad page state) went
into page freeing, I've hit it from time to time in testing on some machines,
sometimes only after many days. Recently I found a machine which could usually
reproduce it within a few hours, which finally let me track it down.
The culprit is mem_cgroup_move_lists, whose locking is inadequate; and the
arrangement of structures was such that you got page_cgroups from the lru list
neatly put on to SLUB's freelist. Kamezawa-san identified the same hole
independently.
The main problem was that mem_cgroup_move_lists was missing the
lock_page_cgroup it needs in order to call page_get_page_cgroup safely; but
getting the rest of the locking right is tricky too, and I couldn't do it with
SLAB_DESTROY_BY_RCU as I'd expected. See the comments in the code for the constraints.
This patch immediately gets replaced by a simpler one from Hirokazu-san; but
is it just foolish pride that tells me to put this one on record, in case we
need to come back to it later?
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/memcontrol.c | 49 |
1 files changed, 43 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66d0e84cefa6..dcbe30aad1da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -277,6 +277,11 @@ static void lock_page_cgroup(struct page *page) | |||
277 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 277 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
278 | } | 278 | } |
279 | 279 | ||
280 | static int try_lock_page_cgroup(struct page *page) | ||
281 | { | ||
282 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
283 | } | ||
284 | |||
280 | static void unlock_page_cgroup(struct page *page) | 285 | static void unlock_page_cgroup(struct page *page) |
281 | { | 286 | { |
282 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 287 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
@@ -348,17 +353,49 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
348 | void mem_cgroup_move_lists(struct page *page, bool active) | 353 | void mem_cgroup_move_lists(struct page *page, bool active) |
349 | { | 354 | { |
350 | struct page_cgroup *pc; | 355 | struct page_cgroup *pc; |
356 | struct mem_cgroup *mem; | ||
351 | struct mem_cgroup_per_zone *mz; | 357 | struct mem_cgroup_per_zone *mz; |
352 | unsigned long flags; | 358 | unsigned long flags; |
353 | 359 | ||
354 | pc = page_get_page_cgroup(page); | 360 | /* |
355 | if (!pc) | 361 | * We cannot lock_page_cgroup while holding zone's lru_lock, |
362 | * because other holders of lock_page_cgroup can be interrupted | ||
363 | * with an attempt to rotate_reclaimable_page. But we cannot | ||
364 | * safely get to page_cgroup without it, so just try_lock it: | ||
365 | * mem_cgroup_isolate_pages allows for page left on wrong list. | ||
366 | */ | ||
367 | if (!try_lock_page_cgroup(page)) | ||
356 | return; | 368 | return; |
357 | 369 | ||
358 | mz = page_cgroup_zoneinfo(pc); | 370 | /* |
359 | spin_lock_irqsave(&mz->lru_lock, flags); | 371 | * Now page_cgroup is stable, but we cannot acquire mz->lru_lock |
360 | __mem_cgroup_move_lists(pc, active); | 372 | * while holding it, because mem_cgroup_force_empty_list does the |
361 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 373 | * reverse. Get a hold on the mem_cgroup before unlocking, so that |
374 | * the zoneinfo remains stable, then take mz->lru_lock; then check | ||
375 | * that page still points to pc and pc (even if freed and reassigned | ||
376 | * to that same page meanwhile) still points to the same mem_cgroup. | ||
377 | * Then we know mz still points to the right spinlock, so it's safe | ||
378 | * to move_lists (page->page_cgroup might be reset while we do so, but | ||
379 | * that doesn't matter: pc->page is stable till we drop mz->lru_lock). | ||
380 | * We're being a little naughty not to try_lock_page_cgroup again | ||
381 | * inside there, but we are safe, aren't we? Aren't we? Whistle... | ||
382 | */ | ||
383 | pc = page_get_page_cgroup(page); | ||
384 | if (pc) { | ||
385 | mem = pc->mem_cgroup; | ||
386 | mz = page_cgroup_zoneinfo(pc); | ||
387 | css_get(&mem->css); | ||
388 | |||
389 | unlock_page_cgroup(page); | ||
390 | |||
391 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
392 | if (page_get_page_cgroup(page) == pc && pc->mem_cgroup == mem) | ||
393 | __mem_cgroup_move_lists(pc, active); | ||
394 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
395 | |||
396 | css_put(&mem->css); | ||
397 | } else | ||
398 | unlock_page_cgroup(page); | ||
362 | } | 399 | } |
363 | 400 | ||
364 | /* | 401 | /* |