author     Hugh Dickins <hughd@google.com>                 2012-03-05 17:59:16 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-03-05 18:49:43 -0500
commit     9ce70c0240d01309b34712f87eda4fbfba3c3764 (patch)
tree       be59ba1d702631aab6e5f5c12b0ec5ebe15b10b2 /mm
parent     73737b878767ef441d7cc34ebeeba01dd0a68dd6 (diff)
memcg: fix deadlock by inverting lrucare nesting
We have forgotten the rules of lock nesting: the irq-safe ones must be
taken inside the non-irq-safe ones, otherwise we are open to deadlock:

CPU0                                CPU1
----                                ----
lock(&(&pc->lock)->rlock);
                                    local_irq_disable();
                                    lock(&(&zone->lru_lock)->rlock);
                                    lock(&(&pc->lock)->rlock);
<Interrupt>
lock(&(&zone->lru_lock)->rlock);

To check a different locking issue, I happened to add a spin_lock to
memcg's bit_spin_lock in lock_page_cgroup(), and lockdep very quickly
complained about __mem_cgroup_commit_charge_lrucare() (on CPU1 above).

So delete __mem_cgroup_commit_charge_lrucare(), passing a bool lrucare
to __mem_cgroup_commit_charge() instead, taking zone->lru_lock under
lock_page_cgroup() in the lrucare case.

The original was using spin_lock_irqsave, but we'd be in more trouble
if it were ever called at interrupt time: unconditional _irq is enough.
And ClearPageLRU before del from lru, SetPageLRU before add to lru: no
strong reason, but that is the ordering used consistently elsewhere.

Fixes 36b62ad539498d00c2d280a151a ("memcg: simplify corner case handling
of LRU").

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
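To make the nesting rule concrete, here is a minimal sketch (an
illustration only, not part of the patch; lock and helper names are the
ones used in mm/memcontrol.c) of the old ordering against the one this
commit establishes:

	/* Old lrucare path: irq-safe lock taken OUTSIDE the non-irq-safe
	 * one.  A CPU holding only lock_page_cgroup() (taken elsewhere
	 * with irqs enabled) can be interrupted, and the interrupt may
	 * spin on zone->lru_lock while zone->lru_lock's holder spins on
	 * lock_page_cgroup(): the AB-BA cycle shown above.
	 */
	spin_lock_irqsave(&zone->lru_lock, flags);	/* irqs off (CPU1 above) */
	lock_page_cgroup(pc);				/* non-irq-safe bit spinlock */
	/* ... */
	unlock_page_cgroup(pc);
	spin_unlock_irqrestore(&zone->lru_lock, flags);

	/* New ordering: irq-safe lock strictly INSIDE the non-irq-safe
	 * one.  No path takes lock_page_cgroup() while holding
	 * zone->lru_lock, so a zone->lru_lock holder never waits on a
	 * page_cgroup holder and the cycle cannot form.
	 */
	lock_page_cgroup(pc);
	spin_lock_irq(&zone->lru_lock);	/* unconditional _irq: never at interrupt time */
	/* ... LRU manipulation and pc->mem_cgroup update ... */
	spin_unlock_irq(&zone->lru_lock);
	unlock_page_cgroup(pc);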
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c  72
1 file changed, 37 insertions(+), 35 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 228d6461c12a..1097d8098f8c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2408,8 +2408,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
                                        struct page *page,
                                        unsigned int nr_pages,
                                        struct page_cgroup *pc,
-                                       enum charge_type ctype)
+                                       enum charge_type ctype,
+                                       bool lrucare)
 {
+	struct zone *uninitialized_var(zone);
+	bool was_on_lru = false;
+
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
@@ -2420,6 +2424,21 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	 * we don't need page_cgroup_lock about tail pages, becase they are not
 	 * accessed by any other context at this point.
 	 */
+
+	/*
+	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
+	 * may already be on some other mem_cgroup's LRU. Take care of it.
+	 */
+	if (lrucare) {
+		zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page)) {
+			ClearPageLRU(page);
+			del_page_from_lru_list(zone, page, page_lru(page));
+			was_on_lru = true;
+		}
+	}
+
 	pc->mem_cgroup = memcg;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2443,9 +2462,18 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 		break;
 	}
 
+	if (lrucare) {
+		if (was_on_lru) {
+			VM_BUG_ON(PageLRU(page));
+			SetPageLRU(page);
+			add_page_to_lru_list(zone, page, page_lru(page));
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
 	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
 	unlock_page_cgroup(pc);
-	WARN_ON_ONCE(PageLRU(page));
+
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
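Read as straight-line code (a condensed paraphrase of the two hunks
above, not an excerpt), the lrucare case of __mem_cgroup_commit_charge()
now does:

	lock_page_cgroup(pc);			/* non-irq-safe, outermost */
	if (lrucare) {
		zone = page_zone(page);
		spin_lock_irq(&zone->lru_lock);	/* irq-safe, nested inside */
		if (PageLRU(page)) {
			ClearPageLRU(page);	/* clear flag before del from lru */
			del_page_from_lru_list(zone, page, page_lru(page));
			was_on_lru = true;
		}
	}
	pc->mem_cgroup = memcg;	/* recharge while the page is off every LRU */
	if (lrucare) {
		if (was_on_lru) {
			SetPageLRU(page);	/* set flag before add to lru */
			add_page_to_lru_list(zone, page, page_lru(page));
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	unlock_page_cgroup(pc);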
@@ -2643,7 +2671,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
 	if (ret == -ENOMEM)
 		return ret;
-	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
+	__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
 	return 0;
 }
 
@@ -2663,35 +2691,6 @@ static void
 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 				  enum charge_type ctype);
 
-static void
-__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
-					enum charge_type ctype)
-{
-	struct page_cgroup *pc = lookup_page_cgroup(page);
-	struct zone *zone = page_zone(page);
-	unsigned long flags;
-	bool removed = false;
-
-	/*
-	 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
-	 * is already on LRU. It means the page may on some other page_cgroup's
-	 * LRU. Take care of it.
-	 */
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	if (PageLRU(page)) {
-		del_page_from_lru_list(zone, page, page_lru(page));
-		ClearPageLRU(page);
-		removed = true;
-	}
-	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
-	if (removed) {
-		add_page_to_lru_list(zone, page, page_lru(page));
-		SetPageLRU(page);
-	}
-	spin_unlock_irqrestore(&zone->lru_lock, flags);
-	return;
-}
-
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 			    gfp_t gfp_mask)
 {
@@ -2769,13 +2768,16 @@ static void
 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 				  enum charge_type ctype)
 {
+	struct page_cgroup *pc;
+
 	if (mem_cgroup_disabled())
 		return;
 	if (!memcg)
 		return;
 	cgroup_exclude_rmdir(&memcg->css);
 
-	__mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
+	pc = lookup_page_cgroup(page);
+	__mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
 	/*
 	 * Now swap is on-memory. This means this page may be
 	 * counted both as mem and swap....double count.
@@ -3248,7 +3250,7 @@ int mem_cgroup_prepare_migration(struct page *page,
 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype);
+	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
 	return ret;
 }
 
@@ -3332,7 +3334,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
 	 * the newpage may be on LRU(or pagevec for LRU) already. We lock
 	 * LRU while we overwrite pc->mem_cgroup.
 	 */
-	__mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
+	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
 }
 
 #ifdef CONFIG_DEBUG_VM