aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2011-03-23 19:42:42 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-03-23 22:46:33 -0400
commit5a6475a4e162200f43855e2d42bbf55bcca1a9f2 (patch)
tree361ef686ef833cd1560e884ca1420d512e4d06bb
parent6cfddb261555dd0c0529a5fb7cf8bc5b85ad95a5 (diff)
memcg: fix leak on wrong LRU with FUSE
fs/fuse/dev.c::fuse_try_move_page() does (1) remove a page by ->steal(), (2) re-add the page to the page cache, (3) link the page to the LRU if it was not on the LRU at (1). This implies the page is _on_ the LRU when it's added to the radix-tree. So, the page is added to a memory cgroup while it's on the LRU, because the LRU is lazy and no one flushes it. This is the same behavior as SwapCache and needs special care: - remove the page from the LRU before overwriting pc->mem_cgroup. - add the page to the LRU after overwriting pc->mem_cgroup. We also need to take care of the pagevec. If PageLRU(page) is set before we add the PCG_USED bit, the page will not be added to memcg's LRU (for a short period). So, regardless of the PageLRU(page) value before commit_charge(), we need to check PageLRU(page) after commit_charge(). Addresses https://bugzilla.kernel.org/show_bug.cgi?id=30432 Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Balbir Singh <balbir@in.ibm.com> Reported-by: Daniel Poelzleithner <poelzi@poelzi.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/memcontrol.c70
1 files changed, 52 insertions, 18 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 61ffe712afe0..1f0b460fe58c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -926,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
926} 926}
927 927
928/* 928/*
929 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 929 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
930 * lru because the page may.be reused after it's fully uncharged (because of 930 * while it's linked to lru because the page may be reused after it's fully
931 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 931 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
932 * it again. This function is only used to charge SwapCache. It's done under 932 * It's done under lock_page and expected that zone->lru_lock isnever held.
933 * lock_page and expected that zone->lru_lock is never held.
934 */ 933 */
935static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 934static void mem_cgroup_lru_del_before_commit(struct page *page)
936{ 935{
937 unsigned long flags; 936 unsigned long flags;
938 struct zone *zone = page_zone(page); 937 struct zone *zone = page_zone(page);
939 struct page_cgroup *pc = lookup_page_cgroup(page); 938 struct page_cgroup *pc = lookup_page_cgroup(page);
940 939
940 /*
941 * Doing this check without taking ->lru_lock seems wrong but this
942 * is safe. Because if page_cgroup's USED bit is unset, the page
943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
944 * set, the commit after this will fail, anyway.
945 * All of this charge/uncharge is done under some mutual exclusion.
946 * So, we don't need to take care of changes in the USED bit.
947 */
948 if (likely(!PageLRU(page)))
949 return;
950
941 spin_lock_irqsave(&zone->lru_lock, flags); 951 spin_lock_irqsave(&zone->lru_lock, flags);
942 /* 952 /*
943 * Forget old LRU when this page_cgroup is *not* used. This Used bit 953 * Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -948,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
948 spin_unlock_irqrestore(&zone->lru_lock, flags); 958 spin_unlock_irqrestore(&zone->lru_lock, flags);
949} 959}
950 960
951static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 961static void mem_cgroup_lru_add_after_commit(struct page *page)
952{ 962{
953 unsigned long flags; 963 unsigned long flags;
954 struct zone *zone = page_zone(page); 964 struct zone *zone = page_zone(page);
955 struct page_cgroup *pc = lookup_page_cgroup(page); 965 struct page_cgroup *pc = lookup_page_cgroup(page);
956 966
967 /* taking care of that the page is added to LRU while we commit it */
968 if (likely(!PageLRU(page)))
969 return;
957 spin_lock_irqsave(&zone->lru_lock, flags); 970 spin_lock_irqsave(&zone->lru_lock, flags);
958 /* link when the page is linked to LRU but page_cgroup isn't */ 971 /* link when the page is linked to LRU but page_cgroup isn't */
959 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 972 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -2431,9 +2444,26 @@ static void
2431__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2444__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2432 enum charge_type ctype); 2445 enum charge_type ctype);
2433 2446
2447static void
2448__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2449 enum charge_type ctype)
2450{
2451 struct page_cgroup *pc = lookup_page_cgroup(page);
2452 /*
2453 * In some cases (SwapCache, FUSE (splice_buf->radixtree)), the page
2454 * is already on the LRU. It means the page may be on some other page_cgroup's
2455 * LRU. Take care of it.
2456 */
2457 mem_cgroup_lru_del_before_commit(page);
2458 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2459 mem_cgroup_lru_add_after_commit(page);
2460 return;
2461}
2462
2434int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2463int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2435 gfp_t gfp_mask) 2464 gfp_t gfp_mask)
2436{ 2465{
2466 struct mem_cgroup *mem = NULL;
2437 int ret; 2467 int ret;
2438 2468
2439 if (mem_cgroup_disabled()) 2469 if (mem_cgroup_disabled())
@@ -2468,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2468 if (unlikely(!mm)) 2498 if (unlikely(!mm))
2469 mm = &init_mm; 2499 mm = &init_mm;
2470 2500
2471 if (page_is_file_cache(page)) 2501 if (page_is_file_cache(page)) {
2472 return mem_cgroup_charge_common(page, mm, gfp_mask, 2502 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2473 MEM_CGROUP_CHARGE_TYPE_CACHE); 2503 if (ret || !mem)
2504 return ret;
2474 2505
2506 /*
2507 * FUSE reuses pages without going through the final
2508 * put that would remove them from the LRU list, make
2509 * sure that they get relinked properly.
2510 */
2511 __mem_cgroup_commit_charge_lrucare(page, mem,
2512 MEM_CGROUP_CHARGE_TYPE_CACHE);
2513 return ret;
2514 }
2475 /* shmem */ 2515 /* shmem */
2476 if (PageSwapCache(page)) { 2516 if (PageSwapCache(page)) {
2477 struct mem_cgroup *mem;
2478
2479 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2517 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2480 if (!ret) 2518 if (!ret)
2481 __mem_cgroup_commit_charge_swapin(page, mem, 2519 __mem_cgroup_commit_charge_swapin(page, mem,
@@ -2532,17 +2570,13 @@ static void
2532__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2570__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2533 enum charge_type ctype) 2571 enum charge_type ctype)
2534{ 2572{
2535 struct page_cgroup *pc;
2536
2537 if (mem_cgroup_disabled()) 2573 if (mem_cgroup_disabled())
2538 return; 2574 return;
2539 if (!ptr) 2575 if (!ptr)
2540 return; 2576 return;
2541 cgroup_exclude_rmdir(&ptr->css); 2577 cgroup_exclude_rmdir(&ptr->css);
2542 pc = lookup_page_cgroup(page); 2578
2543 mem_cgroup_lru_del_before_commit_swapcache(page); 2579 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2544 __mem_cgroup_commit_charge(ptr, page, 1, pc, ctype);
2545 mem_cgroup_lru_add_after_commit_swapcache(page);
2546 /* 2580 /*
2547 * Now swap is on-memory. This means this page may be 2581 * Now swap is on-memory. This means this page may be
2548 * counted both as mem and swap....double count. 2582 * counted both as mem and swap....double count.