author	Hugh Dickins <hughd@google.com>	2012-03-05 17:59:18 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-05 18:49:43 -0500
commit	7512102cf64d36e3c7444480273623c7aab3563f (patch)
tree	4c5b589455ed0d343384d7eeaedfab5057648a0d
parent	9f78ff005a6b6313728247113948450b2adddde8 (diff)
memcg: fix GPF when cgroup removal races with last exit
When moving tasks from old memcg (with move_charge_at_immigrate on new memcg), followed by removal of old memcg, hit General Protection Fault in mem_cgroup_lru_del_list() (called from release_pages called from free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from exit_mmap from mmput from exit_mm from do_exit).

Somewhat reproducible, takes a few hours: the old struct mem_cgroup has been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is still trying to update its stats, and take the page off lru before freeing.

A task, or a charge, or a page on lru: each secures a memcg against removal.  In this case, the last task has been moved out of the old memcg, and it is exiting: anonymous pages are uncharged one by one from the memcg, as they are zapped from its pagetables, so the charge gets down to 0; but the pages themselves are queued in an mmu_gather for freeing.

Most of those pages will be on lru (and force_empty is careful to lru_add_drain_all, to add pages from pagevec to lru first), but not necessarily all: perhaps some have been isolated for page reclaim, perhaps some isolated for other reasons.  So, force_empty may find no task, no charge and no page on lru, and let the removal proceed.

There would still be no problem if these pages were immediately freed; but typically (and the put_page_testzero protocol demands it) they have to be added back to lru before they are found freeable, then removed from lru and freed.  We don't see the issue when adding, because the mem_cgroup_iter() loops keep their own reference to the memcg being scanned; but when it comes to mem_cgroup_lru_del_list(), nothing holds a reference and the old memcg may already have been freed.

I believe this was not an issue in v3.2: there, the PageCgroupAcctLRU and PageCgroupUsed flags were used (like a trick with mirrors) to deflect the view of pc->mem_cgroup to the stable root_mem_cgroup when neither was set.  38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully removed those convolutions, but left this General Protection Fault.

But it's surprisingly easy to restore the old behaviour: just check PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec to add), and reset pc->mem_cgroup to root_mem_cgroup if the page is uncharged.  A risky change?  Just going back to how it worked before; testing, and an audit of uses of pc->mem_cgroup, show no problem.

And there's a nice bonus: with mem_cgroup_lru_add_list() itself making sure that an uncharged page goes to the root lru, mem_cgroup_reset_owner() no longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg: clear pc->mem_cgroup if necessary").

Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c is not strictly necessary: the lru_lock there, with RCU before memcg structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe without that; but it seems cleaner to rely on one dependency less.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
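[Editor's sketch] For quick reference, a minimal sketch of the fixed lruvec-selection path described above.  Only the PageCgroupUsed check and its comment are what this patch adds (see the mm/memcontrol.c hunk below); the surrounding function body is reconstructed from the v3.3-era mem_cgroup_lru_add_list() for context and is not taken verbatim from this diff.

/* Illustrative sketch only -- the authoritative change is the diff below. */
struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
				       enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return &zone->lruvec;

	pc = lookup_page_cgroup(page);
	memcg = pc->mem_cgroup;

	/*
	 * The fix: an uncharged page off lru does nothing to keep its
	 * former mem_cgroup alive, so deflect it to root_mem_cgroup
	 * before the stats update and lruvec lookup below.  lru_lock
	 * (held by the caller) and the page_cgroup lock guarding
	 * PageCgroupUsed together make this safe.
	 */
	if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
		pc->mem_cgroup = memcg = root_mem_cgroup;

	mz = page_cgroup_zoneinfo(memcg, page);
	/* compound_order() is stabilized through lru_lock */
	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
	return &mz->lruvec;
}

With every uncharged page deflected to the root lru here, the mem_cgroup_reset_owner() calls scattered through ksm.c, migrate.c and swap_state.c become redundant, which is why the hunks below simply delete them.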
-rw-r--r--	include/linux/memcontrol.h	 5
-rw-r--r--	mm/ksm.c	11
-rw-r--r--	mm/memcontrol.c	30
-rw-r--r--	mm/migrate.c	 2
-rw-r--r--	mm/swap.c	 8
-rw-r--r--	mm/swap_state.c	10
6 files changed, 18 insertions(+), 48 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4d34356fe644..b80de520670b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -129,7 +129,6 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);
 
-extern void mem_cgroup_reset_owner(struct page *page);
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern int do_swap_account;
 #endif
@@ -392,10 +391,6 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
 				struct page *newpage)
 {
 }
-
-static inline void mem_cgroup_reset_owner(struct page *page)
-{
-}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
diff --git a/mm/ksm.c b/mm/ksm.c
index 1925ffbfb27f..310544a379ae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -28,7 +28,6 @@
 #include <linux/kthread.h>
 #include <linux/wait.h>
 #include <linux/slab.h>
-#include <linux/memcontrol.h>
 #include <linux/rbtree.h>
 #include <linux/memory.h>
 #include <linux/mmu_notifier.h>
@@ -1572,16 +1571,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
 
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (new_page) {
-		/*
-		 * The memcg-specific accounting when moving
-		 * pages around the LRU lists relies on the
-		 * page's owner (memcg) to be valid. Usually,
-		 * pages are assigned to a new owner before
-		 * being put on the LRU list, but since this
-		 * is not the case here, the stale owner from
-		 * a previous allocation cycle must be reset.
-		 */
-		mem_cgroup_reset_owner(new_page);
 		copy_user_highpage(new_page, page, address, vma);
 
 		SetPageDirty(new_page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1097d8098f8c..d0e57a3cda18 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1042,6 +1042,19 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
 
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
+
+	/*
+	 * Surreptitiously switch any uncharged page to root:
+	 * an uncharged page off lru does nothing to secure
+	 * its former mem_cgroup from sudden removal.
+	 *
+	 * Our caller holds lru_lock, and PageCgroupUsed is updated
+	 * under page_cgroup lock: between them, they make all uses
+	 * of pc->mem_cgroup safe.
+	 */
+	if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
+		pc->mem_cgroup = memcg = root_mem_cgroup;
+
 	mz = page_cgroup_zoneinfo(memcg, page);
 	/* compound_order() is stabilized through lru_lock */
 	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
@@ -3029,23 +3042,6 @@ void mem_cgroup_uncharge_end(void)
 	batch->memcg = NULL;
 }
 
-/*
- * A function for resetting pc->mem_cgroup for newly allocated pages.
- * This function should be called if the newpage will be added to LRU
- * before start accounting.
- */
-void mem_cgroup_reset_owner(struct page *newpage)
-{
-	struct page_cgroup *pc;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	pc = lookup_page_cgroup(newpage);
-	VM_BUG_ON(PageCgroupUsed(pc));
-	pc->mem_cgroup = root_mem_cgroup;
-}
-
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
diff --git a/mm/migrate.c b/mm/migrate.c
index df141f60289e..1503b6b54ecb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (!newpage)
 		return -ENOMEM;
 
-	mem_cgroup_reset_owner(newpage);
-
 	if (page_count(page) == 1) {
 		/* page was freed from under us. So we are done. */
 		goto out;
diff --git a/mm/swap.c b/mm/swap.c
index fff1ff7fb9ad..14380e9fbe33 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -652,7 +652,7 @@ EXPORT_SYMBOL(__pagevec_release);
 void lru_add_page_tail(struct zone* zone,
 		       struct page *page, struct page *page_tail)
 {
-	int active;
+	int uninitialized_var(active);
 	enum lru_list lru;
 	const int file = 0;
 
@@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone,
 			active = 0;
 			lru = LRU_INACTIVE_ANON;
 		}
-		update_page_reclaim_stat(zone, page_tail, file, active);
 	} else {
 		SetPageUnevictable(page_tail);
 		lru = LRU_UNEVICTABLE;
@@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone,
 		list_head = page_tail->lru.prev;
 		list_move_tail(&page_tail->lru, list_head);
 	}
+
+	if (!PageUnevictable(page))
+		update_page_reclaim_stat(zone, page_tail, file, active);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
 	SetPageLRU(page);
 	if (active)
 		SetPageActive(page);
-	update_page_reclaim_stat(zone, page, file, active);
 	add_page_to_lru_list(zone, page, lru);
+	update_page_reclaim_stat(zone, page, file, active);
 }
 
 /*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 470038a91873..ea6b32d61873 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		new_page = alloc_page_vma(gfp_mask, vma, addr);
 		if (!new_page)
 			break;		/* Out of memory */
-		/*
-		 * The memcg-specific accounting when moving
-		 * pages around the LRU lists relies on the
-		 * page's owner (memcg) to be valid. Usually,
-		 * pages are assigned to a new owner before
-		 * being put on the LRU list, but since this
-		 * is not the case here, the stale owner from
-		 * a previous allocation cycle must be reset.
-		 */
-		mem_cgroup_reset_owner(new_page);
 	}
 
 	/*