aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorakpm@linux-foundation.org <akpm@linux-foundation.org>2010-05-26 17:42:46 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-27 12:12:44 -0400
commitac39cf8cb86c45eeac6a592ce0d58f9021a97235 (patch)
tree7321cafb0a1f8f2727c86f9d29159751df856c59
parent315c1998e10527ff364a9883048455e609bc7232 (diff)
memcg: fix mis-accounting of file mapped racy with migration
FILE_MAPPED per memcg of migrated file cache is not properly updated, because our hook in page_add_file_rmap() can't know to which memcg FILE_MAPPED should be counted. Basically, this patch is for fixing the bug but includes some big changes to fix up other messes. Now, at migrating mapped file, events happen in following sequence. 1. allocate a new page. 2. get memcg of an old page. 3. charge ageinst a new page before migration. But at this point, no changes to new page's page_cgroup, no commit for the charge. (IOW, PCG_USED bit is not set.) 4. page migration replaces radix-tree, old-page and new-page. 5. page migration remaps the new page if the old page was mapped. 6. Here, the new page is unlocked. 7. memcg commits the charge for newpage, Mark the new page's page_cgroup as PCG_USED. Because "commit" happens after page-remap, we can count FILE_MAPPED at "5", because we should avoid to trust page_cgroup->mem_cgroup. if PCG_USED bit is unset. (Note: memcg's LRU removal code does that but LRU-isolation logic is used for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is not on LRU or page_cgroup->mem_cgroup is NULL.) We can lose file_mapped accounting information at 5 because FILE_MAPPED is updated only when mapcount changes 0->1. So we should catch it. BTW, historically, above implemntation comes from migration-failure of anonymous page. Because we charge both of old page and new page with mapcount=0, we can't catch - the page is really freed before remap. - migration fails but it's freed before remap or .....corner cases. New migration sequence with memcg is: 1. allocate a new page. 2. mark PageCgroupMigration to the old page. 3. charge against a new page onto the old page's memcg. (here, new page's pc is marked as PageCgroupUsed.) 4. page migration replaces radix-tree, page table, etc... 5. At remapping, new page's page_cgroup is now makrked as "USED" We can catch 0->1 event and FILE_MAPPED will be properly updated. And we can catch SWAPOUT event after unlock this and freeing this page by unmap() can be caught. 7. Clear PageCgroupMigration of the old page. So, FILE_MAPPED will be correctly updated. Then, for what MIGRATION flag is ? Without it, at migration failure, we may have to charge old page again because it may be fully unmapped. "charge" means that we have to dive into memory reclaim or something complated. So, it's better to avoid charge it again. Before this patch, __commit_charge() was working for both of the old/new page and fixed up all. But this technique has some racy condtion around FILE_MAPPED and SWAPOUT etc... Now, the kernel use MIGRATION flag and don't uncharge old page until the end of migration. I hope this change will make memcg's page migration much simpler. This page migration has caused several troubles. Worth to add a flag for simplification. Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Reported-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: "Kirill A. Shutemov" <kirill@shutemov.name> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/memcontrol.h6
-rw-r--r--include/linux/page_cgroup.h5
-rw-r--r--mm/memcontrol.c135
-rw-r--r--mm/migrate.c2
4 files changed, 107 insertions, 41 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 05894795fdc1..9411d32840b0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -90,7 +90,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
90extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); 90extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem);
91 91
92extern int 92extern int
93mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); 93mem_cgroup_prepare_migration(struct page *page,
94 struct page *newpage, struct mem_cgroup **ptr);
94extern void mem_cgroup_end_migration(struct mem_cgroup *mem, 95extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
95 struct page *oldpage, struct page *newpage); 96 struct page *oldpage, struct page *newpage);
96 97
@@ -227,7 +228,8 @@ static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
227} 228}
228 229
229static inline int 230static inline int
230mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 231mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
232 struct mem_cgroup **ptr)
231{ 233{
232 return 0; 234 return 0;
233} 235}
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index aef22ae2af47..5bb13b3db84d 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -40,6 +40,7 @@ enum {
40 PCG_USED, /* this object is in use. */ 40 PCG_USED, /* this object is in use. */
41 PCG_ACCT_LRU, /* page has been accounted for */ 41 PCG_ACCT_LRU, /* page has been accounted for */
42 PCG_FILE_MAPPED, /* page is accounted as "mapped" */ 42 PCG_FILE_MAPPED, /* page is accounted as "mapped" */
43 PCG_MIGRATION, /* under page migration */
43}; 44};
44 45
45#define TESTPCGFLAG(uname, lname) \ 46#define TESTPCGFLAG(uname, lname) \
@@ -79,6 +80,10 @@ SETPCGFLAG(FileMapped, FILE_MAPPED)
79CLEARPCGFLAG(FileMapped, FILE_MAPPED) 80CLEARPCGFLAG(FileMapped, FILE_MAPPED)
80TESTPCGFLAG(FileMapped, FILE_MAPPED) 81TESTPCGFLAG(FileMapped, FILE_MAPPED)
81 82
83SETPCGFLAG(Migration, MIGRATION)
84CLEARPCGFLAG(Migration, MIGRATION)
85TESTPCGFLAG(Migration, MIGRATION)
86
82static inline int page_cgroup_nid(struct page_cgroup *pc) 87static inline int page_cgroup_nid(struct page_cgroup *pc)
83{ 88{
84 return page_to_nid(pc->page); 89 return page_to_nid(pc->page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c200e86da4c..df1234c0dac3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2258,7 +2258,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2258 switch (ctype) { 2258 switch (ctype) {
2259 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2259 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2260 case MEM_CGROUP_CHARGE_TYPE_DROP: 2260 case MEM_CGROUP_CHARGE_TYPE_DROP:
2261 if (page_mapped(page)) 2261 /* See mem_cgroup_prepare_migration() */
2262 if (page_mapped(page) || PageCgroupMigration(pc))
2262 goto unlock_out; 2263 goto unlock_out;
2263 break; 2264 break;
2264 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2265 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2481,10 +2482,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2481 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2482 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2482 * page belongs to. 2483 * page belongs to.
2483 */ 2484 */
2484int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) 2485int mem_cgroup_prepare_migration(struct page *page,
2486 struct page *newpage, struct mem_cgroup **ptr)
2485{ 2487{
2486 struct page_cgroup *pc; 2488 struct page_cgroup *pc;
2487 struct mem_cgroup *mem = NULL; 2489 struct mem_cgroup *mem = NULL;
2490 enum charge_type ctype;
2488 int ret = 0; 2491 int ret = 0;
2489 2492
2490 if (mem_cgroup_disabled()) 2493 if (mem_cgroup_disabled())
@@ -2495,69 +2498,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2495 if (PageCgroupUsed(pc)) { 2498 if (PageCgroupUsed(pc)) {
2496 mem = pc->mem_cgroup; 2499 mem = pc->mem_cgroup;
2497 css_get(&mem->css); 2500 css_get(&mem->css);
2501 /*
2502 * At migrating an anonymous page, its mapcount goes down
2503 * to 0 and uncharge() will be called. But, even if it's fully
2504 * unmapped, migration may fail and this page has to be
2505 * charged again. We set MIGRATION flag here and delay uncharge
2506 * until end_migration() is called
2507 *
2508 * Corner Case Thinking
2509 * A)
2510 * When the old page was mapped as Anon and it's unmap-and-freed
2511 * while migration was ongoing.
2512 * If unmap finds the old page, uncharge() of it will be delayed
2513 * until end_migration(). If unmap finds a new page, it's
2514 * uncharged when it make mapcount to be 1->0. If unmap code
2515 * finds swap_migration_entry, the new page will not be mapped
2516 * and end_migration() will find it(mapcount==0).
2517 *
2518 * B)
2519 * When the old page was mapped but migraion fails, the kernel
2520 * remaps it. A charge for it is kept by MIGRATION flag even
2521 * if mapcount goes down to 0. We can do remap successfully
2522 * without charging it again.
2523 *
2524 * C)
2525 * The "old" page is under lock_page() until the end of
2526 * migration, so, the old page itself will not be swapped-out.
2527 * If the new page is swapped out before end_migraton, our
2528 * hook to usual swap-out path will catch the event.
2529 */
2530 if (PageAnon(page))
2531 SetPageCgroupMigration(pc);
2498 } 2532 }
2499 unlock_page_cgroup(pc); 2533 unlock_page_cgroup(pc);
2534 /*
2535 * If the page is not charged at this point,
2536 * we return here.
2537 */
2538 if (!mem)
2539 return 0;
2500 2540
2501 *ptr = mem; 2541 *ptr = mem;
2502 if (mem) { 2542 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2503 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2543 css_put(&mem->css);/* drop extra refcnt */
2504 css_put(&mem->css); 2544 if (ret || *ptr == NULL) {
2545 if (PageAnon(page)) {
2546 lock_page_cgroup(pc);
2547 ClearPageCgroupMigration(pc);
2548 unlock_page_cgroup(pc);
2549 /*
2550 * The old page may be fully unmapped while we kept it.
2551 */
2552 mem_cgroup_uncharge_page(page);
2553 }
2554 return -ENOMEM;
2505 } 2555 }
2556 /*
2557 * We charge new page before it's used/mapped. So, even if unlock_page()
2558 * is called before end_migration, we can catch all events on this new
2559 * page. In the case new page is migrated but not remapped, new page's
2560 * mapcount will be finally 0 and we call uncharge in end_migration().
2561 */
2562 pc = lookup_page_cgroup(newpage);
2563 if (PageAnon(page))
2564 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2565 else if (page_is_file_cache(page))
2566 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2567 else
2568 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2569 __mem_cgroup_commit_charge(mem, pc, ctype);
2506 return ret; 2570 return ret;
2507} 2571}
2508 2572
2509/* remove redundant charge if migration failed*/ 2573/* remove redundant charge if migration failed*/
2510void mem_cgroup_end_migration(struct mem_cgroup *mem, 2574void mem_cgroup_end_migration(struct mem_cgroup *mem,
2511 struct page *oldpage, struct page *newpage) 2575 struct page *oldpage, struct page *newpage)
2512{ 2576{
2513 struct page *target, *unused; 2577 struct page *used, *unused;
2514 struct page_cgroup *pc; 2578 struct page_cgroup *pc;
2515 enum charge_type ctype;
2516 2579
2517 if (!mem) 2580 if (!mem)
2518 return; 2581 return;
2582 /* blocks rmdir() */
2519 cgroup_exclude_rmdir(&mem->css); 2583 cgroup_exclude_rmdir(&mem->css);
2520 /* at migration success, oldpage->mapping is NULL. */ 2584 /* at migration success, oldpage->mapping is NULL. */
2521 if (oldpage->mapping) { 2585 if (oldpage->mapping) {
2522 target = oldpage; 2586 used = oldpage;
2523 unused = NULL; 2587 unused = newpage;
2524 } else { 2588 } else {
2525 target = newpage; 2589 used = newpage;
2526 unused = oldpage; 2590 unused = oldpage;
2527 } 2591 }
2528
2529 if (PageAnon(target))
2530 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
2531 else if (page_is_file_cache(target))
2532 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2533 else
2534 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2535
2536 /* unused page is not on radix-tree now. */
2537 if (unused)
2538 __mem_cgroup_uncharge_common(unused, ctype);
2539
2540 pc = lookup_page_cgroup(target);
2541 /* 2592 /*
2542 * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. 2593 * We disallowed uncharge of pages under migration because mapcount
2543 * So, double-counting is effectively avoided. 2594 * of the page goes down to zero, temporarly.
2595 * Clear the flag and check the page should be charged.
2544 */ 2596 */
2545 __mem_cgroup_commit_charge(mem, pc, ctype); 2597 pc = lookup_page_cgroup(oldpage);
2598 lock_page_cgroup(pc);
2599 ClearPageCgroupMigration(pc);
2600 unlock_page_cgroup(pc);
2546 2601
2602 if (unused != oldpage)
2603 pc = lookup_page_cgroup(unused);
2604 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2605
2606 pc = lookup_page_cgroup(used);
2547 /* 2607 /*
2548 * Both of oldpage and newpage are still under lock_page(). 2608 * If a page is a file cache, radix-tree replacement is very atomic
2549 * Then, we don't have to care about race in radix-tree. 2609 * and we can skip this check. When it was an Anon page, its mapcount
2550 * But we have to be careful that this page is unmapped or not. 2610 * goes down to 0. But because we added MIGRATION flage, it's not
2551 * 2611 * uncharged yet. There are several case but page->mapcount check
2552 * There is a case for !page_mapped(). At the start of 2612 * and USED bit check in mem_cgroup_uncharge_page() will do enough
2553 * migration, oldpage was mapped. But now, it's zapped. 2613 * check. (see prepare_charge() also)
2554 * But we know *target* page is not freed/reused under us.
2555 * mem_cgroup_uncharge_page() does all necessary checks.
2556 */ 2614 */
2557 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2615 if (PageAnon(used))
2558 mem_cgroup_uncharge_page(target); 2616 mem_cgroup_uncharge_page(used);
2559 /* 2617 /*
2560 * At migration, we may charge account against cgroup which has no tasks 2618 * At migration, we may charge account against cgroup which has no
2619 * tasks.
2561 * So, rmdir()->pre_destroy() can be called while we do this charge. 2620 * So, rmdir()->pre_destroy() can be called while we do this charge.
2562 * In that case, we need to call pre_destroy() again. check it here. 2621 * In that case, we need to call pre_destroy() again. check it here.
2563 */ 2622 */
diff --git a/mm/migrate.c b/mm/migrate.c
index 09e2471afa0f..4205b1d6049e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -590,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
590 } 590 }
591 591
592 /* charge against new page */ 592 /* charge against new page */
593 charge = mem_cgroup_prepare_migration(page, &mem); 593 charge = mem_cgroup_prepare_migration(page, newpage, &mem);
594 if (charge == -ENOMEM) { 594 if (charge == -ENOMEM) {
595 rc = -ENOMEM; 595 rc = -ENOMEM;
596 goto unlock; 596 goto unlock;