aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaisuke Nishimura <nishimura@mxp.nes.nec.co.jp>2010-03-10 18:22:17 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-12 18:52:36 -0500
commit024914477e15ef8b17f271ec47f1bb8a589f0806 (patch)
tree9a6a8b4224c94fcdd1b8c3127b301ee3537f8cc2
parent8033b97c9b5ef063e3f4bf2efe1cd0a22093aaff (diff)
memcg: move charges of anonymous swap
This patch is another core part of this move-charge-at-task-migration feature. It enables moving charges of anonymous swaps. To move the charge of swap, we need to exchange swap_cgroup's record. In current implementation, swap_cgroup's record is protected by: - page lock: if the entry is on swap cache. - swap_lock: if the entry is not on swap cache. This works well in usual swap-in/out activity. But this behavior makes the feature of moving swap charge check many conditions to exchange swap_cgroup's record safely. So I changed modification of swap_cgroup's record (swap_cgroup_record()) to use xchg, and defined a new function to cmpxchg swap_cgroup's record. This patch also enables moving charge of non pte_present but not uncharged swap caches, which can exist on the swap-out path, by getting the target pages via find_get_page() as do_mincore() does. [kosaki.motohiro@jp.fujitsu.com: fix ia64 build] [akpm@linux-foundation.org: fix typos] Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/cgroups/memory.txt2
-rw-r--r--include/linux/page_cgroup.h2
-rw-r--r--include/linux/swap.h9
-rw-r--r--mm/memcontrol.c183
-rw-r--r--mm/page_cgroup.c34
-rw-r--r--mm/swapfile.c31
6 files changed, 223 insertions, 38 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index e726fb0df719..1f59a1a38bd9 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -420,6 +420,8 @@ NOTE2: It is recommended to set the soft limit always below the hard limit,
420 420
421Users can move charges associated with a task along with task migration, that 421Users can move charges associated with a task along with task migration, that
422is, uncharge task's pages from the old cgroup and charge them to the new cgroup. 422is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
423This feature is not supported in !CONFIG_MMU environments because of lack of
424page tables.
423 425
4248.1 Interface 4268.1 Interface
425 427
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index b0e4eb126236..30b08136fdf3 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -118,6 +118,8 @@ static inline void __init page_cgroup_init_flatmem(void)
118#include <linux/swap.h> 118#include <linux/swap.h>
119 119
120#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 120#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
121extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
122 unsigned short old, unsigned short new);
121extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); 123extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
122extern unsigned short lookup_swap_cgroup(swp_entry_t ent); 124extern unsigned short lookup_swap_cgroup(swp_entry_t ent);
123extern int swap_cgroup_swapon(int type, unsigned long max_pages); 125extern int swap_cgroup_swapon(int type, unsigned long max_pages);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a2602a8207a6..1f59d9340c4d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -355,6 +355,7 @@ static inline void disable_swap_token(void)
355#ifdef CONFIG_CGROUP_MEM_RES_CTLR 355#ifdef CONFIG_CGROUP_MEM_RES_CTLR
356extern void 356extern void
357mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); 357mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
358extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
358#else 359#else
359static inline void 360static inline void
360mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 361mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
@@ -485,6 +486,14 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
485{ 486{
486} 487}
487 488
489#ifdef CONFIG_CGROUP_MEM_RES_CTLR
490static inline int
491mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
492{
493 return 0;
494}
495#endif
496
488#endif /* CONFIG_SWAP */ 497#endif /* CONFIG_SWAP */
489#endif /* __KERNEL__*/ 498#endif /* __KERNEL__*/
490#endif /* _LINUX_SWAP_H */ 499#endif /* _LINUX_SWAP_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 589084f00b70..e883198baf81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,6 +33,7 @@
33#include <linux/rbtree.h> 33#include <linux/rbtree.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/swap.h> 35#include <linux/swap.h>
36#include <linux/swapops.h>
36#include <linux/spinlock.h> 37#include <linux/spinlock.h>
37#include <linux/fs.h> 38#include <linux/fs.h>
38#include <linux/seq_file.h> 39#include <linux/seq_file.h>
@@ -2270,6 +2271,54 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
2270 } 2271 }
2271 rcu_read_unlock(); 2272 rcu_read_unlock();
2272} 2273}
2274
2275/**
2276 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2277 * @entry: swap entry to be moved
2278 * @from: mem_cgroup which the entry is moved from
2279 * @to: mem_cgroup which the entry is moved to
2280 *
2281 * It succeeds only when the swap_cgroup's record for this entry is the same
2282 * as the mem_cgroup's id of @from.
2283 *
2284 * Returns 0 on success, -EINVAL on failure.
2285 *
2286 * The caller must have charged to @to, IOW, called res_counter_charge() about
2287 * both res and memsw, and called css_get().
2288 */
2289static int mem_cgroup_move_swap_account(swp_entry_t entry,
2290 struct mem_cgroup *from, struct mem_cgroup *to)
2291{
2292 unsigned short old_id, new_id;
2293
2294 old_id = css_id(&from->css);
2295 new_id = css_id(&to->css);
2296
2297 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2298 if (!mem_cgroup_is_root(from))
2299 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2300 mem_cgroup_swap_statistics(from, false);
2301 mem_cgroup_put(from);
2302 /*
2303 * we charged both to->res and to->memsw, so we should uncharge
2304 * to->res.
2305 */
2306 if (!mem_cgroup_is_root(to))
2307 res_counter_uncharge(&to->res, PAGE_SIZE);
2308 mem_cgroup_swap_statistics(to, true);
2309 mem_cgroup_get(to);
2310 css_put(&to->css);
2311
2312 return 0;
2313 }
2314 return -EINVAL;
2315}
2316#else
2317static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2318 struct mem_cgroup *from, struct mem_cgroup *to)
2319{
2320 return -EINVAL;
2321}
2273#endif 2322#endif
2274 2323
2275/* 2324/*
@@ -2949,6 +2998,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
2949 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 2998 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
2950} 2999}
2951 3000
3001#ifdef CONFIG_MMU
2952static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3002static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
2953 struct cftype *cft, u64 val) 3003 struct cftype *cft, u64 val)
2954{ 3004{
@@ -2967,6 +3017,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
2967 3017
2968 return 0; 3018 return 0;
2969} 3019}
3020#else
3021static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3022 struct cftype *cft, u64 val)
3023{
3024 return -ENOSYS;
3025}
3026#endif
2970 3027
2971 3028
2972/* For read statistics */ 3029/* For read statistics */
@@ -3489,6 +3546,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
3489 return ret; 3546 return ret;
3490} 3547}
3491 3548
3549#ifdef CONFIG_MMU
3492/* Handlers for move charge at task migration. */ 3550/* Handlers for move charge at task migration. */
3493#define PRECHARGE_COUNT_AT_ONCE 256 3551#define PRECHARGE_COUNT_AT_ONCE 256
3494static int mem_cgroup_do_precharge(unsigned long count) 3552static int mem_cgroup_do_precharge(unsigned long count)
@@ -3544,77 +3602,124 @@ one_by_one:
3544 } 3602 }
3545 return ret; 3603 return ret;
3546} 3604}
3605#else /* !CONFIG_MMU */
3606static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
3607 struct cgroup *cgroup,
3608 struct task_struct *p,
3609 bool threadgroup)
3610{
3611 return 0;
3612}
3613static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
3614 struct cgroup *cgroup,
3615 struct task_struct *p,
3616 bool threadgroup)
3617{
3618}
3619static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3620 struct cgroup *cont,
3621 struct cgroup *old_cont,
3622 struct task_struct *p,
3623 bool threadgroup)
3624{
3625}
3626#endif
3547 3627
3548/** 3628/**
3549 * is_target_pte_for_mc - check a pte whether it is valid for move charge 3629 * is_target_pte_for_mc - check a pte whether it is valid for move charge
3550 * @vma: the vma the pte to be checked belongs 3630 * @vma: the vma the pte to be checked belongs
3551 * @addr: the address corresponding to the pte to be checked 3631 * @addr: the address corresponding to the pte to be checked
3552 * @ptent: the pte to be checked 3632 * @ptent: the pte to be checked
3553 * @target: the pointer the target page will be stored(can be NULL) 3633 * @target: the pointer the target page or swap ent will be stored(can be NULL)
3554 * 3634 *
3555 * Returns 3635 * Returns
3556 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 3636 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
3557 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 3637 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
3558 * move charge. if @target is not NULL, the page is stored in target->page 3638 * move charge. if @target is not NULL, the page is stored in target->page
3559 * with extra refcnt got(Callers should handle it). 3639 * with extra refcnt got(Callers should handle it).
3640 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
3641 * target for charge migration. if @target is not NULL, the entry is stored
3642 * in target->ent.
3560 * 3643 *
3561 * Called with pte lock held. 3644 * Called with pte lock held.
3562 */ 3645 */
3563/* We add a new member later. */
3564union mc_target { 3646union mc_target {
3565 struct page *page; 3647 struct page *page;
3648 swp_entry_t ent;
3566}; 3649};
3567 3650
3568/* We add a new type later. */
3569enum mc_target_type { 3651enum mc_target_type {
3570 MC_TARGET_NONE, /* not used */ 3652 MC_TARGET_NONE, /* not used */
3571 MC_TARGET_PAGE, 3653 MC_TARGET_PAGE,
3654 MC_TARGET_SWAP,
3572}; 3655};
3573 3656
3574static int is_target_pte_for_mc(struct vm_area_struct *vma, 3657static int is_target_pte_for_mc(struct vm_area_struct *vma,
3575 unsigned long addr, pte_t ptent, union mc_target *target) 3658 unsigned long addr, pte_t ptent, union mc_target *target)
3576{ 3659{
3577 struct page *page; 3660 struct page *page = NULL;
3578 struct page_cgroup *pc; 3661 struct page_cgroup *pc;
3579 int ret = 0; 3662 int ret = 0;
3663 swp_entry_t ent = { .val = 0 };
3664 int usage_count = 0;
3580 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, 3665 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3581 &mc.to->move_charge_at_immigrate); 3666 &mc.to->move_charge_at_immigrate);
3582 3667
3583 if (!pte_present(ptent)) 3668 if (!pte_present(ptent)) {
3584 return 0; 3669 /* TODO: handle swap of shmem/tmpfs */
3585 3670 if (pte_none(ptent) || pte_file(ptent))
3586 page = vm_normal_page(vma, addr, ptent); 3671 return 0;
3587 if (!page || !page_mapped(page)) 3672 else if (is_swap_pte(ptent)) {
3588 return 0; 3673 ent = pte_to_swp_entry(ptent);
3589 /* 3674 if (!move_anon || non_swap_entry(ent))
3590 * TODO: We don't move charges of file(including shmem/tmpfs) pages for 3675 return 0;
3591 * now. 3676 usage_count = mem_cgroup_count_swap_user(ent, &page);
3592 */ 3677 }
3593 if (!move_anon || !PageAnon(page)) 3678 } else {
3594 return 0; 3679 page = vm_normal_page(vma, addr, ptent);
3595 /* 3680 if (!page || !page_mapped(page))
3596 * TODO: We don't move charges of shared(used by multiple processes) 3681 return 0;
3597 * pages for now. 3682 /*
3598 */ 3683 * TODO: We don't move charges of file(including shmem/tmpfs)
3599 if (page_mapcount(page) > 1) 3684 * pages for now.
3600 return 0; 3685 */
3601 if (!get_page_unless_zero(page)) 3686 if (!move_anon || !PageAnon(page))
3687 return 0;
3688 if (!get_page_unless_zero(page))
3689 return 0;
3690 usage_count = page_mapcount(page);
3691 }
3692 if (usage_count > 1) {
3693 /*
3694 * TODO: We don't move charges of shared(used by multiple
3695 * processes) pages for now.
3696 */
3697 if (page)
3698 put_page(page);
3602 return 0; 3699 return 0;
3603 3700 }
3604 pc = lookup_page_cgroup(page); 3701 if (page) {
3605 /* 3702 pc = lookup_page_cgroup(page);
3606 * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account() 3703 /*
3607 * checks the pc is valid or not under the lock. 3704 * Do only loose check w/o page_cgroup lock.
3608 */ 3705 * mem_cgroup_move_account() checks the pc is valid or not under
3609 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 3706 * the lock.
3610 ret = MC_TARGET_PAGE; 3707 */
3708 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
3709 ret = MC_TARGET_PAGE;
3710 if (target)
3711 target->page = page;
3712 }
3713 if (!ret || !target)
3714 put_page(page);
3715 }
3716 /* fall through */
3717 if (ent.val && do_swap_account && !ret &&
3718 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
3719 ret = MC_TARGET_SWAP;
3611 if (target) 3720 if (target)
3612 target->page = page; 3721 target->ent = ent;
3613 } 3722 }
3614
3615 if (!ret || !target)
3616 put_page(page);
3617
3618 return ret; 3723 return ret;
3619} 3724}
3620 3725
@@ -3754,6 +3859,7 @@ retry:
3754 int type; 3859 int type;
3755 struct page *page; 3860 struct page *page;
3756 struct page_cgroup *pc; 3861 struct page_cgroup *pc;
3862 swp_entry_t ent;
3757 3863
3758 if (!mc.precharge) 3864 if (!mc.precharge)
3759 break; 3865 break;
@@ -3775,6 +3881,11 @@ retry:
3775put: /* is_target_pte_for_mc() gets the page */ 3881put: /* is_target_pte_for_mc() gets the page */
3776 put_page(page); 3882 put_page(page);
3777 break; 3883 break;
3884 case MC_TARGET_SWAP:
3885 ent = target.ent;
3886 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to))
3887 mc.precharge--;
3888 break;
3778 default: 3889 default:
3779 break; 3890 break;
3780 } 3891 }
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d594826..3dd88539a0e6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -335,6 +335,37 @@ not_enough_page:
335} 335}
336 336
337/** 337/**
338 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
339 * @ent: swap entry to be cmpxchged
340 * @old: old id
341 * @new: new id
342 *
343 * Returns old id at success, 0 at failure.
344 * (There is no mem_cgroup using 0 as its id)
345 */
346unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
347 unsigned short old, unsigned short new)
348{
349 int type = swp_type(ent);
350 unsigned long offset = swp_offset(ent);
351 unsigned long idx = offset / SC_PER_PAGE;
352 unsigned long pos = offset & SC_POS_MASK;
353 struct swap_cgroup_ctrl *ctrl;
354 struct page *mappage;
355 struct swap_cgroup *sc;
356
357 ctrl = &swap_cgroup_ctrl[type];
358
359 mappage = ctrl->map[idx];
360 sc = page_address(mappage);
361 sc += pos;
362 if (cmpxchg(&sc->id, old, new) == old)
363 return old;
364 else
365 return 0;
366}
367
368/**
338 * swap_cgroup_record - record mem_cgroup for this swp_entry. 369 * swap_cgroup_record - record mem_cgroup for this swp_entry.
339 * @ent: swap entry to be recorded into 370 * @ent: swap entry to be recorded into
340 * @mem: mem_cgroup to be recorded 371 * @mem: mem_cgroup to be recorded
@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
358 mappage = ctrl->map[idx]; 389 mappage = ctrl->map[idx];
359 sc = page_address(mappage); 390 sc = page_address(mappage);
360 sc += pos; 391 sc += pos;
361 old = sc->id; 392 old = xchg(&sc->id, id);
362 sc->id = id;
363 393
364 return old; 394 return old;
365} 395}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 84374d8cf814..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
723 return p != NULL; 723 return p != NULL;
724} 724}
725 725
726#ifdef CONFIG_CGROUP_MEM_RES_CTLR
727/**
728 * mem_cgroup_count_swap_user - count the user of a swap entry
729 * @ent: the swap entry to be checked
730 * @pagep: the pointer for the swap cache page of the entry to be stored
731 *
732 * Returns the number of the user of the swap entry. The number is valid only
733 * for swaps of anonymous pages.
734 * If the entry is found on swap cache, the page is stored to pagep with
735 * refcount of it being incremented.
736 */
737int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
738{
739 struct page *page;
740 struct swap_info_struct *p;
741 int count = 0;
742
743 page = find_get_page(&swapper_space, ent.val);
744 if (page)
745 count += page_mapcount(page);
746 p = swap_info_get(ent);
747 if (p) {
748 count += swap_count(p->swap_map[swp_offset(ent)]);
749 spin_unlock(&swap_lock);
750 }
751
752 *pagep = page;
753 return count;
754}
755#endif
756
726#ifdef CONFIG_HIBERNATION 757#ifdef CONFIG_HIBERNATION
727/* 758/*
728 * Find the swap type that corresponds to given device (if any). 759 * Find the swap type that corresponds to given device (if any).