diff options
author | Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> | 2010-03-10 18:22:17 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-12 18:52:36 -0500 |
commit | 024914477e15ef8b17f271ec47f1bb8a589f0806 (patch) | |
tree | 9a6a8b4224c94fcdd1b8c3127b301ee3537f8cc2 /mm | |
parent | 8033b97c9b5ef063e3f4bf2efe1cd0a22093aaff (diff) |
memcg: move charges of anonymous swap
This patch is another core part of this move-charge-at-task-migration
feature. It enables moving charges of anonymous swaps.
To move the charge of swap, we need to exchange swap_cgroup's record.
In current implementation, swap_cgroup's record is protected by:
- page lock: if the entry is on swap cache.
- swap_lock: if the entry is not on swap cache.
This works well in usual swap-in/out activity.
But this behavior makes the feature of moving swap charges check many
conditions in order to exchange swap_cgroup's record safely.
So I changed modification of swap_cgroup's record (swap_cgroup_record())
to use xchg, and defined a new function to cmpxchg swap_cgroup's record.
This patch also enables moving the charge of non-pte_present but not yet
uncharged swap caches, which can exist on the swap-out path, by getting the
target pages via find_get_page() as do_mincore() does.
[kosaki.motohiro@jp.fujitsu.com: fix ia64 build]
[akpm@linux-foundation.org: fix typos]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memcontrol.c | 183 | ||||
-rw-r--r-- | mm/page_cgroup.c | 34 | ||||
-rw-r--r-- | mm/swapfile.c | 31 |
3 files changed, 210 insertions, 38 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 589084f00b70..e883198baf81 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/rbtree.h> | 33 | #include <linux/rbtree.h> |
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | #include <linux/swap.h> | 35 | #include <linux/swap.h> |
36 | #include <linux/swapops.h> | ||
36 | #include <linux/spinlock.h> | 37 | #include <linux/spinlock.h> |
37 | #include <linux/fs.h> | 38 | #include <linux/fs.h> |
38 | #include <linux/seq_file.h> | 39 | #include <linux/seq_file.h> |
@@ -2270,6 +2271,54 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
2270 | } | 2271 | } |
2271 | rcu_read_unlock(); | 2272 | rcu_read_unlock(); |
2272 | } | 2273 | } |
2274 | |||
2275 | /** | ||
2276 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2277 | * @entry: swap entry to be moved | ||
2278 | * @from: mem_cgroup which the entry is moved from | ||
2279 | * @to: mem_cgroup which the entry is moved to | ||
2280 | * | ||
2281 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2282 | * as the mem_cgroup's id of @from. | ||
2283 | * | ||
2284 | * Returns 0 on success, -EINVAL on failure. | ||
2285 | * | ||
2286 | * The caller must have charged to @to, IOW, called res_counter_charge() about | ||
2287 | * both res and memsw, and called css_get(). | ||
2288 | */ | ||
2289 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2290 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
2291 | { | ||
2292 | unsigned short old_id, new_id; | ||
2293 | |||
2294 | old_id = css_id(&from->css); | ||
2295 | new_id = css_id(&to->css); | ||
2296 | |||
2297 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2298 | if (!mem_cgroup_is_root(from)) | ||
2299 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2300 | mem_cgroup_swap_statistics(from, false); | ||
2301 | mem_cgroup_put(from); | ||
2302 | /* | ||
2303 | * we charged both to->res and to->memsw, so we should uncharge | ||
2304 | * to->res. | ||
2305 | */ | ||
2306 | if (!mem_cgroup_is_root(to)) | ||
2307 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2308 | mem_cgroup_swap_statistics(to, true); | ||
2309 | mem_cgroup_get(to); | ||
2310 | css_put(&to->css); | ||
2311 | |||
2312 | return 0; | ||
2313 | } | ||
2314 | return -EINVAL; | ||
2315 | } | ||
2316 | #else | ||
2317 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2318 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
2319 | { | ||
2320 | return -EINVAL; | ||
2321 | } | ||
2273 | #endif | 2322 | #endif |
2274 | 2323 | ||
2275 | /* | 2324 | /* |
@@ -2949,6 +2998,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | |||
2949 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | 2998 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; |
2950 | } | 2999 | } |
2951 | 3000 | ||
3001 | #ifdef CONFIG_MMU | ||
2952 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | 3002 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, |
2953 | struct cftype *cft, u64 val) | 3003 | struct cftype *cft, u64 val) |
2954 | { | 3004 | { |
@@ -2967,6 +3017,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
2967 | 3017 | ||
2968 | return 0; | 3018 | return 0; |
2969 | } | 3019 | } |
3020 | #else | ||
3021 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3022 | struct cftype *cft, u64 val) | ||
3023 | { | ||
3024 | return -ENOSYS; | ||
3025 | } | ||
3026 | #endif | ||
2970 | 3027 | ||
2971 | 3028 | ||
2972 | /* For read statistics */ | 3029 | /* For read statistics */ |
@@ -3489,6 +3546,7 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3489 | return ret; | 3546 | return ret; |
3490 | } | 3547 | } |
3491 | 3548 | ||
3549 | #ifdef CONFIG_MMU | ||
3492 | /* Handlers for move charge at task migration. */ | 3550 | /* Handlers for move charge at task migration. */ |
3493 | #define PRECHARGE_COUNT_AT_ONCE 256 | 3551 | #define PRECHARGE_COUNT_AT_ONCE 256 |
3494 | static int mem_cgroup_do_precharge(unsigned long count) | 3552 | static int mem_cgroup_do_precharge(unsigned long count) |
@@ -3544,77 +3602,124 @@ one_by_one: | |||
3544 | } | 3602 | } |
3545 | return ret; | 3603 | return ret; |
3546 | } | 3604 | } |
3605 | #else /* !CONFIG_MMU */ | ||
3606 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
3607 | struct cgroup *cgroup, | ||
3608 | struct task_struct *p, | ||
3609 | bool threadgroup) | ||
3610 | { | ||
3611 | return 0; | ||
3612 | } | ||
3613 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
3614 | struct cgroup *cgroup, | ||
3615 | struct task_struct *p, | ||
3616 | bool threadgroup) | ||
3617 | { | ||
3618 | } | ||
3619 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
3620 | struct cgroup *cont, | ||
3621 | struct cgroup *old_cont, | ||
3622 | struct task_struct *p, | ||
3623 | bool threadgroup) | ||
3624 | { | ||
3625 | } | ||
3626 | #endif | ||
3547 | 3627 | ||
3548 | /** | 3628 | /** |
3549 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | 3629 | * is_target_pte_for_mc - check a pte whether it is valid for move charge |
3550 | * @vma: the vma the pte to be checked belongs | 3630 | * @vma: the vma the pte to be checked belongs |
3551 | * @addr: the address corresponding to the pte to be checked | 3631 | * @addr: the address corresponding to the pte to be checked |
3552 | * @ptent: the pte to be checked | 3632 | * @ptent: the pte to be checked |
3553 | * @target: the pointer the target page will be stored(can be NULL) | 3633 | * @target: the pointer the target page or swap ent will be stored(can be NULL) |
3554 | * | 3634 | * |
3555 | * Returns | 3635 | * Returns |
3556 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | 3636 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. |
3557 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | 3637 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for |
3558 | * move charge. if @target is not NULL, the page is stored in target->page | 3638 | * move charge. if @target is not NULL, the page is stored in target->page |
3559 | * with extra refcnt got(Callers should handle it). | 3639 | * with extra refcnt got(Callers should handle it). |
3640 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3641 | * target for charge migration. if @target is not NULL, the entry is stored | ||
3642 | * in target->ent. | ||
3560 | * | 3643 | * |
3561 | * Called with pte lock held. | 3644 | * Called with pte lock held. |
3562 | */ | 3645 | */ |
3563 | /* We add a new member later. */ | ||
3564 | union mc_target { | 3646 | union mc_target { |
3565 | struct page *page; | 3647 | struct page *page; |
3648 | swp_entry_t ent; | ||
3566 | }; | 3649 | }; |
3567 | 3650 | ||
3568 | /* We add a new type later. */ | ||
3569 | enum mc_target_type { | 3651 | enum mc_target_type { |
3570 | MC_TARGET_NONE, /* not used */ | 3652 | MC_TARGET_NONE, /* not used */ |
3571 | MC_TARGET_PAGE, | 3653 | MC_TARGET_PAGE, |
3654 | MC_TARGET_SWAP, | ||
3572 | }; | 3655 | }; |
3573 | 3656 | ||
3574 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 3657 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
3575 | unsigned long addr, pte_t ptent, union mc_target *target) | 3658 | unsigned long addr, pte_t ptent, union mc_target *target) |
3576 | { | 3659 | { |
3577 | struct page *page; | 3660 | struct page *page = NULL; |
3578 | struct page_cgroup *pc; | 3661 | struct page_cgroup *pc; |
3579 | int ret = 0; | 3662 | int ret = 0; |
3663 | swp_entry_t ent = { .val = 0 }; | ||
3664 | int usage_count = 0; | ||
3580 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | 3665 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, |
3581 | &mc.to->move_charge_at_immigrate); | 3666 | &mc.to->move_charge_at_immigrate); |
3582 | 3667 | ||
3583 | if (!pte_present(ptent)) | 3668 | if (!pte_present(ptent)) { |
3584 | return 0; | 3669 | /* TODO: handle swap of shmem/tmpfs */ |
3585 | 3670 | if (pte_none(ptent) || pte_file(ptent)) | |
3586 | page = vm_normal_page(vma, addr, ptent); | 3671 | return 0; |
3587 | if (!page || !page_mapped(page)) | 3672 | else if (is_swap_pte(ptent)) { |
3588 | return 0; | 3673 | ent = pte_to_swp_entry(ptent); |
3589 | /* | 3674 | if (!move_anon || non_swap_entry(ent)) |
3590 | * TODO: We don't move charges of file(including shmem/tmpfs) pages for | 3675 | return 0; |
3591 | * now. | 3676 | usage_count = mem_cgroup_count_swap_user(ent, &page); |
3592 | */ | 3677 | } |
3593 | if (!move_anon || !PageAnon(page)) | 3678 | } else { |
3594 | return 0; | 3679 | page = vm_normal_page(vma, addr, ptent); |
3595 | /* | 3680 | if (!page || !page_mapped(page)) |
3596 | * TODO: We don't move charges of shared(used by multiple processes) | 3681 | return 0; |
3597 | * pages for now. | 3682 | /* |
3598 | */ | 3683 | * TODO: We don't move charges of file(including shmem/tmpfs) |
3599 | if (page_mapcount(page) > 1) | 3684 | * pages for now. |
3600 | return 0; | 3685 | */ |
3601 | if (!get_page_unless_zero(page)) | 3686 | if (!move_anon || !PageAnon(page)) |
3687 | return 0; | ||
3688 | if (!get_page_unless_zero(page)) | ||
3689 | return 0; | ||
3690 | usage_count = page_mapcount(page); | ||
3691 | } | ||
3692 | if (usage_count > 1) { | ||
3693 | /* | ||
3694 | * TODO: We don't move charges of shared(used by multiple | ||
3695 | * processes) pages for now. | ||
3696 | */ | ||
3697 | if (page) | ||
3698 | put_page(page); | ||
3602 | return 0; | 3699 | return 0; |
3603 | 3700 | } | |
3604 | pc = lookup_page_cgroup(page); | 3701 | if (page) { |
3605 | /* | 3702 | pc = lookup_page_cgroup(page); |
3606 | * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account() | 3703 | /* |
3607 | * checks the pc is valid or not under the lock. | 3704 | * Do only loose check w/o page_cgroup lock. |
3608 | */ | 3705 | * mem_cgroup_move_account() checks the pc is valid or not under |
3609 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | 3706 | * the lock. |
3610 | ret = MC_TARGET_PAGE; | 3707 | */ |
3708 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
3709 | ret = MC_TARGET_PAGE; | ||
3710 | if (target) | ||
3711 | target->page = page; | ||
3712 | } | ||
3713 | if (!ret || !target) | ||
3714 | put_page(page); | ||
3715 | } | ||
3716 | /* throught */ | ||
3717 | if (ent.val && do_swap_account && !ret && | ||
3718 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
3719 | ret = MC_TARGET_SWAP; | ||
3611 | if (target) | 3720 | if (target) |
3612 | target->page = page; | 3721 | target->ent = ent; |
3613 | } | 3722 | } |
3614 | |||
3615 | if (!ret || !target) | ||
3616 | put_page(page); | ||
3617 | |||
3618 | return ret; | 3723 | return ret; |
3619 | } | 3724 | } |
3620 | 3725 | ||
@@ -3754,6 +3859,7 @@ retry: | |||
3754 | int type; | 3859 | int type; |
3755 | struct page *page; | 3860 | struct page *page; |
3756 | struct page_cgroup *pc; | 3861 | struct page_cgroup *pc; |
3862 | swp_entry_t ent; | ||
3757 | 3863 | ||
3758 | if (!mc.precharge) | 3864 | if (!mc.precharge) |
3759 | break; | 3865 | break; |
@@ -3775,6 +3881,11 @@ retry: | |||
3775 | put: /* is_target_pte_for_mc() gets the page */ | 3881 | put: /* is_target_pte_for_mc() gets the page */ |
3776 | put_page(page); | 3882 | put_page(page); |
3777 | break; | 3883 | break; |
3884 | case MC_TARGET_SWAP: | ||
3885 | ent = target.ent; | ||
3886 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) | ||
3887 | mc.precharge--; | ||
3888 | break; | ||
3778 | default: | 3889 | default: |
3779 | break; | 3890 | break; |
3780 | } | 3891 | } |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..3dd88539a0e6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -335,6 +335,37 @@ not_enough_page: | |||
335 | } | 335 | } |
336 | 336 | ||
337 | /** | 337 | /** |
338 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
339 | * @ent: swap entry to be cmpxchged | ||
340 | * @old: old id | ||
341 | * @new: new id | ||
342 | * | ||
343 | * Returns old id at success, 0 at failure. | ||
344 | * (There is no mem_cgroup using 0 as its id) | ||
345 | */ | ||
346 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
347 | unsigned short old, unsigned short new) | ||
348 | { | ||
349 | int type = swp_type(ent); | ||
350 | unsigned long offset = swp_offset(ent); | ||
351 | unsigned long idx = offset / SC_PER_PAGE; | ||
352 | unsigned long pos = offset & SC_POS_MASK; | ||
353 | struct swap_cgroup_ctrl *ctrl; | ||
354 | struct page *mappage; | ||
355 | struct swap_cgroup *sc; | ||
356 | |||
357 | ctrl = &swap_cgroup_ctrl[type]; | ||
358 | |||
359 | mappage = ctrl->map[idx]; | ||
360 | sc = page_address(mappage); | ||
361 | sc += pos; | ||
362 | if (cmpxchg(&sc->id, old, new) == old) | ||
363 | return old; | ||
364 | else | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | /** | ||
338 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 369 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
339 | * @ent: swap entry to be recorded into | 370 | * @ent: swap entry to be recorded into |
340 | * @mem: mem_cgroup to be recorded | 371 | * @mem: mem_cgroup to be recorded |
@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
358 | mappage = ctrl->map[idx]; | 389 | mappage = ctrl->map[idx]; |
359 | sc = page_address(mappage); | 390 | sc = page_address(mappage); |
360 | sc += pos; | 391 | sc += pos; |
361 | old = sc->id; | 392 | old = xchg(&sc->id, id); |
362 | sc->id = id; | ||
363 | 393 | ||
364 | return old; | 394 | return old; |
365 | } | 395 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 84374d8cf814..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) | |||
723 | return p != NULL; | 723 | return p != NULL; |
724 | } | 724 | } |
725 | 725 | ||
726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
727 | /** | ||
728 | * mem_cgroup_count_swap_user - count the user of a swap entry | ||
729 | * @ent: the swap entry to be checked | ||
730 | * @pagep: the pointer for the swap cache page of the entry to be stored | ||
731 | * | ||
732 | * Returns the number of the user of the swap entry. The number is valid only | ||
733 | * for swaps of anonymous pages. | ||
734 | * If the entry is found on swap cache, the page is stored to pagep with | ||
735 | * refcount of it being incremented. | ||
736 | */ | ||
737 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
738 | { | ||
739 | struct page *page; | ||
740 | struct swap_info_struct *p; | ||
741 | int count = 0; | ||
742 | |||
743 | page = find_get_page(&swapper_space, ent.val); | ||
744 | if (page) | ||
745 | count += page_mapcount(page); | ||
746 | p = swap_info_get(ent); | ||
747 | if (p) { | ||
748 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
749 | spin_unlock(&swap_lock); | ||
750 | } | ||
751 | |||
752 | *pagep = page; | ||
753 | return count; | ||
754 | } | ||
755 | #endif | ||
756 | |||
726 | #ifdef CONFIG_HIBERNATION | 757 | #ifdef CONFIG_HIBERNATION |
727 | /* | 758 | /* |
728 | * Find the swap type that corresponds to given device (if any). | 759 | * Find the swap type that corresponds to given device (if any). |