author     Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>    2010-03-10 18:22:14 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>     2010-03-12 18:52:36 -0500
commit     4ffef5feff4e4240e767d2f1144b1634a41762e3 (patch)
tree       14793120e5809008c2587d89162d8d57130d6fc8 /mm
parent     7dc74be032bfcaa2f9d9e4296ff5bbddfa9e2f19 (diff)
memcg: move charges of anonymous page
This patch is the core part of the move-charge-at-task-migration feature.
It implements the functions that move charges of anonymous pages mapped only
by the target task.

Implementation:
- Define struct move_charge_struct and a variable of it (mc) to remember the
  count of pre-charges and other information.
- At can_attach(), count the anonymous pages of the target mm that can be
  moved (by walking its page tables), call __mem_cgroup_try_charge()
  repeatedly, and count up mc.precharge.
- At attach(), walk the page tables again, find the target pages to be moved,
  and call mem_cgroup_move_account() on each of them.
- Cancel all remaining precharges (mc.precharge > 0) on failure or at the end
  of the task move.

A condensed sketch of this precharge/move/cancel flow follows below.
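For orientation only, here is a small stand-alone toy model of that flow. It is
not kernel code and every name in it is invented for the illustration: the real
"from" and "to" are struct mem_cgroup pointers, and the real work is done by
__mem_cgroup_try_charge(), mem_cgroup_move_account() and
mem_cgroup_cancel_charge() in the diff further down.

    /*
     * Toy user-space model of the precharge/move/cancel bookkeeping.
     * The long counters stand in for per-cgroup charge counts.
     */
    #include <stdio.h>

    static struct move_charge_model {
            long from;       /* pages currently charged to the source group */
            long to;         /* pages currently charged to the destination */
            long precharge;  /* reserved-but-not-yet-consumed charges */
    } mc;

    /* can_attach() phase: reserve one charge on the destination per page */
    static void precharge(long movable_pages)
    {
            while (movable_pages--) {
                    mc.to++;        /* models __mem_cgroup_try_charge(mc.to) */
                    mc.precharge++;
            }
    }

    /* attach() phase: move accounting page by page, consuming one precharge each */
    static void move(long pages_moved)
    {
            while (pages_moved-- && mc.precharge) {
                    mc.from--;      /* models mem_cgroup_move_account(): source uncharged */
                    mc.precharge--; /* destination keeps the charge reserved above */
            }
    }

    /* mem_cgroup_clear_mc(): cancel whatever was reserved but never consumed */
    static void clear_mc(void)
    {
            while (mc.precharge) {
                    mc.to--;        /* models mem_cgroup_cancel_charge(mc.to) */
                    mc.precharge--;
            }
    }

    int main(void)
    {
            mc.from = 10;   /* ten anonymous pages charged to the source group */
            precharge(10);  /* can_attach(): reserve ten charges on the destination */
            move(7);        /* attach(): only seven pages turned out to be movable */
            clear_mc();     /* hand back the three unused precharges */
            printf("from=%ld to=%ld precharge=%ld\n", mc.from, mc.to, mc.precharge);
            return 0;
    }

Running the toy prints from=3 to=7 precharge=0: the seven moved pages keep the
charge that was precharged on the destination, and the leftover precharges are
cancelled, which is exactly the invariant mem_cgroup_clear_mc() restores.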
[akpm@linux-foundation.org: a little simplification]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--    mm/memcontrol.c    294
1 files changed, 284 insertions, 10 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59ffaf511d77..22f088f22102 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
@@ -243,9 +244,17 @@ struct mem_cgroup {
  * left-shifted bitmap of these types.
  */
 enum move_type {
+        MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
         NR_MOVE_TYPE,
 };
 
+/* "mc" and its members are protected by cgroup_mutex */
+static struct move_charge_struct {
+        struct mem_cgroup *from;
+        struct mem_cgroup *to;
+        unsigned long precharge;
+} mc;
+
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
@@ -1513,7 +1522,7 @@ charged:
          * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
          * if they exceeds softlimit.
          */
-        if (mem_cgroup_soft_limit_check(mem))
+        if (page && mem_cgroup_soft_limit_check(mem))
                 mem_cgroup_update_tree(mem, page);
 done:
         return 0;
@@ -1690,8 +1699,9 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
         /*
          * We charges against "to" which may not have any tasks. Then, "to"
          * can be under rmdir(). But in current implementation, caller of
-         * this function is just force_empty() and it's garanteed that
-         * "to" is never removed. So, we don't check rmdir status here.
+         * this function is just force_empty() and move charge, so it's
+         * garanteed that "to" is never removed. So, we don't check rmdir
+         * status here.
          */
 }
 
@@ -3428,11 +3438,171 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 }
 
 /* Handlers for move charge at task migration. */
-static int mem_cgroup_can_move_charge(void)
+static int mem_cgroup_do_precharge(void)
 {
+        int ret = -ENOMEM;
+        struct mem_cgroup *mem = mc.to;
+
+        ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL);
+        if (ret || !mem)
+                return -ENOMEM;
+
+        mc.precharge++;
+        return ret;
+}
+
+/**
+ * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *
+ * Called with pte lock held.
+ */
+/* We add a new member later. */
+union mc_target {
+        struct page *page;
+};
+
+/* We add a new type later. */
+enum mc_target_type {
+        MC_TARGET_NONE, /* not used */
+        MC_TARGET_PAGE,
+};
+
+static int is_target_pte_for_mc(struct vm_area_struct *vma,
+                unsigned long addr, pte_t ptent, union mc_target *target)
+{
+        struct page *page;
+        struct page_cgroup *pc;
+        int ret = 0;
+        bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
+                                        &mc.to->move_charge_at_immigrate);
+
+        if (!pte_present(ptent))
+                return 0;
+
+        page = vm_normal_page(vma, addr, ptent);
+        if (!page || !page_mapped(page))
+                return 0;
+        /*
+         * TODO: We don't move charges of file(including shmem/tmpfs) pages for
+         * now.
+         */
+        if (!move_anon || !PageAnon(page))
+                return 0;
+        /*
+         * TODO: We don't move charges of shared(used by multiple processes)
+         * pages for now.
+         */
+        if (page_mapcount(page) > 1)
+                return 0;
+        if (!get_page_unless_zero(page))
+                return 0;
+
+        pc = lookup_page_cgroup(page);
+        /*
+         * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account()
+         * checks the pc is valid or not under the lock.
+         */
+        if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+                ret = MC_TARGET_PAGE;
+                if (target)
+                        target->page = page;
+        }
+
+        if (!ret || !target)
+                put_page(page);
+
+        return ret;
+}
+
+static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+                                unsigned long addr, unsigned long end,
+                                struct mm_walk *walk)
+{
+        struct vm_area_struct *vma = walk->private;
+        pte_t *pte;
+        spinlock_t *ptl;
+
+        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+        for (; addr != end; pte++, addr += PAGE_SIZE)
+                if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+                        mc.precharge++; /* increment precharge temporarily */
+        pte_unmap_unlock(pte - 1, ptl);
+        cond_resched();
+
         return 0;
 }
 
+static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
+{
+        unsigned long precharge;
+        struct vm_area_struct *vma;
+
+        down_read(&mm->mmap_sem);
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                struct mm_walk mem_cgroup_count_precharge_walk = {
+                        .pmd_entry = mem_cgroup_count_precharge_pte_range,
+                        .mm = mm,
+                        .private = vma,
+                };
+                if (is_vm_hugetlb_page(vma))
+                        continue;
+                /* TODO: We don't move charges of shmem/tmpfs pages for now. */
+                if (vma->vm_flags & VM_SHARED)
+                        continue;
+                walk_page_range(vma->vm_start, vma->vm_end,
+                                        &mem_cgroup_count_precharge_walk);
+        }
+        up_read(&mm->mmap_sem);
+
+        precharge = mc.precharge;
+        mc.precharge = 0;
+
+        return precharge;
+}
+
+#define PRECHARGE_AT_ONCE       256
+static int mem_cgroup_precharge_mc(struct mm_struct *mm)
+{
+        int ret = 0;
+        int count = PRECHARGE_AT_ONCE;
+        unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+        while (!ret && precharge--) {
+                if (signal_pending(current)) {
+                        ret = -EINTR;
+                        break;
+                }
+                if (!count--) {
+                        count = PRECHARGE_AT_ONCE;
+                        cond_resched();
+                }
+                ret = mem_cgroup_do_precharge();
+        }
+
+        return ret;
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+        /* we must uncharge all the leftover precharges from mc.to */
+        while (mc.precharge) {
+                mem_cgroup_cancel_charge(mc.to);
+                mc.precharge--;
+        }
+        mc.from = NULL;
+        mc.to = NULL;
+}
+
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                                 struct cgroup *cgroup,
                                 struct task_struct *p,
@@ -3450,11 +3620,19 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                 mm = get_task_mm(p);
                 if (!mm)
                         return 0;
-
                 /* We move charges only when we move a owner of the mm */
-                if (mm->owner == p)
-                        ret = mem_cgroup_can_move_charge();
-
+                if (mm->owner == p) {
+                        VM_BUG_ON(mc.from);
+                        VM_BUG_ON(mc.to);
+                        VM_BUG_ON(mc.precharge);
+                        mc.from = from;
+                        mc.to = mem;
+                        mc.precharge = 0;
+
+                        ret = mem_cgroup_precharge_mc(mm);
+                        if (ret)
+                                mem_cgroup_clear_mc();
+                }
                 mmput(mm);
         }
         return ret;
@@ -3465,10 +3643,95 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
                                 struct task_struct *p,
                                 bool threadgroup)
 {
+        mem_cgroup_clear_mc();
 }
 
-static void mem_cgroup_move_charge(void)
+static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+                                unsigned long addr, unsigned long end,
+                                struct mm_walk *walk)
 {
+        int ret = 0;
+        struct vm_area_struct *vma = walk->private;
+        pte_t *pte;
+        spinlock_t *ptl;
+
+retry:
+        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+        for (; addr != end; addr += PAGE_SIZE) {
+                pte_t ptent = *(pte++);
+                union mc_target target;
+                int type;
+                struct page *page;
+                struct page_cgroup *pc;
+
+                if (!mc.precharge)
+                        break;
+
+                type = is_target_pte_for_mc(vma, addr, ptent, &target);
+                switch (type) {
+                case MC_TARGET_PAGE:
+                        page = target.page;
+                        if (isolate_lru_page(page))
+                                goto put;
+                        pc = lookup_page_cgroup(page);
+                        if (!mem_cgroup_move_account(pc, mc.from, mc.to)) {
+                                css_put(&mc.to->css);
+                                mc.precharge--;
+                        }
+                        putback_lru_page(page);
+put:                    /* is_target_pte_for_mc() gets the page */
+                        put_page(page);
+                        break;
+                default:
+                        break;
+                }
+        }
+        pte_unmap_unlock(pte - 1, ptl);
+        cond_resched();
+
+        if (addr != end) {
+                /*
+                 * We have consumed all precharges we got in can_attach().
+                 * We try charge one by one, but don't do any additional
+                 * charges to mc.to if we have failed in charge once in attach()
+                 * phase.
+                 */
+                ret = mem_cgroup_do_precharge();
+                if (!ret)
+                        goto retry;
+        }
+
+        return ret;
+}
+
+static void mem_cgroup_move_charge(struct mm_struct *mm)
+{
+        struct vm_area_struct *vma;
+
+        lru_add_drain_all();
+        down_read(&mm->mmap_sem);
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                int ret;
+                struct mm_walk mem_cgroup_move_charge_walk = {
+                        .pmd_entry = mem_cgroup_move_charge_pte_range,
+                        .mm = mm,
+                        .private = vma,
+                };
+                if (is_vm_hugetlb_page(vma))
+                        continue;
+                /* TODO: We don't move charges of shmem/tmpfs pages for now. */
+                if (vma->vm_flags & VM_SHARED)
+                        continue;
+                ret = walk_page_range(vma->vm_start, vma->vm_end,
+                                                &mem_cgroup_move_charge_walk);
+                if (ret)
+                        /*
+                         * means we have consumed all precharges and failed in
+                         * doing additional charge. Just abandon here.
+                         */
+                        break;
+        }
+        up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -3477,7 +3740,18 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                 struct task_struct *p,
                                 bool threadgroup)
 {
-        mem_cgroup_move_charge();
+        struct mm_struct *mm;
+
+        if (!mc.to)
+                /* no need to move charge */
+                return;
+
+        mm = get_task_mm(p);
+        if (mm) {
+                mem_cgroup_move_charge(mm);
+                mmput(mm);
+        }
+        mem_cgroup_clear_mc();
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {