path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	294
1 files changed, 284 insertions, 10 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 59ffaf511d77..22f088f22102 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
@@ -243,9 +244,17 @@ struct mem_cgroup {
  * left-shifted bitmap of these types.
  */
 enum move_type {
+	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
 	NR_MOVE_TYPE,
 };
 
+/* "mc" and its members are protected by cgroup_mutex */
+static struct move_charge_struct {
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	unsigned long precharge;
+} mc;
+
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
@@ -1513,7 +1522,7 @@ charged:
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
-	if (mem_cgroup_soft_limit_check(mem))
+	if (page && mem_cgroup_soft_limit_check(mem))
 		mem_cgroup_update_tree(mem, page);
 done:
 	return 0;
@@ -1690,8 +1699,9 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
-	 * this function is just force_empty() and it's garanteed that
-	 * "to" is never removed. So, we don't check rmdir status here.
+	 * this function is just force_empty() and move charge, so it's
+	 * garanteed that "to" is never removed. So, we don't check rmdir
+	 * status here.
 	 */
 }
 
@@ -3428,11 +3438,171 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 }
 
 /* Handlers for move charge at task migration. */
-static int mem_cgroup_can_move_charge(void)
+static int mem_cgroup_do_precharge(void)
 {
+	int ret = -ENOMEM;
+	struct mem_cgroup *mem = mc.to;
+
+	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, NULL);
+	if (ret || !mem)
+		return -ENOMEM;
+
+	mc.precharge++;
+	return ret;
+}
+
+/**
+ * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *
+ * Called with pte lock held.
+ */
+/* We add a new member later. */
+union mc_target {
+	struct page	*page;
+};
+
+/* We add a new type later. */
+enum mc_target_type {
+	MC_TARGET_NONE,	/* not used */
+	MC_TARGET_PAGE,
+};
+
+static int is_target_pte_for_mc(struct vm_area_struct *vma,
+		unsigned long addr, pte_t ptent, union mc_target *target)
+{
+	struct page *page;
+	struct page_cgroup *pc;
+	int ret = 0;
+	bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
+					&mc.to->move_charge_at_immigrate);
+
+	if (!pte_present(ptent))
+		return 0;
+
+	page = vm_normal_page(vma, addr, ptent);
+	if (!page || !page_mapped(page))
+		return 0;
+	/*
+	 * TODO: We don't move charges of file(including shmem/tmpfs) pages for
+	 * now.
+	 */
+	if (!move_anon || !PageAnon(page))
+		return 0;
+	/*
+	 * TODO: We don't move charges of shared(used by multiple processes)
+	 * pages for now.
+	 */
+	if (page_mapcount(page) > 1)
+		return 0;
+	if (!get_page_unless_zero(page))
+		return 0;
+
+	pc = lookup_page_cgroup(page);
+	/*
+	 * Do only loose check w/o page_cgroup lock. mem_cgroup_move_account()
+	 * checks the pc is valid or not under the lock.
+	 */
+	if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+		ret = MC_TARGET_PAGE;
+		if (target)
+			target->page = page;
+	}
+
+	if (!ret || !target)
+		put_page(page);
+
+	return ret;
+}
+
+static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
+					unsigned long addr, unsigned long end,
+					struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->private;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE)
+		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+			mc.precharge++;	/* increment precharge temporarily */
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
 	return 0;
 }
 
+static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
+{
+	unsigned long precharge;
+	struct vm_area_struct *vma;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		struct mm_walk mem_cgroup_count_precharge_walk = {
+			.pmd_entry = mem_cgroup_count_precharge_pte_range,
+			.mm = mm,
+			.private = vma,
+		};
+		if (is_vm_hugetlb_page(vma))
+			continue;
+		/* TODO: We don't move charges of shmem/tmpfs pages for now. */
+		if (vma->vm_flags & VM_SHARED)
+			continue;
+		walk_page_range(vma->vm_start, vma->vm_end,
+					&mem_cgroup_count_precharge_walk);
+	}
+	up_read(&mm->mmap_sem);
+
+	precharge = mc.precharge;
+	mc.precharge = 0;
+
+	return precharge;
+}
+
+#define PRECHARGE_AT_ONCE	256
+static int mem_cgroup_precharge_mc(struct mm_struct *mm)
+{
+	int ret = 0;
+	int count = PRECHARGE_AT_ONCE;
+	unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+	while (!ret && precharge--) {
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		if (!count--) {
+			count = PRECHARGE_AT_ONCE;
+			cond_resched();
+		}
+		ret = mem_cgroup_do_precharge();
+	}
+
+	return ret;
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+	/* we must uncharge all the leftover precharges from mc.to */
+	while (mc.precharge) {
+		mem_cgroup_cancel_charge(mc.to);
+		mc.precharge--;
+	}
+	mc.from = NULL;
+	mc.to = NULL;
+}
+
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 				struct cgroup *cgroup,
 				struct task_struct *p,
@@ -3450,11 +3620,19 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 		mm = get_task_mm(p);
 		if (!mm)
 			return 0;
-
 		/* We move charges only when we move a owner of the mm */
-		if (mm->owner == p)
-			ret = mem_cgroup_can_move_charge();
-
+		if (mm->owner == p) {
+			VM_BUG_ON(mc.from);
+			VM_BUG_ON(mc.to);
+			VM_BUG_ON(mc.precharge);
+			mc.from = from;
+			mc.to = mem;
+			mc.precharge = 0;
+
+			ret = mem_cgroup_precharge_mc(mm);
+			if (ret)
+				mem_cgroup_clear_mc();
+		}
 		mmput(mm);
 	}
 	return ret;
@@ -3465,10 +3643,95 @@ static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
+	mem_cgroup_clear_mc();
 }
 
-static void mem_cgroup_move_charge(void)
+static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				struct mm_walk *walk)
 {
+	int ret = 0;
+	struct vm_area_struct *vma = walk->private;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+retry:
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; addr += PAGE_SIZE) {
+		pte_t ptent = *(pte++);
+		union mc_target target;
+		int type;
+		struct page *page;
+		struct page_cgroup *pc;
+
+		if (!mc.precharge)
+			break;
+
+		type = is_target_pte_for_mc(vma, addr, ptent, &target);
+		switch (type) {
+		case MC_TARGET_PAGE:
+			page = target.page;
+			if (isolate_lru_page(page))
+				goto put;
+			pc = lookup_page_cgroup(page);
+			if (!mem_cgroup_move_account(pc, mc.from, mc.to)) {
+				css_put(&mc.to->css);
+				mc.precharge--;
+			}
+			putback_lru_page(page);
+put:			/* is_target_pte_for_mc() gets the page */
+			put_page(page);
+			break;
+		default:
+			break;
+		}
+	}
+	pte_unmap_unlock(pte - 1, ptl);
+	cond_resched();
+
+	if (addr != end) {
+		/*
+		 * We have consumed all precharges we got in can_attach().
+		 * We try charge one by one, but don't do any additional
+		 * charges to mc.to if we have failed in charge once in attach()
+		 * phase.
+		 */
+		ret = mem_cgroup_do_precharge();
+		if (!ret)
+			goto retry;
+	}
+
+	return ret;
+}
+
+static void mem_cgroup_move_charge(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	lru_add_drain_all();
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		int ret;
+		struct mm_walk mem_cgroup_move_charge_walk = {
+			.pmd_entry = mem_cgroup_move_charge_pte_range,
+			.mm = mm,
+			.private = vma,
+		};
+		if (is_vm_hugetlb_page(vma))
+			continue;
+		/* TODO: We don't move charges of shmem/tmpfs pages for now. */
+		if (vma->vm_flags & VM_SHARED)
+			continue;
+		ret = walk_page_range(vma->vm_start, vma->vm_end,
+						&mem_cgroup_move_charge_walk);
+		if (ret)
+			/*
+			 * means we have consumed all precharges and failed in
+			 * doing additional charge. Just abandon here.
+			 */
+			break;
+	}
+	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -3477,7 +3740,18 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	mem_cgroup_move_charge();
+	struct mm_struct *mm;
+
+	if (!mc.to)
+		/* no need to move charge */
+		return;
+
+	mm = get_task_mm(p);
+	if (mm) {
+		mem_cgroup_move_charge(mm);
+		mmput(mm);
+	}
+	mem_cgroup_clear_mc();
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {
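
For context, here is a minimal userspace sketch of how the move-charge path added by this patch gets exercised: set bit 0 (MOVE_CHARGE_TYPE_ANON, the enum entry added above) in the destination group's memory.move_charge_at_immigrate, then migrate a task into that group so that mem_cgroup_can_attach() precharges and mem_cgroup_move_task() walks the page tables. The memory controller mount point and the "dst" group name below are assumptions for illustration, not part of the patch.

/*
 * Sketch: enable anon move-charge on a destination memcg and migrate a task
 * into it. Paths under /sys/fs/cgroup/memory are assumed; adjust to where
 * the v1 memory controller is mounted on your system.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		exit(1);
	}
	close(fd);
}

int main(int argc, char **argv)
{
	const char *base = "/sys/fs/cgroup/memory/dst";	/* assumed group */
	char buf[128];

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	/* bit 0 == MOVE_CHARGE_TYPE_ANON: move private anonymous pages */
	snprintf(buf, sizeof(buf), "%s/memory.move_charge_at_immigrate", base);
	write_str(buf, "1");

	/* attaching the mm owner triggers can_attach()/move_task() above */
	snprintf(buf, sizeof(buf), "%s/tasks", base);
	write_str(buf, argv[1]);
	return 0;
}

Note that, per the checks in the diff, charges only move when the migrated task is the owner of the mm, and only private anonymous pages mapped once are considered; shared, file-backed, and hugetlb pages are skipped.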