Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	63
1 file changed, 36 insertions(+), 27 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..a9a534a38ac0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -269,13 +269,14 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t lock; /* for from, to, moving_task */
+	spinlock_t lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
+	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -1646,6 +1647,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
@@ -1729,19 +1731,18 @@ again:
 
 	rcu_read_lock();
 	p = rcu_dereference(mm->owner);
-	VM_BUG_ON(!p);
 	/*
-	 * because we don't have task_lock(), "p" can exit while
-	 * we're here. In that case, "mem" can point to root
-	 * cgroup but never be NULL. (and task_struct itself is freed
-	 * by RCU, cgroup itself is RCU safe.) Then, we have small
-	 * risk here to get wrong cgroup. But such kind of mis-account
-	 * by race always happens because we don't have cgroup_mutex().
-	 * It's overkill and we allow that small race, here.
+	 * Because we don't have task_lock(), "p" can exit.
+	 * In that case, "mem" can point to root, or p can be NULL due
+	 * to a race with swapoff. Then we have a small risk of
+	 * mis-accounting. But such mis-accounting by race can always
+	 * happen because we don't take cgroup_mutex(); taking it would
+	 * be overkill, so we allow that small race here.
+	 * (*) swapoff et al. charge against the mm-struct, not against
+	 * the task-struct, so mm->owner can be NULL.
 	 */
 	mem = mem_cgroup_from_task(p);
-	VM_BUG_ON(!mem);
-	if (mem_cgroup_is_root(mem)) {
+	if (!mem || mem_cgroup_is_root(mem)) {
 		rcu_read_unlock();
 		goto done;
 	}
@@ -4445,7 +4446,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
+	/* We already hold the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4457,7 +4458,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
-	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4508,11 +4508,16 @@ static void mem_cgroup_clear_mc(void)
 
 		mc.moved_swap = 0;
 	}
+	if (mc.mm) {
+		up_read(&mc.mm->mmap_sem);
+		mmput(mc.mm);
+	}
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mc.moving_task = NULL;
+	mc.mm = NULL;
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4537,26 +4542,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
+			/*
+			 * We do all the move charge work under one mmap_sem to
+			 * avoid deadlock with down_write(&mmap_sem)
+			 * -> try_charge() -> if (mc.moving_task) -> sleep.
+			 */
+			down_read(&mm->mmap_sem);
+
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			VM_BUG_ON(mc.mm);
+
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			mc.moving_task = current;
+			mc.mm = mm;
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+			/* We call up_read() and mmput() in clear_mc(). */
+		} else
+			mmput(mm);
 	}
 	return ret;
 }
@@ -4644,7 +4660,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+	/* We already hold the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4663,7 +4679,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 			 */
 			break;
 	}
-	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4672,17 +4687,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	struct mm_struct *mm;
-
-	if (!mc.to)
+	if (!mc.mm)
 		/* no need to move charge */
 		return;
 
-	mm = get_task_mm(p);
-	if (mm) {
-		mem_cgroup_move_charge(mm);
-		mmput(mm);
-	}
+	mem_cgroup_move_charge(mc.mm);
 	mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */