author		Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>	2011-01-13 18:47:41 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>		2011-01-13 20:32:51 -0500
commit		dfe076b0971a783469bc2066e85d46e23c8acb1c
tree		ee0856c119ed898212da2555f936fb3284c5ff59
parent		043d18b1e5bdfc4870b8a19d00f0d5c636a5c231
memcg: fix deadlock between cpuset and memcg
Commit b1dd693e ("memcg: avoid deadlock between move charge and try_charge()")
can cause another deadlock on mmap_sem during task migration if cpuset and
memcg are mounted onto the same mount point.

After the commit, cgroup_attach_task() has a sequence like:
cgroup_attach_task()
  ss->can_attach()
    cpuset_can_attach()
    mem_cgroup_can_attach()
      down_read(&mmap_sem)        (1)
  ss->attach()
    cpuset_attach()
      mpol_rebind_mm()
        down_write(&mmap_sem)     (2)
        up_write(&mmap_sem)
      cpuset_migrate_mm()
        do_migrate_pages()
          down_read(&mmap_sem)
          up_read(&mmap_sem)
    mem_cgroup_move_task()
      mem_cgroup_clear_mc()
        up_read(&mmap_sem)
We deadlock at (2) because we have already acquired the mmap_sem at (1).
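In other words, the attach path ends up taking the same mmap_sem twice within a single task. A minimal sketch of that pattern (a hypothetical function for illustration only, not code from the patch; mmap_sem is an ordinary, non-recursive rw_semaphore):

#include <linux/mm_types.h>
#include <linux/rwsem.h>

/*
 * Illustrative only: the task already holds mmap_sem for read when it asks
 * for it for write, so down_write() waits for its own reader to go away
 * and never returns.
 */
static void attach_path_sketch(struct mm_struct *mm)
{
        down_read(&mm->mmap_sem);       /* (1) taken in ->can_attach() */

        /* ... later, still inside the same cgroup_attach_task() call ... */

        down_write(&mm->mmap_sem);      /* (2) ->attach() path: blocks forever */
        up_write(&mm->mmap_sem);

        up_read(&mm->mmap_sem);
}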
But the commit itself is necessary to fix deadlocks that existed before it,
such as:
Ex.1)
               move charge              |           try charge
----------------------------------------+------------------------------
  mem_cgroup_can_attach()               |  down_write(&mmap_sem)
    mc.moving_task = current            |    ..
    mem_cgroup_precharge_mc()           |    __mem_cgroup_try_charge()
      mem_cgroup_count_precharge()      |      prepare_to_wait()
        down_read(&mmap_sem)            |      if (mc.moving_task)
        -> cannot acquire the lock      |        -> true
                                        |        schedule()
                                        |        -> move charge should wake it up
Ex.2)
               move charge              |           try charge
----------------------------------------+------------------------------
  mem_cgroup_can_attach()               |
    mc.moving_task = current            |
    mem_cgroup_precharge_mc()           |
      mem_cgroup_count_precharge()      |
        down_read(&mmap_sem)            |
        ..                              |
        up_read(&mmap_sem)              |
                                        |  down_write(&mmap_sem)
  mem_cgroup_move_task()                |    ..
    mem_cgroup_move_charge()            |    __mem_cgroup_try_charge()
      down_read(&mmap_sem)              |      prepare_to_wait()
      -> cannot acquire the lock        |      if (mc.moving_task)
                                        |        -> true
                                        |        schedule()
                                        |        -> move charge should wake it up
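For reference, the "try charge" column above is the waiter side in __mem_cgroup_try_charge(). Roughly, it does the following (a simplified sketch with a hypothetical helper name; the real code performs additional under-move checks before sleeping):

#include <linux/sched.h>
#include <linux/wait.h>

/*
 * Simplified sketch of the "try charge" side: sleep on mc.waitq while a
 * charge-moving task is registered, relying on the mover to call
 * wake_up_all(&mc.waitq) once the move finishes or is cancelled.
 */
static void wait_for_move_sketch(void)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
        /* re-check after queueing ourselves: the move may already be over */
        if (mc.moving_task && mc.moving_task != current)
                schedule();
        finish_wait(&mc.waitq, &wait);
}

If the mover can never reach wake_up_all(&mc.waitq) because it is itself blocked on mmap_sem, both sides sleep forever, which is exactly what Ex.1 and Ex.2 show.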
This patch fixes all of these problems by:
1. reverting the commit;
2. to fix Ex.1, setting mc.moving_task only after mem_cgroup_count_precharge()
   has released the mmap_sem;
3. to fix Ex.2, using down_read_trylock() instead of down_read() in
   mem_cgroup_move_charge() and, if the trylock fails, cancelling all extra
   charges, waking up all waiters, and retrying the trylock (see the sketch
   after this list).
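A condensed view of the resulting code (excerpted from the hunks below; the page-walk body is elided):

/* 2: mc.moving_task is set only after mmap_sem has been dropped again */
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
        unsigned long precharge = mem_cgroup_count_precharge(mm);

        VM_BUG_ON(mc.moving_task);
        mc.moving_task = current;
        return mem_cgroup_do_precharge(precharge);
}

/* 3: never sleep on mmap_sem while chargers may be sleeping on us */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
        lru_add_drain_all();
retry:
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                /* drop our precharges and wake all waiters, then try again */
                __mem_cgroup_clear_mc();
                cond_resched();
                goto retry;
        }
        /* ... walk the VMAs and move the charges (see the diff below) ... */
        up_read(&mm->mmap_sem);
}

Backing off with __mem_cgroup_clear_mc() is acceptable because moving charges is a best-effort feature; cancelled precharges only mean that fewer pages get moved.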
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Reported-by: Ben Blum <bblum@andrew.cmu.edu>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com>
Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/memcontrol.c	84
1 files changed, 49 insertions, 35 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b44ad64f281..c339d7431bda 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
         unsigned long moved_charge;
         unsigned long moved_swap;
         struct task_struct *moving_task;        /* a task moving charges */
-        struct mm_struct *mm;
         wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4681,7 +4680,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
         unsigned long precharge;
         struct vm_area_struct *vma;
 
-        /* We've already held the mmap_sem */
+        down_read(&mm->mmap_sem);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 struct mm_walk mem_cgroup_count_precharge_walk = {
                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4693,6 +4692,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
                 walk_page_range(vma->vm_start, vma->vm_end,
                                         &mem_cgroup_count_precharge_walk);
         }
+        up_read(&mm->mmap_sem);
 
         precharge = mc.precharge;
         mc.precharge = 0;
@@ -4702,10 +4702,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-        return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+        unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+        VM_BUG_ON(mc.moving_task);
+        mc.moving_task = current;
+        return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
         struct mem_cgroup *from = mc.from;
         struct mem_cgroup *to = mc.to;
@@ -4740,23 +4745,28 @@ static void mem_cgroup_clear_mc(void)
                                         PAGE_SIZE * mc.moved_swap);
                 }
                 /* we've already done mem_cgroup_get(mc.to) */
-
                 mc.moved_swap = 0;
         }
-        if (mc.mm) {
-                up_read(&mc.mm->mmap_sem);
-                mmput(mc.mm);
-        }
+        memcg_oom_recover(from);
+        memcg_oom_recover(to);
+        wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+        struct mem_cgroup *from = mc.from;
+
+        /*
+         * we must clear moving_task before waking up waiters at the end of
+         * task migration.
+         */
+        mc.moving_task = NULL;
+        __mem_cgroup_clear_mc();
         spin_lock(&mc.lock);
         mc.from = NULL;
         mc.to = NULL;
         spin_unlock(&mc.lock);
-        mc.moving_task = NULL;
-        mc.mm = NULL;
         mem_cgroup_end_move(from);
-        memcg_oom_recover(from);
-        memcg_oom_recover(to);
-        wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4778,38 +4788,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                         return 0;
                 /* We move charges only when we move a owner of the mm */
                 if (mm->owner == p) {
-                        /*
-                         * We do all the move charge works under one mmap_sem to
-                         * avoid deadlock with down_write(&mmap_sem)
-                         * -> try_charge() -> if (mc.moving_task) -> sleep.
-                         */
-                        down_read(&mm->mmap_sem);
-
                         VM_BUG_ON(mc.from);
                         VM_BUG_ON(mc.to);
                         VM_BUG_ON(mc.precharge);
                         VM_BUG_ON(mc.moved_charge);
                         VM_BUG_ON(mc.moved_swap);
-                        VM_BUG_ON(mc.moving_task);
-                        VM_BUG_ON(mc.mm);
-
                         mem_cgroup_start_move(from);
                         spin_lock(&mc.lock);
                         mc.from = from;
                         mc.to = mem;
-                        mc.precharge = 0;
-                        mc.moved_charge = 0;
-                        mc.moved_swap = 0;
                         spin_unlock(&mc.lock);
-                        mc.moving_task = current;
-                        mc.mm = mm;
+                        /* We set mc.moving_task later */
 
                         ret = mem_cgroup_precharge_mc(mm);
                         if (ret)
                                 mem_cgroup_clear_mc();
-                        /* We call up_read() and mmput() in clear_mc(). */
-                } else
-                        mmput(mm);
+                }
+                mmput(mm);
         }
         return ret;
 }
@@ -4898,7 +4893,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
         struct vm_area_struct *vma;
 
         lru_add_drain_all();
-        /* We've already held the mmap_sem */
+retry:
+        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+                /*
+                 * Someone who are holding the mmap_sem might be waiting in
+                 * waitq. So we cancel all extra charges, wake up all waiters,
+                 * and retry. Because we cancel precharges, we might not be able
+                 * to move enough charges, but moving charge is a best-effort
+                 * feature anyway, so it wouldn't be a big problem.
+                 */
+                __mem_cgroup_clear_mc();
+                cond_resched();
+                goto retry;
+        }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 int ret;
                 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4917,6 +4924,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
                  */
                 break;
         }
+        up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4925,11 +4933,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                 struct task_struct *p,
                                 bool threadgroup)
 {
-        if (!mc.mm)
+        struct mm_struct *mm;
+
+        if (!mc.to)
                 /* no need to move charge */
                 return;
 
-        mem_cgroup_move_charge(mc.mm);
+        mm = get_task_mm(p);
+        if (mm) {
+                mem_cgroup_move_charge(mm);
+                mmput(mm);
+        }
         mem_cgroup_clear_mc();
 }
 #else   /* !CONFIG_MMU */