author     Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>   2011-01-13 18:47:41 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2011-01-13 20:32:51 -0500
commit     dfe076b0971a783469bc2066e85d46e23c8acb1c (patch)
tree       ee0856c119ed898212da2555f936fb3284c5ff59 /mm
parent     043d18b1e5bdfc4870b8a19d00f0d5c636a5c231 (diff)
memcg: fix deadlock between cpuset and memcg
Commit b1dd693e ("memcg: avoid deadlock between move charge and try_charge()") can cause another deadlock about mmap_sem on task migration if cpuset and memcg are mounted onto the same mount point. After the commit, cgroup_attach_task() has sequence like: cgroup_attach_task() ss->can_attach() cpuset_can_attach() mem_cgroup_can_attach() down_read(&mmap_sem) (1) ss->attach() cpuset_attach() mpol_rebind_mm() down_write(&mmap_sem) (2) up_write(&mmap_sem) cpuset_migrate_mm() do_migrate_pages() down_read(&mmap_sem) up_read(&mmap_sem) mem_cgroup_move_task() mem_cgroup_clear_mc() up_read(&mmap_sem) We can cause deadlock at (2) because we've already aquire the mmap_sem at (1). But the commit itself is necessary to fix deadlocks which have existed before the commit like: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() | -> move charge should wake it up Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() | -> move charge should wake it up This patch fixes all of these problems by: 1. revert the commit. 2. To fix the Ex.1, we set mc.moving_task after mem_cgroup_count_precharge() has released the mmap_sem. 3. To fix the Ex.2, we use down_read_trylock() instead of down_read() in mem_cgroup_move_charge() and, if it has failed to aquire the lock, cancel all extra charges, wake up all waiters, and retry trylock. Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Reported-by: Ben Blum <bblum@andrew.cmu.edu> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Paul Menage <menage@google.com> Cc: Hiroyuki Kamezawa <kamezawa.hiroyuki@gmail.com> Cc: Balbir Singh <balbir@in.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c  84
1 file changed, 49 insertions, 35 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b44ad64f281..c339d7431bda 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
         unsigned long moved_charge;
         unsigned long moved_swap;
         struct task_struct *moving_task;        /* a task moving charges */
-        struct mm_struct *mm;
         wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4681,7 +4680,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
         unsigned long precharge;
         struct vm_area_struct *vma;
 
-        /* We've already held the mmap_sem */
+        down_read(&mm->mmap_sem);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 struct mm_walk mem_cgroup_count_precharge_walk = {
                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4693,6 +4692,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
                 walk_page_range(vma->vm_start, vma->vm_end,
                                         &mem_cgroup_count_precharge_walk);
         }
+        up_read(&mm->mmap_sem);
 
         precharge = mc.precharge;
         mc.precharge = 0;
@@ -4702,10 +4702,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
-        return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+        unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+        VM_BUG_ON(mc.moving_task);
+        mc.moving_task = current;
+        return mem_cgroup_do_precharge(precharge);
 }
 
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
 {
         struct mem_cgroup *from = mc.from;
         struct mem_cgroup *to = mc.to;
@@ -4740,23 +4745,28 @@ static void mem_cgroup_clear_mc(void)
                                                 PAGE_SIZE * mc.moved_swap);
                 }
                 /* we've already done mem_cgroup_get(mc.to) */
-
                 mc.moved_swap = 0;
         }
-        if (mc.mm) {
-                up_read(&mc.mm->mmap_sem);
-                mmput(mc.mm);
-        }
+        memcg_oom_recover(from);
+        memcg_oom_recover(to);
+        wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+        struct mem_cgroup *from = mc.from;
+
+        /*
+         * we must clear moving_task before waking up waiters at the end of
+         * task migration.
+         */
+        mc.moving_task = NULL;
+        __mem_cgroup_clear_mc();
         spin_lock(&mc.lock);
         mc.from = NULL;
         mc.to = NULL;
         spin_unlock(&mc.lock);
-        mc.moving_task = NULL;
-        mc.mm = NULL;
         mem_cgroup_end_move(from);
-        memcg_oom_recover(from);
-        memcg_oom_recover(to);
-        wake_up_all(&mc.waitq);
 }
 
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4778,38 +4788,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
                         return 0;
                 /* We move charges only when we move a owner of the mm */
                 if (mm->owner == p) {
-                        /*
-                         * We do all the move charge works under one mmap_sem to
-                         * avoid deadlock with down_write(&mmap_sem)
-                         * -> try_charge() -> if (mc.moving_task) -> sleep.
-                         */
-                        down_read(&mm->mmap_sem);
-
                         VM_BUG_ON(mc.from);
                         VM_BUG_ON(mc.to);
                         VM_BUG_ON(mc.precharge);
                         VM_BUG_ON(mc.moved_charge);
                         VM_BUG_ON(mc.moved_swap);
-                        VM_BUG_ON(mc.moving_task);
-                        VM_BUG_ON(mc.mm);
-
                         mem_cgroup_start_move(from);
                         spin_lock(&mc.lock);
                         mc.from = from;
                         mc.to = mem;
-                        mc.precharge = 0;
-                        mc.moved_charge = 0;
-                        mc.moved_swap = 0;
                         spin_unlock(&mc.lock);
-                        mc.moving_task = current;
-                        mc.mm = mm;
+                        /* We set mc.moving_task later */
 
                         ret = mem_cgroup_precharge_mc(mm);
                         if (ret)
                                 mem_cgroup_clear_mc();
-                        /* We call up_read() and mmput() in clear_mc(). */
-                } else
-                        mmput(mm);
+                }
+                mmput(mm);
         }
         return ret;
 }
@@ -4898,7 +4893,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
         struct vm_area_struct *vma;
 
         lru_add_drain_all();
-        /* We've already held the mmap_sem */
+retry:
+        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+                /*
+                 * Someone who are holding the mmap_sem might be waiting in
+                 * waitq. So we cancel all extra charges, wake up all waiters,
+                 * and retry. Because we cancel precharges, we might not be able
+                 * to move enough charges, but moving charge is a best-effort
+                 * feature anyway, so it wouldn't be a big problem.
+                 */
+                __mem_cgroup_clear_mc();
+                cond_resched();
+                goto retry;
+        }
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 int ret;
                 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4917,6 +4924,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
                          */
                         break;
         }
+        up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4925,11 +4933,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                 struct task_struct *p,
                                 bool threadgroup)
 {
-        if (!mc.mm)
+        struct mm_struct *mm;
+
+        if (!mc.to)
                 /* no need to move charge */
                 return;
 
-        mem_cgroup_move_charge(mc.mm);
+        mm = get_task_mm(p);
+        if (mm) {
+                mem_cgroup_move_charge(mm);
+                mmput(mm);
+        }
         mem_cgroup_clear_mc();
 }
 #else        /* !CONFIG_MMU */