author    Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>  2010-11-24 15:57:06 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2010-11-24 16:50:44 -0500
commit    b1dd693e5b9348bd68a80e679e03cf9c0973b01b (patch)
tree      557c8e0634fba47ea61ed9fbb8742d907c567944
parent    11e7946f196e5fdde20584e3e58c60335ee3b3bc (diff)
memcg: avoid deadlock between move charge and try_charge()
__mem_cgroup_try_charge() can be called under down_write(&mmap_sem) (e.g.
mlock does it).  This means it can deadlock if it races with move charge:

Ex.1)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
      mc.moving_task = current          |    ..
      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()    |    prepare_to_wait()
          down_read(&mmap_sem)          |    if (mc.moving_task)
          -> cannot acquire the lock    |    -> true
                                        |      schedule()

Ex.2)
                move charge             |        try charge
  --------------------------------------+------------------------------
    mem_cgroup_can_attach()             |
      mc.moving_task = current          |
      mem_cgroup_precharge_mc()         |
        mem_cgroup_count_precharge()    |
          down_read(&mmap_sem)          |
          ..                            |
          up_read(&mmap_sem)            |
                                        |  down_write(&mmap_sem)
    mem_cgroup_move_task()              |    ..
      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)            |    prepare_to_wait()
        -> cannot acquire the lock      |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()

To avoid this deadlock, we do all of the move-charge work (both
can_attach() and attach()) under one mmap_sem section.

And after this patch, we set/clear mc.moving_task outside mc.lock,
because we use the lock only to check mc.from/to.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
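Ex.1 is the classic shape where one task sleeps, while holding a lock, on a flag that only the lock's next waiter can clear. A minimal userspace sketch of that shape, assuming pthreads stand in for the kernel primitives (the rwlock for mmap_sem, a flag plus condvar for mc.moving_task and mc.waitq; every name here is illustrative, not kernel API):

/* Model of Ex.1.  Build: cc -pthread ex1_model.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  mc_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t   mc_waitq = PTHREAD_COND_INITIALIZER;
static bool moving;			/* models "mc.moving_task != NULL" */

/* The mlock() side: down_write(&mmap_sem), then try_charge() sleeps. */
static void *charger(void *unused)
{
	(void)unused;
	pthread_rwlock_wrlock(&mmap_sem);	/* down_write(&mmap_sem) */
	pthread_mutex_lock(&mc_lock);
	while (moving)				/* if (mc.moving_task) -> true */
		pthread_cond_wait(&mc_waitq, &mc_lock);	/* schedule() */
	pthread_mutex_unlock(&mc_lock);
	pthread_rwlock_unlock(&mmap_sem);	/* never reached in this run */
	return NULL;
}

int main(void)
{
	pthread_t c;
	struct timespec deadline;

	moving = true;	/* mem_cgroup_can_attach(): mc.moving_task = current */
	pthread_create(&c, NULL, charger, NULL);
	sleep(1);	/* let the charger take mmap_sem and go to sleep */

	/* mem_cgroup_count_precharge(): down_read(&mmap_sem) */
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 3;
	if (pthread_rwlock_timedrdlock(&mmap_sem, &deadline) != 0)
		puts("Ex.1: reader stuck behind a writer that is waiting on us");
	return 0;	/* exiting main() tears down the stuck charger thread */
}

The charger sleeps holding the write side; the mover then needs the read side of the same lock to make progress and clear the flag, so neither can advance. The patch's answer is to make the mover take the read side first and keep it for the whole operation, as the hunks below show.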
 mm/memcontrol.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 62d1880f6992..26218df8d19d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -278,13 +278,14 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t	  lock; /* for from, to, moving_task */
+	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
+	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
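As a reading aid, here are the locking rules after this patch, annotated on the structure from the hunk above. The annotations are one interpretation of the patch, not comments from the kernel source; the initializer is omitted.

static struct move_charge_struct {
	spinlock_t	  lock;		/* now guards only from and to */
	struct mem_cgroup *from;	/* mc.lock (checked by try_charge()) */
	struct mem_cgroup *to;		/* mc.lock */
	unsigned long precharge;	/* cgroup_mutex */
	unsigned long moved_charge;	/* cgroup_mutex */
	unsigned long moved_swap;	/* cgroup_mutex */
	struct task_struct *moving_task; /* cgroup_mutex; set/cleared
					    outside mc.lock after this patch */
	struct mm_struct *mm;		/* new: carries the mm reference and
					   the held mmap_sem (read) from
					   can_attach() to clear_mc() */
	wait_queue_head_t waitq;	/* where try_charge() sleeps while a
					   move is in flight */
} mc;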
@@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
-	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void)
 
 		mc.moved_swap = 0;
 	}
+	if (mc.mm) {
+		up_read(&mc.mm->mmap_sem);
+		mmput(mc.mm);
+	}
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mc.moving_task = NULL;
+	mc.mm = NULL;
 	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
@@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
+			/*
+			 * We do all the move charge works under one mmap_sem to
+			 * avoid deadlock with down_write(&mmap_sem)
+			 * -> try_charge() -> if (mc.moving_task) -> sleep.
+			 */
+			down_read(&mm->mmap_sem);
+
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			VM_BUG_ON(mc.mm);
+
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
@@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			mc.moving_task = current;
+			mc.mm = mm;
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+			/* We call up_read() and mmput() in clear_mc(). */
+		} else
+			mmput(mm);
 	}
 	return ret;
 }
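In terms of the hypothetical userspace model sketched under the commit message (it reuses mmap_sem, mc_lock, mc_waitq and moving from there), the fix implemented by the two hunks above looks roughly like this. It is a sketch of the pattern, not the kernel code:

static pthread_rwlock_t *held;		/* models mc.mm */

static void can_attach_model(void)
{
	pthread_rwlock_rdlock(&mmap_sem);	/* down_read(&mm->mmap_sem) */
	held = &mmap_sem;			/* mc.mm = mm */
	pthread_mutex_lock(&mc_lock);
	moving = true;				/* mc.moving_task = current */
	pthread_mutex_unlock(&mc_lock);
	/*
	 * Precharge and move both run here with the read side still held.
	 * Any writer that observes "moving" must have taken the lock after
	 * we dropped it, so the mover never waits on a sleeping writer and
	 * the Ex.1/Ex.2 interleavings cannot happen.
	 */
}

static void clear_mc_model(void)
{
	if (held) {				/* if (mc.mm) */
		pthread_rwlock_unlock(held);	/* up_read(); mmput() */
		held = NULL;			/* mc.mm = NULL */
	}
	pthread_mutex_lock(&mc_lock);
	moving = false;				/* mc.moving_task = NULL */
	pthread_cond_broadcast(&mc_waitq);	/* wake sleepers in try_charge() */
	pthread_mutex_unlock(&mc_lock);
}

Note the single cleanup path: because the lock and the mm reference are stashed in mc.mm, clear_mc() can release both whether the attach succeeded or the precharge failed, which is why can_attach() above only calls mmput() on the else branch.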
@@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 		 */
 		break;
 	}
-	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	struct mm_struct *mm;
-
-	if (!mc.to)
+	if (!mc.mm)
 		/* no need to move charge */
 		return;
 
-	mm = get_task_mm(p);
-	if (mm) {
-		mem_cgroup_move_charge(mm);
-		mmput(mm);
-	}
+	mem_cgroup_move_charge(mc.mm);
 	mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */