[PATCH] cpuset: rebind vma mempolicies fix

Fix more of a longstanding bug in cpuset/mempolicy interaction. NUMA mempolicies (mm/mempolicy.c) are constrained by the current task's cpuset to just the Memory Nodes allowed by that cpuset. The kernel maintains internal state for each mempolicy, tracking what nodes are used for the MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies. When a task's cpuset memory placement changes, whether because the cpuset changed, or because the task was attached to a different cpuset, then the task's mempolicies have to be rebound to the new cpuset placement, so as to preserve the cpuset-relative numbering of the nodes in that policy. An earlier fix handled such mempolicy rebinding for mempolicies attached to a task. This fix rebinds mempolicies attached to vma's (address ranges in a task's address space). Due to the need to hold the task->mm->mmap_sem semaphore while updating vma's, the rebinding of vma mempolicies has to be done when the cpuset memory placement is changed, at which time mmap_sem can be safely acquired. The task's mempolicy is rebound later, when the task next attempts to allocate memory and notices that its task->cpuset_mems_generation is out-of-date with its cpuset's mems_generation. Because walking the tasklist to find all tasks attached to a changing cpuset requires holding tasklist_lock, a spinlock, one cannot update the vma's of the affected tasks while doing the tasklist scan. In general, one cannot acquire a semaphore (which can sleep) while already holding a spinlock (such as tasklist_lock). So a list of mm references has to be built up during the tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem acquired, and the vma's in that mm rebound. Once the tasklist lock is dropped, affected tasks may fork new tasks, before their mm's are rebound.
A kernel global 'cpuset_being_rebound' is set to point to the cpuset being rebound (there can only be one; cpuset modifications are done under a global 'manage_sem' semaphore), and the mpol_copy code that is used to copy a task's mempolicies during fork catches such forking tasks, and ensures their children are also rebound. When a task is moved to a different cpuset, it is easier, as there is only one task involved. Its mm->vma's are scanned, using the same mpol_rebind_policy() as used above. It may happen that both the mpol_copy hook and the update done via the tasklist scan update the same mm twice. This is ok, as the mempolicies of each vma in an mm keep track of what mems_allowed they are relative to, and safely no-op a second request to rebind to the same nodes. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Paul Jackson <pj@sgi.com> 2006-01-08 04:01:59 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-08 23:13:44 -0500
commit: 4225399a66b315d4d1fb1cb61b75dda201c832e3 (patch)
tree: c8bd976bc6590c5fe859c6129abb93072d99cfa8 /kernel/cpuset.c
parent: 202f72d5d1b5c2c084f63ef996c736d208b447b5 (diff)
1 files changed, 90 insertions, 0 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 }
 /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies.
+ *
 * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
 */
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
        struct cpuset trialcs;
+        struct task_struct *g, *p;
+        struct mm_struct **mmarray;
+        int i, n, ntasks;
+        int fudge;
        int retval;
        trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
        cs->mems_generation = atomic_read(&cpuset_mems_generation);
        up(&callback_sem);
+        set_cpuset_being_rebound(cs);           /* causes mpol_copy() rebind */
+        fudge = 10;                             /* spare mmarray[] slots */
+        fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+        retval = -ENOMEM;
+        /*
+         * Allocate mmarray[] to hold mm reference for each task
+         * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+         * tasklist_lock.  We could use GFP_ATOMIC, but with a
+         * few more lines of code, we can retry until we get a big
+         * enough mmarray[] w/o using GFP_ATOMIC.
+         */
+        while (1) {
+                ntasks = atomic_read(&cs->count);       /* guess */
+                ntasks += fudge;
+                mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+                if (!mmarray)
+                        goto done;
+                write_lock_irq(&tasklist_lock);         /* block fork */
+                if (atomic_read(&cs->count) <= ntasks)
+                        break;                          /* got enough */
+                write_unlock_irq(&tasklist_lock);       /* try again */
+                kfree(mmarray);
+        }
+        n = 0;
+        /* Load up mmarray[] with mm reference for each task in cpuset. */
+        do_each_thread(g, p) {
+                struct mm_struct *mm;
+                if (n >= ntasks) {
+                        printk(KERN_WARNING
+                                "Cpuset mempolicy rebind incomplete.\n");
+                        continue;
+                }
+                if (p->cpuset != cs)
+                        continue;
+                mm = get_task_mm(p);
+                if (!mm)
+                        continue;
+                mmarray[n++] = mm;
+        } while_each_thread(g, p);
+        write_unlock_irq(&tasklist_lock);
+        /*
+         * Now that we've dropped the tasklist spinlock, we can
+         * rebind the vma mempolicies of each mm in mmarray[] to their
+         * new cpuset, and release that mm.  The mpol_rebind_mm()
+         * call takes mmap_sem, which we couldn't take while holding
+         * tasklist_lock.  Forks can happen again now - the mpol_copy()
+         * cpuset_being_rebound check will catch such forks, and rebind
+         * their vma mempolicies too.  Because we still hold the global
+         * cpuset manage_sem, we know that no other rebind effort will
+         * be contending for the global variable cpuset_being_rebound.
+         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+         * is idempotent.
+         */
+        for (i = 0; i < n; i++) {
+                struct mm_struct *mm = mmarray[i];
+                mpol_rebind_mm(mm, &cs->mems_allowed);
+                mmput(mm);
+        }
+        /* We're done rebinding vma's to this cpusets new mems_allowed. */
+        kfree(mmarray);
+        set_cpuset_being_rebound(NULL);
+        retval = 0;
 done:
        return retval;
 }
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
        struct cpuset *oldcs;
        cpumask_t cpus;
        nodemask_t from, to;
+        struct mm_struct *mm;
        if (sscanf(pidbuf, "%d", &pid) != 1)
                return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
        to = cs->mems_allowed;
        up(&callback_sem);
+        mm = get_task_mm(tsk);
+        if (mm) {
+                mpol_rebind_mm(mm, &to);
+                mmput(mm);
+        }
        if (is_memory_migrate(cs))
                do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
        put_task_struct(tsk);
author	Paul Jackson <pj@sgi.com>	2006-01-08 04:01:59 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-08 23:13:44 -0500
commit	4225399a66b315d4d1fb1cb61b75dda201c832e3 (patch)
tree	c8bd976bc6590c5fe859c6129abb93072d99cfa8 /kernel/cpuset.c
parent	202f72d5d1b5c2c084f63ef996c736d208b447b5 (diff)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26ee..19f87565be17 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset cs, char buf)
812	}	812	}
813		813
814	/*	814	/*
		815	* Handle user request to change the 'mems' memory placement
		816	* of a cpuset. Needs to validate the request, update the
		817	* cpusets mems_allowed and mems_generation, and for each
		818	* task in the cpuset, rebind any vma mempolicies.
		819	*
815	* Call with manage_sem held. May take callback_sem during call.	820	* Call with manage_sem held. May take callback_sem during call.
		821	* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
		822	* lock each such tasks mm->mmap_sem, scan its vma's and rebind
		823	* their mempolicies to the cpusets new mems_allowed.
816	*/	824	*/
817		825
818	static int update_nodemask(struct cpuset cs, char buf)	826	static int update_nodemask(struct cpuset cs, char buf)
819	{	827	{
820	struct cpuset trialcs;	828	struct cpuset trialcs;
		829	struct task_struct g, p;
		830	struct mm_struct **mmarray;
		831	int i, n, ntasks;
		832	int fudge;
821	int retval;	833	int retval;
822		834
823	trialcs = *cs;	835	trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset cs, char buf)
839	cs->mems_generation = atomic_read(&cpuset_mems_generation);	851	cs->mems_generation = atomic_read(&cpuset_mems_generation);
840	up(&callback_sem);	852	up(&callback_sem);
841		853
		854	set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
		855
		856	fudge = 10; /* spare mmarray[] slots */
		857	fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
		858	retval = -ENOMEM;
		859
		860	/*
		861	* Allocate mmarray[] to hold mm reference for each task
		862	* in cpuset cs. Can't kmalloc GFP_KERNEL while holding
		863	* tasklist_lock. We could use GFP_ATOMIC, but with a
		864	* few more lines of code, we can retry until we get a big
		865	* enough mmarray[] w/o using GFP_ATOMIC.
		866	*/
		867	while (1) {
		868	ntasks = atomic_read(&cs->count); /* guess */
		869	ntasks += fudge;
		870	mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
		871	if (!mmarray)
		872	goto done;
		873	write_lock_irq(&tasklist_lock); /* block fork */
		874	if (atomic_read(&cs->count) <= ntasks)
		875	break; /* got enough */
		876	write_unlock_irq(&tasklist_lock); /* try again */
		877	kfree(mmarray);
		878	}
		879
		880	n = 0;
		881
		882	/* Load up mmarray[] with mm reference for each task in cpuset. */
		883	do_each_thread(g, p) {
		884	struct mm_struct *mm;
		885
		886	if (n >= ntasks) {
		887	printk(KERN_WARNING
		888	"Cpuset mempolicy rebind incomplete.\n");
		889	continue;
		890	}
		891	if (p->cpuset != cs)
		892	continue;
		893	mm = get_task_mm(p);
		894	if (!mm)
		895	continue;
		896	mmarray[n++] = mm;
		897	} while_each_thread(g, p);
		898	write_unlock_irq(&tasklist_lock);
		899
		900	/*
		901	* Now that we've dropped the tasklist spinlock, we can
		902	* rebind the vma mempolicies of each mm in mmarray[] to their
		903	* new cpuset, and release that mm. The mpol_rebind_mm()
		904	* call takes mmap_sem, which we couldn't take while holding
		905	* tasklist_lock. Forks can happen again now - the mpol_copy()
		906	* cpuset_being_rebound check will catch such forks, and rebind
		907	* their vma mempolicies too. Because we still hold the global
		908	* cpuset manage_sem, we know that no other rebind effort will
		909	* be contending for the global variable cpuset_being_rebound.
		910	* It's ok if we rebind the same mm twice; mpol_rebind_mm()
		911	* is idempotent.
		912	*/
		913	for (i = 0; i < n; i++) {
		914	struct mm_struct *mm = mmarray[i];
		915
		916	mpol_rebind_mm(mm, &cs->mems_allowed);
		917	mmput(mm);
		918	}
		919
		920	/* We're done rebinding vma's to this cpusets new mems_allowed. */
		921	kfree(mmarray);
		922	set_cpuset_being_rebound(NULL);
		923	retval = 0;
842	done:	924	done:
843	return retval;	925	return retval;
844	}	926	}
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset cs, char pidbuf, char **ppathbuf)
1011	struct cpuset *oldcs;	1093	struct cpuset *oldcs;
1012	cpumask_t cpus;	1094	cpumask_t cpus;
1013	nodemask_t from, to;	1095	nodemask_t from, to;
		1096	struct mm_struct *mm;
1014		1097
1015	if (sscanf(pidbuf, "%d", &pid) != 1)	1098	if (sscanf(pidbuf, "%d", &pid) != 1)
1016	return -EIO;	1099	return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset cs, char pidbuf, char **ppathbuf)
1060	to = cs->mems_allowed;	1143	to = cs->mems_allowed;
1061		1144
1062	up(&callback_sem);	1145	up(&callback_sem);
		1146
		1147	mm = get_task_mm(tsk);
		1148	if (mm) {
		1149	mpol_rebind_mm(mm, &to);
		1150	mmput(mm);
		1151	}
		1152
1063	if (is_memory_migrate(cs))	1153	if (is_memory_migrate(cs))
1064	do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);	1154	do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
1065	put_task_struct(tsk);	1155	put_task_struct(tsk);