Fix cpusets update_cpumask

Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call
  set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection of
  batches of tasks to be processed without duplicating or missing any tasks
  in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed

- fix race between update_cpumask() and sched_setaffinity() by making
  sched_setaffinity() post-check that it's not running on any cpus outside
  cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Menage <menage@google.com> 2007-10-19 02:40:22 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:41 -0400
commit: 8707d8b8c0cbdf4441507f8dded194167da896c7 (patch)
tree: 1e9ac6b15027bd55263378e551c1595a937d66d6 /kernel
parent: 020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff)
2 files changed, 114 insertions, 4 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
        /* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int start_diff = timespec_compare(&t1->start_time, time);
+        if (start_diff > 0) {
+                return 1;
+        } else if (start_diff < 0) {
+                return 0;
+        } else {
+                /*
+                 * Arbitrarily, if two processes started at the same
+                 * time, we'll say that the lower pointer value
+                 * started first. Note that t2 may have exited by now
+                 * so this may not be a valid pointer any longer, but
+                 * that's fine - it still serves to distinguish
+                 * between two tasks started (effectively)
+                 * simultaneously.
+                 */
+                return t1 > t2;
+        }
+}
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *t1 = p1;
+        struct task_struct *t2 = p2;
+        return started_after_time(t1, &t2->start_time, t2);
+}
 /*
 * Call with manage_mutex held.  May take callback_mutex during call.
 */
@@ -708,8 +739,15 @@ done:
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
        struct cpuset trialcs;
-        int retval;
+        int retval, i;
-        int cpus_changed, is_load_balanced;
+        int is_load_balanced;
+        struct cgroup_iter it;
+        struct cgroup *cgrp = cs->css.cgroup;
+        struct task_struct *p, *dropped;
+        /* Never dereference latest_task, since it's not refcounted */
+        struct task_struct *latest_task = NULL;
+        struct ptr_heap heap;
+        struct timespec latest_time = { 0, 0 };
        /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
        if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
        if (retval < 0)
                return retval;
-        cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        /* Nothing to do if the cpus didn't change */
+        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+                return 0;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+        if (retval)
+                return retval;
        is_load_balanced = is_sched_load_balance(&trialcs);
        mutex_lock(&callback_mutex);
        cs->cpus_allowed = trialcs.cpus_allowed;
        mutex_unlock(&callback_mutex);
-        if (cpus_changed && is_load_balanced)
+ again:
+        /*
+         * Scan tasks in the cpuset, and update the cpumasks of any
+         * that need an update. Since we can't call set_cpus_allowed()
+         * while holding tasklist_lock, gather tasks to be processed
+         * in a heap structure. If the statically-sized heap fills up,
+         * overflow tasks that started later, and in future iterations
+         * only consider tasks that started after the latest task in
+         * the previous pass. This guarantees forward progress and
+         * that we don't miss any tasks
+         */
+        heap.size = 0;
+        cgroup_iter_start(cgrp, &it);
+        while ((p = cgroup_iter_next(cgrp, &it))) {
+                /* Only affect tasks that don't have the right cpus_allowed */
+                if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+                        continue;
+                /*
+                 * Only process tasks that started after the last task
+                 * we processed
+                 */
+                if (!started_after_time(p, &latest_time, latest_task))
+                        continue;
+                dropped = heap_insert(&heap, p);
+                if (dropped == NULL) {
+                        get_task_struct(p);
+                } else if (dropped != p) {
+                        get_task_struct(p);
+                        put_task_struct(dropped);
+                }
+        }
+        cgroup_iter_end(cgrp, &it);
+        if (heap.size) {
+                for (i = 0; i < heap.size; i++) {
+                        struct task_struct *p = heap.ptrs[i];
+                        if (i == 0) {
+                                latest_time = p->start_time;
+                                latest_task = p;
+                        }
+                        set_cpus_allowed(p, cs->cpus_allowed);
+                        put_task_struct(p);
+                }
+                /*
+                 * If we had to process any tasks at all, scan again
+                 * in case some of them were in the middle of forking
+                 * children that didn't notice the new cpumask
+                 * restriction.  Not the most efficient way to do it,
+                 * but it avoids having to take callback_mutex in the
+                 * fork path
+                 */
+                goto again;
+        }
+        heap_free(&heap);
+        if (is_load_balanced)
                rebuild_sched_domains();
        return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
        cpus_allowed = cpuset_cpus_allowed(p);
        cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
        retval = set_cpus_allowed(p, new_mask);
+        if (!retval) {
+                cpus_allowed = cpuset_cpus_allowed(p);
+                if (!cpus_subset(new_mask, cpus_allowed)) {
+                        /*
+                         * We must have raced with a concurrent cpuset
+                         * update. Just reset the cpus_allowed to the
+                         * cpuset's cpus_allowed
+                         */
+                        new_mask = cpus_allowed;
+                        goto again;
+                }
+        }
 out_unlock:
        put_task_struct(p);
        mutex_unlock(&sched_hotcpu_mutex);
author	Paul Menage <menage@google.com>	2007-10-19 02:40:22 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:41 -0400
commit	8707d8b8c0cbdf4441507f8dded194167da896c7 (patch)
tree	1e9ac6b15027bd55263378e551c1595a937d66d6 /kernel
parent	020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
38	#include <linux/mount.h>	38	#include <linux/mount.h>
39	#include <linux/namei.h>	39	#include <linux/namei.h>
40	#include <linux/pagemap.h>	40	#include <linux/pagemap.h>
		41	#include <linux/prio_heap.h>
41	#include <linux/proc_fs.h>	42	#include <linux/proc_fs.h>
42	#include <linux/rcupdate.h>	43	#include <linux/rcupdate.h>
43	#include <linux/sched.h>	44	#include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
701	/* Don't kfree(doms) -- partition_sched_domains() does that. */	702	/* Don't kfree(doms) -- partition_sched_domains() does that. */
702	}	703	}
703		704
		705	static inline int started_after_time(struct task_struct *t1,
		706	struct timespec *time,
		707	struct task_struct *t2)
		708	{
		709	int start_diff = timespec_compare(&t1->start_time, time);
		710	if (start_diff > 0) {
		711	return 1;
		712	} else if (start_diff < 0) {
		713	return 0;
		714	} else {
		715	/*
		716	* Arbitrarily, if two processes started at the same
		717	* time, we'll say that the lower pointer value
		718	* started first. Note that t2 may have exited by now
		719	* so this may not be a valid pointer any longer, but
		720	* that's fine - it still serves to distinguish
		721	* between two tasks started (effectively)
		722	* simultaneously.
		723	*/
		724	return t1 > t2;
		725	}
		726	}
		727
		728	static inline int started_after(void *p1, void *p2)
		729	{
		730	struct task_struct *t1 = p1;
		731	struct task_struct *t2 = p2;
		732	return started_after_time(t1, &t2->start_time, t2);
		733	}
		734
704	/*	735	/*
705	* Call with manage_mutex held. May take callback_mutex during call.	736	* Call with manage_mutex held. May take callback_mutex during call.
706	*/	737	*/
@@ -708,8 +739,15 @@ done:
708	static int update_cpumask(struct cpuset *cs, char *buf)	739	static int update_cpumask(struct cpuset *cs, char *buf)
709	{	740	{
710	struct cpuset trialcs;	741	struct cpuset trialcs;
711	int retval;	742	int retval, i;
712	int cpus_changed, is_load_balanced;	743	int is_load_balanced;
		744	struct cgroup_iter it;
		745	struct cgroup *cgrp = cs->css.cgroup;
		746	struct task_struct *p, *dropped;
		747	/* Never dereference latest_task, since it's not refcounted */
		748	struct task_struct *latest_task = NULL;
		749	struct ptr_heap heap;
		750	struct timespec latest_time = { 0, 0 };
713		751
714	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */	752	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
715	if (cs == &top_cpuset)	753	if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
736	if (retval < 0)	774	if (retval < 0)
737	return retval;	775	return retval;
738		776
739	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);	777	/* Nothing to do if the cpus didn't change */
		778	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
		779	return 0;
		780	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		781	if (retval)
		782	return retval;
		783
740	is_load_balanced = is_sched_load_balance(&trialcs);	784	is_load_balanced = is_sched_load_balance(&trialcs);
741		785
742	mutex_lock(&callback_mutex);	786	mutex_lock(&callback_mutex);
743	cs->cpus_allowed = trialcs.cpus_allowed;	787	cs->cpus_allowed = trialcs.cpus_allowed;
744	mutex_unlock(&callback_mutex);	788	mutex_unlock(&callback_mutex);
745		789
746	if (cpus_changed && is_load_balanced)	790	again:
		791	/*
		792	* Scan tasks in the cpuset, and update the cpumasks of any
		793	* that need an update. Since we can't call set_cpus_allowed()
		794	* while holding tasklist_lock, gather tasks to be processed
		795	* in a heap structure. If the statically-sized heap fills up,
		796	* overflow tasks that started later, and in future iterations
		797	* only consider tasks that started after the latest task in
		798	* the previous pass. This guarantees forward progress and
		799	* that we don't miss any tasks
		800	*/
		801	heap.size = 0;
		802	cgroup_iter_start(cgrp, &it);
		803	while ((p = cgroup_iter_next(cgrp, &it))) {
		804	/* Only affect tasks that don't have the right cpus_allowed */
		805	if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
		806	continue;
		807	/*
		808	* Only process tasks that started after the last task
		809	* we processed
		810	*/
		811	if (!started_after_time(p, &latest_time, latest_task))
		812	continue;
		813	dropped = heap_insert(&heap, p);
		814	if (dropped == NULL) {
		815	get_task_struct(p);
		816	} else if (dropped != p) {
		817	get_task_struct(p);
		818	put_task_struct(dropped);
		819	}
		820	}
		821	cgroup_iter_end(cgrp, &it);
		822	if (heap.size) {
		823	for (i = 0; i < heap.size; i++) {
		824	struct task_struct *p = heap.ptrs[i];
		825	if (i == 0) {
		826	latest_time = p->start_time;
		827	latest_task = p;
		828	}
		829	set_cpus_allowed(p, cs->cpus_allowed);
		830	put_task_struct(p);
		831	}
		832	/*
		833	* If we had to process any tasks at all, scan again
		834	* in case some of them were in the middle of forking
		835	* children that didn't notice the new cpumask
		836	* restriction. Not the most efficient way to do it,
		837	* but it avoids having to take callback_mutex in the
		838	* fork path
		839	*/
		840	goto again;
		841	}
		842	heap_free(&heap);
		843	if (is_load_balanced)
747	rebuild_sched_domains();	844	rebuild_sched_domains();
748		845
749	return 0;	846	return 0;


diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4471		4471
4472	cpus_allowed = cpuset_cpus_allowed(p);	4472	cpus_allowed = cpuset_cpus_allowed(p);
4473	cpus_and(new_mask, new_mask, cpus_allowed);	4473	cpus_and(new_mask, new_mask, cpus_allowed);
		4474	again:
4474	retval = set_cpus_allowed(p, new_mask);	4475	retval = set_cpus_allowed(p, new_mask);
4475		4476
		4477	if (!retval) {
		4478	cpus_allowed = cpuset_cpus_allowed(p);
		4479	if (!cpus_subset(new_mask, cpus_allowed)) {
		4480	/*
		4481	* We must have raced with a concurrent cpuset
		4482	* update. Just reset the cpus_allowed to the
		4483	* cpuset's cpus_allowed
		4484	*/
		4485	new_mask = cpus_allowed;
		4486	goto again;
		4487	}
		4488	}
4476	out_unlock:	4489	out_unlock:
4477	put_task_struct(p);	4490	put_task_struct(p);
4478	mutex_unlock(&sched_hotcpu_mutex);	4491	mutex_unlock(&sched_hotcpu_mutex);