author     Paul Menage <menage@google.com>                        2007-10-19 02:40:22 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>   2007-10-19 14:53:41 -0400
commit     8707d8b8c0cbdf4441507f8dded194167da896c7 (patch)
tree       1e9ac6b15027bd55263378e551c1595a937d66d6 /kernel
parent     020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff)
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call
  set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection
  of batches of tasks to be processed without duplicating or missing any
  tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed

- fix race between update_cpumask() and sched_setaffinity() by making
  sched_setaffinity() post-check that it's not running on any cpus outside
  cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpuset.c  105
-rw-r--r--  kernel/sched.c    13
2 files changed, 114 insertions, 4 deletions
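
The batching scheme the commit message describes can be illustrated outside the kernel. The following is a minimal user-space sketch of the idea only, not the patched code: collect the earliest-started candidates into a bounded batch, process them, then rescan considering only tasks that started after the newest member of the previous batch. The names here (struct fake_task, BATCH_SIZE, process()) are invented for the example; the kernel version additionally breaks start-time ties by pointer value and holds task references while a batch is processed.

/*
 * Sketch of bounded-batch processing with a monotonically advancing
 * start-time threshold (illustrative only, not kernel code).
 */
#include <stdio.h>

#define BATCH_SIZE 4            /* the kernel uses a PAGE_SIZE-backed ptr_heap */

struct fake_task { long start_time; int id; };

/* stand-in for started_after_time(): strictly later start wins */
static int started_after(const struct fake_task *t, long latest)
{
        return t->start_time > latest;
}

static void process(struct fake_task *t)
{
        printf("updating cpumask of task %d (start %ld)\n", t->id, t->start_time);
}

int main(void)
{
        struct fake_task tasks[] = {
                { 5, 1 }, { 1, 2 }, { 9, 3 }, { 3, 4 }, { 7, 5 }, { 2, 6 }, { 8, 7 },
        };
        int ntasks = sizeof(tasks) / sizeof(tasks[0]);
        long latest = -1;               /* nothing processed yet */

        for (;;) {
                struct fake_task *batch[BATCH_SIZE];
                int size = 0, i;

                /* Collect up to BATCH_SIZE earliest-started unprocessed tasks. */
                for (i = 0; i < ntasks; i++) {
                        struct fake_task *t = &tasks[i];
                        int j, worst = 0;

                        if (!started_after(t, latest))
                                continue;       /* handled in an earlier batch */
                        if (size < BATCH_SIZE) {
                                batch[size++] = t;
                                continue;
                        }
                        /* batch full: evict its latest-started member if t is earlier */
                        for (j = 1; j < BATCH_SIZE; j++)
                                if (batch[j]->start_time > batch[worst]->start_time)
                                        worst = j;
                        if (t->start_time < batch[worst]->start_time)
                                batch[worst] = t;
                }
                if (!size)
                        break;                  /* every task has been processed */

                /* The batch's newest member becomes the next threshold. */
                for (i = 0; i < size; i++) {
                        if (batch[i]->start_time > latest)
                                latest = batch[i]->start_time;
                        process(batch[i]);
                }
        }
        return 0;
}

Because eviction always keeps the earliest-started candidates, no task below the new threshold is ever skipped, and because the threshold only moves forward, no task is processed twice. That is the forward-progress argument the commit message relies on.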
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
         /* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int start_diff = timespec_compare(&t1->start_time, time);
+        if (start_diff > 0) {
+                return 1;
+        } else if (start_diff < 0) {
+                return 0;
+        } else {
+                /*
+                 * Arbitrarily, if two processes started at the same
+                 * time, we'll say that the lower pointer value
+                 * started first. Note that t2 may have exited by now
+                 * so this may not be a valid pointer any longer, but
+                 * that's fine - it still serves to distinguish
+                 * between two tasks started (effectively)
+                 * simultaneously.
+                 */
+                return t1 > t2;
+        }
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *t1 = p1;
+        struct task_struct *t2 = p2;
+        return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
  */
@@ -708,8 +739,15 @@ done:
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
         struct cpuset trialcs;
-        int retval;
-        int cpus_changed, is_load_balanced;
+        int retval, i;
+        int is_load_balanced;
+        struct cgroup_iter it;
+        struct cgroup *cgrp = cs->css.cgroup;
+        struct task_struct *p, *dropped;
+        /* Never dereference latest_task, since it's not refcounted */
+        struct task_struct *latest_task = NULL;
+        struct ptr_heap heap;
+        struct timespec latest_time = { 0, 0 };
 
         /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
         if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         if (retval < 0)
                 return retval;
 
-        cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        /* Nothing to do if the cpus didn't change */
+        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+                return 0;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+        if (retval)
+                return retval;
+
         is_load_balanced = is_sched_load_balance(&trialcs);
 
         mutex_lock(&callback_mutex);
         cs->cpus_allowed = trialcs.cpus_allowed;
         mutex_unlock(&callback_mutex);
 
-        if (cpus_changed && is_load_balanced)
+ again:
+        /*
+         * Scan tasks in the cpuset, and update the cpumasks of any
+         * that need an update. Since we can't call set_cpus_allowed()
+         * while holding tasklist_lock, gather tasks to be processed
+         * in a heap structure. If the statically-sized heap fills up,
+         * overflow tasks that started later, and in future iterations
+         * only consider tasks that started after the latest task in
+         * the previous pass. This guarantees forward progress and
+         * that we don't miss any tasks
+         */
+        heap.size = 0;
+        cgroup_iter_start(cgrp, &it);
+        while ((p = cgroup_iter_next(cgrp, &it))) {
+                /* Only affect tasks that don't have the right cpus_allowed */
+                if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+                        continue;
+                /*
+                 * Only process tasks that started after the last task
+                 * we processed
+                 */
+                if (!started_after_time(p, &latest_time, latest_task))
+                        continue;
+                dropped = heap_insert(&heap, p);
+                if (dropped == NULL) {
+                        get_task_struct(p);
+                } else if (dropped != p) {
+                        get_task_struct(p);
+                        put_task_struct(dropped);
+                }
+        }
+        cgroup_iter_end(cgrp, &it);
+        if (heap.size) {
+                for (i = 0; i < heap.size; i++) {
+                        struct task_struct *p = heap.ptrs[i];
+                        if (i == 0) {
+                                latest_time = p->start_time;
+                                latest_task = p;
+                        }
+                        set_cpus_allowed(p, cs->cpus_allowed);
+                        put_task_struct(p);
+                }
+                /*
+                 * If we had to process any tasks at all, scan again
+                 * in case some of them were in the middle of forking
+                 * children that didn't notice the new cpumask
+                 * restriction. Not the most efficient way to do it,
+                 * but it avoids having to take callback_mutex in the
+                 * fork path
+                 */
+                goto again;
+        }
+        heap_free(&heap);
+        if (is_load_balanced)
                 rebuild_sched_domains();
 
         return 0;
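
One subtle point in the hunk above is the reference counting around heap_insert(): a task is pinned only while it sits in the heap, and the three possible return values (NULL, the inserted pointer itself, or a displaced pointer) each need different handling. Below is a small user-space sketch of that discipline only; bounded_insert(), struct obj and the refs field are invented stand-ins for the kernel's ptr_heap and get_task_struct()/put_task_struct(), not real interfaces.

/* Sketch of the take-a-reference-only-while-in-the-heap rule. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct obj { int refs; int key; };

static struct obj *slot;                /* a "heap" with capacity one */

/*
 * Insert o into the bounded heap.  Returns NULL if there was room,
 * o itself if it was rejected, or the element it displaced.
 */
static struct obj *bounded_insert(struct obj *o)
{
        struct obj *dropped;

        if (!slot) {
                slot = o;
                return NULL;
        }
        if (o->key >= slot->key)
                return o;               /* o loses: caller keeps no reference */
        dropped = slot;
        slot = o;
        return dropped;
}

static void scan_one(struct obj *p)
{
        struct obj *dropped = bounded_insert(p);

        if (dropped == NULL) {
                p->refs++;              /* p is now held by the heap */
        } else if (dropped != p) {
                p->refs++;              /* p replaced dropped ... */
                dropped->refs--;        /* ... so release the evicted element */
        }
        /* dropped == p: p never entered the heap, no reference taken */
}

int main(void)
{
        struct obj a = { 0, 5 }, b = { 0, 3 }, c = { 0, 9 };

        scan_one(&a);   /* heap empty: a inserted, a.refs == 1 */
        scan_one(&b);   /* b displaces a: b.refs == 1, a.refs == 0 */
        scan_one(&c);   /* c rejected: c.refs == 0 */

        assert(a.refs == 0 && b.refs == 1 && c.refs == 0);
        printf("heap holds key %d with %d ref\n", slot->key, slot->refs);
        return 0;
}

In the patch this is what lets the second loop call set_cpus_allowed() and then put_task_struct() on exactly the tasks that survived in the heap, without ever holding a stale reference to one that was evicted.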
diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
         cpus_allowed = cpuset_cpus_allowed(p);
         cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
         retval = set_cpus_allowed(p, new_mask);
 
+        if (!retval) {
+                cpus_allowed = cpuset_cpus_allowed(p);
+                if (!cpus_subset(new_mask, cpus_allowed)) {
+                        /*
+                         * We must have raced with a concurrent cpuset
+                         * update. Just reset the cpus_allowed to the
+                         * cpuset's cpus_allowed
+                         */
+                        new_mask = cpus_allowed;
+                        goto again;
+                }
+        }
 out_unlock:
         put_task_struct(p);
         mutex_unlock(&sched_hotcpu_mutex);
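
The sched.c change is a post-check/retry pattern: apply the mask, then re-read the cpuset's allowed set and retry if a concurrent update_cpumask() shrank it in between. The sketch below simulates that race in user space with a 64-bit mask; fake_cpuset_allowed() and fake_set_cpus_allowed() are invented stand-ins for the kernel helpers, not real APIs.

/* Sketch of the set-then-recheck-then-retry affinity update. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t mask_t;

static mask_t cpuset_mask = 0x0f;       /* pretend cpuset: cpus 0-3 */
static mask_t task_mask;
static int calls;

/* Returns the cpuset's current mask; shrinks once to simulate a racing write. */
static mask_t fake_cpuset_allowed(void)
{
        if (++calls == 2)
                cpuset_mask = 0x03;     /* concurrent update_cpumask() happened */
        return cpuset_mask;
}

static int fake_set_cpus_allowed(mask_t m)
{
        task_mask = m;
        return 0;
}

int main(void)
{
        mask_t new_mask = 0x0c;                     /* caller asked for cpus 2-3 */
        mask_t cpus_allowed = fake_cpuset_allowed();
        int retval;

        new_mask &= cpus_allowed;
again:
        retval = fake_set_cpus_allowed(new_mask);
        if (!retval) {
                cpus_allowed = fake_cpuset_allowed();
                if (new_mask & ~cpus_allowed) {     /* no longer a subset */
                        /* raced with a cpuset update: fall back to its mask */
                        new_mask = cpus_allowed;
                        goto again;
                }
        }
        printf("final task mask: 0x%llx\n", (unsigned long long)task_mask);
        return retval;
}

The post-check is what closes the window where sched_setaffinity() computes its mask from a cpuset that update_cpumask() is about to shrink: the second read either confirms the mask is still valid or forces the task back onto the cpuset's cpus.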