diff options
author | Paul Menage <menage@google.com> | 2007-10-19 02:40:22 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-19 14:53:41 -0400 |
commit | 8707d8b8c0cbdf4441507f8dded194167da896c7 (patch) | |
tree | 1e9ac6b15027bd55263378e551c1595a937d66d6 /kernel | |
parent | 020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff) |
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:
- collect batches of tasks under tasklist_lock and then call
  set_cpus_allowed() on them outside the lock (since this can sleep).
- add a simple generic priority heap type to allow efficient collection
  of batches of tasks to be processed without duplicating or missing any
  tasks in subsequent batches.
- make "cpus" file update a no-op if the mask hasn't changed
- fix race between update_cpumask() and sched_setaffinity() by making
  sched_setaffinity() post-check that it's not running on any cpus outside
  cpuset_cpus_allowed().
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 105 | ||||
-rw-r--r-- | kernel/sched.c | 13 |
2 files changed, 114 insertions, 4 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64ad59cfad9b..fa31cb9f9898 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
41 | #include <linux/prio_heap.h> | ||
41 | #include <linux/proc_fs.h> | 42 | #include <linux/proc_fs.h> |
42 | #include <linux/rcupdate.h> | 43 | #include <linux/rcupdate.h> |
43 | #include <linux/sched.h> | 44 | #include <linux/sched.h> |
@@ -701,6 +702,36 @@ done: | |||
701 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 702 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
702 | } | 703 | } |
703 | 704 | ||
705 | static inline int started_after_time(struct task_struct *t1, | ||
706 | struct timespec *time, | ||
707 | struct task_struct *t2) | ||
708 | { | ||
709 | int start_diff = timespec_compare(&t1->start_time, time); | ||
710 | if (start_diff > 0) { | ||
711 | return 1; | ||
712 | } else if (start_diff < 0) { | ||
713 | return 0; | ||
714 | } else { | ||
715 | /* | ||
716 | * Arbitrarily, if two processes started at the same | ||
717 | * time, we'll say that the lower pointer value | ||
718 | * started first. Note that t2 may have exited by now | ||
719 | * so this may not be a valid pointer any longer, but | ||
720 | * that's fine - it still serves to distinguish | ||
721 | * between two tasks started (effectively) | ||
722 | * simultaneously. | ||
723 | */ | ||
724 | return t1 > t2; | ||
725 | } | ||
726 | } | ||
727 | |||
728 | static inline int started_after(void *p1, void *p2) | ||
729 | { | ||
730 | struct task_struct *t1 = p1; | ||
731 | struct task_struct *t2 = p2; | ||
732 | return started_after_time(t1, &t2->start_time, t2); | ||
733 | } | ||
734 | |||
704 | /* | 735 | /* |
705 | * Call with manage_mutex held. May take callback_mutex during call. | 736 | * Call with manage_mutex held. May take callback_mutex during call. |
706 | */ | 737 | */ |
@@ -708,8 +739,15 @@ done: | |||
708 | static int update_cpumask(struct cpuset *cs, char *buf) | 739 | static int update_cpumask(struct cpuset *cs, char *buf) |
709 | { | 740 | { |
710 | struct cpuset trialcs; | 741 | struct cpuset trialcs; |
711 | int retval; | 742 | int retval, i; |
712 | int cpus_changed, is_load_balanced; | 743 | int is_load_balanced; |
744 | struct cgroup_iter it; | ||
745 | struct cgroup *cgrp = cs->css.cgroup; | ||
746 | struct task_struct *p, *dropped; | ||
747 | /* Never dereference latest_task, since it's not refcounted */ | ||
748 | struct task_struct *latest_task = NULL; | ||
749 | struct ptr_heap heap; | ||
750 | struct timespec latest_time = { 0, 0 }; | ||
713 | 751 | ||
714 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 752 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ |
715 | if (cs == &top_cpuset) | 753 | if (cs == &top_cpuset) |
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
736 | if (retval < 0) | 774 | if (retval < 0) |
737 | return retval; | 775 | return retval; |
738 | 776 | ||
739 | cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 777 | /* Nothing to do if the cpus didn't change */ |
778 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | ||
779 | return 0; | ||
780 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); | ||
781 | if (retval) | ||
782 | return retval; | ||
783 | |||
740 | is_load_balanced = is_sched_load_balance(&trialcs); | 784 | is_load_balanced = is_sched_load_balance(&trialcs); |
741 | 785 | ||
742 | mutex_lock(&callback_mutex); | 786 | mutex_lock(&callback_mutex); |
743 | cs->cpus_allowed = trialcs.cpus_allowed; | 787 | cs->cpus_allowed = trialcs.cpus_allowed; |
744 | mutex_unlock(&callback_mutex); | 788 | mutex_unlock(&callback_mutex); |
745 | 789 | ||
746 | if (cpus_changed && is_load_balanced) | 790 | again: |
791 | /* | ||
792 | * Scan tasks in the cpuset, and update the cpumasks of any | ||
793 | * that need an update. Since we can't call set_cpus_allowed() | ||
794 | * while holding tasklist_lock, gather tasks to be processed | ||
795 | * in a heap structure. If the statically-sized heap fills up, | ||
796 | * overflow tasks that started later, and in future iterations | ||
797 | * only consider tasks that started after the latest task in | ||
798 | * the previous pass. This guarantees forward progress and | ||
799 | * that we don't miss any tasks | ||
800 | */ | ||
801 | heap.size = 0; | ||
802 | cgroup_iter_start(cgrp, &it); | ||
803 | while ((p = cgroup_iter_next(cgrp, &it))) { | ||
804 | /* Only affect tasks that don't have the right cpus_allowed */ | ||
805 | if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) | ||
806 | continue; | ||
807 | /* | ||
808 | * Only process tasks that started after the last task | ||
809 | * we processed | ||
810 | */ | ||
811 | if (!started_after_time(p, &latest_time, latest_task)) | ||
812 | continue; | ||
813 | dropped = heap_insert(&heap, p); | ||
814 | if (dropped == NULL) { | ||
815 | get_task_struct(p); | ||
816 | } else if (dropped != p) { | ||
817 | get_task_struct(p); | ||
818 | put_task_struct(dropped); | ||
819 | } | ||
820 | } | ||
821 | cgroup_iter_end(cgrp, &it); | ||
822 | if (heap.size) { | ||
823 | for (i = 0; i < heap.size; i++) { | ||
824 | struct task_struct *p = heap.ptrs[i]; | ||
825 | if (i == 0) { | ||
826 | latest_time = p->start_time; | ||
827 | latest_task = p; | ||
828 | } | ||
829 | set_cpus_allowed(p, cs->cpus_allowed); | ||
830 | put_task_struct(p); | ||
831 | } | ||
832 | /* | ||
833 | * If we had to process any tasks at all, scan again | ||
834 | * in case some of them were in the middle of forking | ||
835 | * children that didn't notice the new cpumask | ||
836 | * restriction. Not the most efficient way to do it, | ||
837 | * but it avoids having to take callback_mutex in the | ||
838 | * fork path | ||
839 | */ | ||
840 | goto again; | ||
841 | } | ||
842 | heap_free(&heap); | ||
843 | if (is_load_balanced) | ||
747 | rebuild_sched_domains(); | 844 | rebuild_sched_domains(); |
748 | 845 | ||
749 | return 0; | 846 | return 0; |
diff --git a/kernel/sched.c b/kernel/sched.c index 39d6354af489..72a809a54d5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4471 | 4471 | ||
4472 | cpus_allowed = cpuset_cpus_allowed(p); | 4472 | cpus_allowed = cpuset_cpus_allowed(p); |
4473 | cpus_and(new_mask, new_mask, cpus_allowed); | 4473 | cpus_and(new_mask, new_mask, cpus_allowed); |
4474 | again: | ||
4474 | retval = set_cpus_allowed(p, new_mask); | 4475 | retval = set_cpus_allowed(p, new_mask); |
4475 | 4476 | ||
4477 | if (!retval) { | ||
4478 | cpus_allowed = cpuset_cpus_allowed(p); | ||
4479 | if (!cpus_subset(new_mask, cpus_allowed)) { | ||
4480 | /* | ||
4481 | * We must have raced with a concurrent cpuset | ||
4482 | * update. Just reset the cpus_allowed to the | ||
4483 | * cpuset's cpus_allowed | ||
4484 | */ | ||
4485 | new_mask = cpus_allowed; | ||
4486 | goto again; | ||
4487 | } | ||
4488 | } | ||
4476 | out_unlock: | 4489 | out_unlock: |
4477 | put_task_struct(p); | 4490 | put_task_struct(p); |
4478 | mutex_unlock(&sched_hotcpu_mutex); | 4491 | mutex_unlock(&sched_hotcpu_mutex); |