diff options
| author | Paul Menage <menage@google.com> | 2007-10-19 02:40:22 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-19 14:53:41 -0400 |
| commit | 8707d8b8c0cbdf4441507f8dded194167da896c7 (patch) | |
| tree | 1e9ac6b15027bd55263378e551c1595a937d66d6 /kernel | |
| parent | 020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff) | |
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:
- collect batches of tasks under tasklist_lock and then call
  set_cpus_allowed() on them outside the lock (since this can sleep).
- add a simple generic priority heap type to allow efficient collection
  of batches of tasks to be processed without duplicating or missing any
  tasks in subsequent batches.
- make "cpus" file update a no-op if the mask hasn't changed
- fix race between update_cpumask() and sched_setaffinity() by making
  sched_setaffinity() post-check that it's not running on any cpus outside
  cpuset_cpus_allowed().
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cpuset.c | 105 | ||||
| -rw-r--r-- | kernel/sched.c | 13 |
2 files changed, 114 insertions, 4 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
| 39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
| 40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
| 41 | #include <linux/prio_heap.h> | ||
| 41 | #include <linux/proc_fs.h> | 42 | #include <linux/proc_fs.h> |
| 42 | #include <linux/rcupdate.h> | 43 | #include <linux/rcupdate.h> |
| 43 | #include <linux/sched.h> | 44 | #include <linux/sched.h> |
| @@ -701,6 +702,36 @@ done: | |||
| 701 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 702 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
| 702 | } | 703 | } |
| 703 | 704 | ||
| 705 | static inline int started_after_time(struct task_struct *t1, | ||
| 706 | struct timespec *time, | ||
| 707 | struct task_struct *t2) | ||
| 708 | { | ||
| 709 | int start_diff = timespec_compare(&t1->start_time, time); | ||
| 710 | if (start_diff > 0) { | ||
| 711 | return 1; | ||
| 712 | } else if (start_diff < 0) { | ||
| 713 | return 0; | ||
| 714 | } else { | ||
| 715 | /* | ||
| 716 | * Arbitrarily, if two processes started at the same | ||
| 717 | * time, we'll say that the lower pointer value | ||
| 718 | * started first. Note that t2 may have exited by now | ||
| 719 | * so this may not be a valid pointer any longer, but | ||
| 720 | * that's fine - it still serves to distinguish | ||
| 721 | * between two tasks started (effectively) | ||
| 722 | * simultaneously. | ||
| 723 | */ | ||
| 724 | return t1 > t2; | ||
| 725 | } | ||
| 726 | } | ||
| 727 | |||
| 728 | static inline int started_after(void *p1, void *p2) | ||
| 729 | { | ||
| 730 | struct task_struct *t1 = p1; | ||
| 731 | struct task_struct *t2 = p2; | ||
| 732 | return started_after_time(t1, &t2->start_time, t2); | ||
| 733 | } | ||
| 734 | |||
| 704 | /* | 735 | /* |
| 705 | * Call with manage_mutex held. May take callback_mutex during call. | 736 | * Call with manage_mutex held. May take callback_mutex during call. |
| 706 | */ | 737 | */ |
| @@ -708,8 +739,15 @@ done: | |||
| 708 | static int update_cpumask(struct cpuset *cs, char *buf) | 739 | static int update_cpumask(struct cpuset *cs, char *buf) |
| 709 | { | 740 | { |
| 710 | struct cpuset trialcs; | 741 | struct cpuset trialcs; |
| 711 | int retval; | 742 | int retval, i; |
| 712 | int cpus_changed, is_load_balanced; | 743 | int is_load_balanced; |
| 744 | struct cgroup_iter it; | ||
| 745 | struct cgroup *cgrp = cs->css.cgroup; | ||
| 746 | struct task_struct *p, *dropped; | ||
| 747 | /* Never dereference latest_task, since it's not refcounted */ | ||
| 748 | struct task_struct *latest_task = NULL; | ||
| 749 | struct ptr_heap heap; | ||
| 750 | struct timespec latest_time = { 0, 0 }; | ||
| 713 | 751 | ||
| 714 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 752 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ |
| 715 | if (cs == &top_cpuset) | 753 | if (cs == &top_cpuset) |
| @@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 736 | if (retval < 0) | 774 | if (retval < 0) |
| 737 | return retval; | 775 | return retval; |
| 738 | 776 | ||
| 739 | cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 777 | /* Nothing to do if the cpus didn't change */ |
| 778 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | ||
| 779 | return 0; | ||
| 780 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); | ||
| 781 | if (retval) | ||
| 782 | return retval; | ||
| 783 | |||
| 740 | is_load_balanced = is_sched_load_balance(&trialcs); | 784 | is_load_balanced = is_sched_load_balance(&trialcs); |
| 741 | 785 | ||
| 742 | mutex_lock(&callback_mutex); | 786 | mutex_lock(&callback_mutex); |
| 743 | cs->cpus_allowed = trialcs.cpus_allowed; | 787 | cs->cpus_allowed = trialcs.cpus_allowed; |
| 744 | mutex_unlock(&callback_mutex); | 788 | mutex_unlock(&callback_mutex); |
| 745 | 789 | ||
| 746 | if (cpus_changed && is_load_balanced) | 790 | again: |
| 791 | /* | ||
| 792 | * Scan tasks in the cpuset, and update the cpumasks of any | ||
| 793 | * that need an update. Since we can't call set_cpus_allowed() | ||
| 794 | * while holding tasklist_lock, gather tasks to be processed | ||
| 795 | * in a heap structure. If the statically-sized heap fills up, | ||
| 796 | * overflow tasks that started later, and in future iterations | ||
| 797 | * only consider tasks that started after the latest task in | ||
| 798 | * the previous pass. This guarantees forward progress and | ||
| 799 | * that we don't miss any tasks | ||
| 800 | */ | ||
| 801 | heap.size = 0; | ||
| 802 | cgroup_iter_start(cgrp, &it); | ||
| 803 | while ((p = cgroup_iter_next(cgrp, &it))) { | ||
| 804 | /* Only affect tasks that don't have the right cpus_allowed */ | ||
| 805 | if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) | ||
| 806 | continue; | ||
| 807 | /* | ||
| 808 | * Only process tasks that started after the last task | ||
| 809 | * we processed | ||
| 810 | */ | ||
| 811 | if (!started_after_time(p, &latest_time, latest_task)) | ||
| 812 | continue; | ||
| 813 | dropped = heap_insert(&heap, p); | ||
| 814 | if (dropped == NULL) { | ||
| 815 | get_task_struct(p); | ||
| 816 | } else if (dropped != p) { | ||
| 817 | get_task_struct(p); | ||
| 818 | put_task_struct(dropped); | ||
| 819 | } | ||
| 820 | } | ||
| 821 | cgroup_iter_end(cgrp, &it); | ||
| 822 | if (heap.size) { | ||
| 823 | for (i = 0; i < heap.size; i++) { | ||
| 824 | struct task_struct *p = heap.ptrs[i]; | ||
| 825 | if (i == 0) { | ||
| 826 | latest_time = p->start_time; | ||
| 827 | latest_task = p; | ||
| 828 | } | ||
| 829 | set_cpus_allowed(p, cs->cpus_allowed); | ||
| 830 | put_task_struct(p); | ||
| 831 | } | ||
| 832 | /* | ||
| 833 | * If we had to process any tasks at all, scan again | ||
| 834 | * in case some of them were in the middle of forking | ||
| 835 | * children that didn't notice the new cpumask | ||
| 836 | * restriction. Not the most efficient way to do it, | ||
| 837 | * but it avoids having to take callback_mutex in the | ||
| 838 | * fork path | ||
| 839 | */ | ||
| 840 | goto again; | ||
| 841 | } | ||
| 842 | heap_free(&heap); | ||
| 843 | if (is_load_balanced) | ||
| 747 | rebuild_sched_domains(); | 844 | rebuild_sched_domains(); |
| 748 | 845 | ||
| 749 | return 0; | 846 | return 0; |
diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
| 4471 | 4471 | ||
| 4472 | cpus_allowed = cpuset_cpus_allowed(p); | 4472 | cpus_allowed = cpuset_cpus_allowed(p); |
| 4473 | cpus_and(new_mask, new_mask, cpus_allowed); | 4473 | cpus_and(new_mask, new_mask, cpus_allowed); |
| 4474 | again: | ||
| 4474 | retval = set_cpus_allowed(p, new_mask); | 4475 | retval = set_cpus_allowed(p, new_mask); |
| 4475 | 4476 | ||
| 4477 | if (!retval) { | ||
| 4478 | cpus_allowed = cpuset_cpus_allowed(p); | ||
| 4479 | if (!cpus_subset(new_mask, cpus_allowed)) { | ||
| 4480 | /* | ||
| 4481 | * We must have raced with a concurrent cpuset | ||
| 4482 | * update. Just reset the cpus_allowed to the | ||
| 4483 | * cpuset's cpus_allowed | ||
| 4484 | */ | ||
| 4485 | new_mask = cpus_allowed; | ||
| 4486 | goto again; | ||
| 4487 | } | ||
| 4488 | } | ||
| 4476 | out_unlock: | 4489 | out_unlock: |
| 4477 | put_task_struct(p); | 4490 | put_task_struct(p); |
| 4478 | mutex_unlock(&sched_hotcpu_mutex); | 4491 | mutex_unlock(&sched_hotcpu_mutex); |
