diff options
author | Paul Menage <menage@google.com> | 2007-10-19 02:40:22 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-19 14:53:41 -0400 |
commit | 8707d8b8c0cbdf4441507f8dded194167da896c7 (patch) | |
tree | 1e9ac6b15027bd55263378e551c1595a937d66d6 | |
parent | 020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff) |
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:
- collect batches of tasks under tasklist_lock and then call
set_cpus_allowed() on them outside the lock (since this can sleep).
- add a simple generic priority heap type to allow efficient collection
of batches of tasks to be processed without duplicating or missing any
tasks in subsequent batches.
- make "cpus" file update a no-op if the mask hasn't changed
- fix race between update_cpumask() and sched_setaffinity() by making
sched_setaffinity() post-check that it's not running on any cpus outside
cpuset_cpus_allowed().
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/prio_heap.h | 58 | ||||
-rw-r--r-- | kernel/cpuset.c | 105 | ||||
-rw-r--r-- | kernel/sched.c | 13 | ||||
-rw-r--r-- | lib/Makefile | 2 | ||||
-rw-r--r-- | lib/prio_heap.c | 70 |
5 files changed, 243 insertions, 5 deletions
diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h new file mode 100644 index 000000000000..08094350f26a --- /dev/null +++ b/include/linux/prio_heap.h | |||
@@ -0,0 +1,58 @@ | |||
1 | #ifndef _LINUX_PRIO_HEAP_H | ||
2 | #define _LINUX_PRIO_HEAP_H | ||
3 | |||
4 | /* | ||
5 | * Simple insertion-only static-sized priority heap containing | ||
6 | * pointers, based on CLR, chapter 7 | ||
7 | */ | ||
8 | |||
9 | #include <linux/gfp.h> | ||
10 | |||
11 | /** | ||
12 | * struct ptr_heap - simple static-sized priority heap | ||
13 | * @ptrs - pointer to data area | ||
14 | * @max - max number of elements that can be stored in @ptrs | ||
15 | * @size - current number of valid elements in @ptrs (in the range 0..@size-1 | ||
16 | * @gt: comparison operator, which should implement "greater than" | ||
17 | */ | ||
18 | struct ptr_heap { | ||
19 | void **ptrs; | ||
20 | int max; | ||
21 | int size; | ||
22 | int (*gt)(void *, void *); | ||
23 | }; | ||
24 | |||
25 | /** | ||
26 | * heap_init - initialize an empty heap with a given memory size | ||
27 | * @heap: the heap structure to be initialized | ||
28 | * @size: amount of memory to use in bytes | ||
29 | * @gfp_mask: mask to pass to kmalloc() | ||
30 | * @gt: comparison operator, which should implement "greater than" | ||
31 | */ | ||
32 | extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, | ||
33 | int (*gt)(void *, void *)); | ||
34 | |||
35 | /** | ||
36 | * heap_free - release a heap's storage | ||
37 | * @heap: the heap structure whose data should be released | ||
38 | */ | ||
39 | void heap_free(struct ptr_heap *heap); | ||
40 | |||
41 | /** | ||
42 | * heap_insert - insert a value into the heap and return any overflowed value | ||
43 | * @heap: the heap to be operated on | ||
44 | * @p: the pointer to be inserted | ||
45 | * | ||
46 | * Attempts to insert the given value into the priority heap. If the | ||
47 | * heap is full prior to the insertion, then the resulting heap will | ||
48 | * consist of the smallest @max elements of the original heap and the | ||
49 | * new element; the greatest element will be removed from the heap and | ||
50 | * returned. Note that the returned element will be the new element | ||
51 | * (i.e. no change to the heap) if the new element is greater than all | ||
52 | * elements currently in the heap. | ||
53 | */ | ||
54 | extern void *heap_insert(struct ptr_heap *heap, void *p); | ||
55 | |||
56 | |||
57 | |||
58 | #endif /* _LINUX_PRIO_HEAP_H */ | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64ad59cfad9b..fa31cb9f9898 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
41 | #include <linux/prio_heap.h> | ||
41 | #include <linux/proc_fs.h> | 42 | #include <linux/proc_fs.h> |
42 | #include <linux/rcupdate.h> | 43 | #include <linux/rcupdate.h> |
43 | #include <linux/sched.h> | 44 | #include <linux/sched.h> |
@@ -701,6 +702,36 @@ done: | |||
701 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 702 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ |
702 | } | 703 | } |
703 | 704 | ||
705 | static inline int started_after_time(struct task_struct *t1, | ||
706 | struct timespec *time, | ||
707 | struct task_struct *t2) | ||
708 | { | ||
709 | int start_diff = timespec_compare(&t1->start_time, time); | ||
710 | if (start_diff > 0) { | ||
711 | return 1; | ||
712 | } else if (start_diff < 0) { | ||
713 | return 0; | ||
714 | } else { | ||
715 | /* | ||
716 | * Arbitrarily, if two processes started at the same | ||
717 | * time, we'll say that the lower pointer value | ||
718 | * started first. Note that t2 may have exited by now | ||
719 | * so this may not be a valid pointer any longer, but | ||
720 | * that's fine - it still serves to distinguish | ||
721 | * between two tasks started (effectively) | ||
722 | * simultaneously. | ||
723 | */ | ||
724 | return t1 > t2; | ||
725 | } | ||
726 | } | ||
727 | |||
728 | static inline int started_after(void *p1, void *p2) | ||
729 | { | ||
730 | struct task_struct *t1 = p1; | ||
731 | struct task_struct *t2 = p2; | ||
732 | return started_after_time(t1, &t2->start_time, t2); | ||
733 | } | ||
734 | |||
704 | /* | 735 | /* |
705 | * Call with manage_mutex held. May take callback_mutex during call. | 736 | * Call with manage_mutex held. May take callback_mutex during call. |
706 | */ | 737 | */ |
@@ -708,8 +739,15 @@ done: | |||
708 | static int update_cpumask(struct cpuset *cs, char *buf) | 739 | static int update_cpumask(struct cpuset *cs, char *buf) |
709 | { | 740 | { |
710 | struct cpuset trialcs; | 741 | struct cpuset trialcs; |
711 | int retval; | 742 | int retval, i; |
712 | int cpus_changed, is_load_balanced; | 743 | int is_load_balanced; |
744 | struct cgroup_iter it; | ||
745 | struct cgroup *cgrp = cs->css.cgroup; | ||
746 | struct task_struct *p, *dropped; | ||
747 | /* Never dereference latest_task, since it's not refcounted */ | ||
748 | struct task_struct *latest_task = NULL; | ||
749 | struct ptr_heap heap; | ||
750 | struct timespec latest_time = { 0, 0 }; | ||
713 | 751 | ||
714 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 752 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ |
715 | if (cs == &top_cpuset) | 753 | if (cs == &top_cpuset) |
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
736 | if (retval < 0) | 774 | if (retval < 0) |
737 | return retval; | 775 | return retval; |
738 | 776 | ||
739 | cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 777 | /* Nothing to do if the cpus didn't change */ |
778 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | ||
779 | return 0; | ||
780 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); | ||
781 | if (retval) | ||
782 | return retval; | ||
783 | |||
740 | is_load_balanced = is_sched_load_balance(&trialcs); | 784 | is_load_balanced = is_sched_load_balance(&trialcs); |
741 | 785 | ||
742 | mutex_lock(&callback_mutex); | 786 | mutex_lock(&callback_mutex); |
743 | cs->cpus_allowed = trialcs.cpus_allowed; | 787 | cs->cpus_allowed = trialcs.cpus_allowed; |
744 | mutex_unlock(&callback_mutex); | 788 | mutex_unlock(&callback_mutex); |
745 | 789 | ||
746 | if (cpus_changed && is_load_balanced) | 790 | again: |
791 | /* | ||
792 | * Scan tasks in the cpuset, and update the cpumasks of any | ||
793 | * that need an update. Since we can't call set_cpus_allowed() | ||
794 | * while holding tasklist_lock, gather tasks to be processed | ||
795 | * in a heap structure. If the statically-sized heap fills up, | ||
796 | * overflow tasks that started later, and in future iterations | ||
797 | * only consider tasks that started after the latest task in | ||
798 | * the previous pass. This guarantees forward progress and | ||
799 | * that we don't miss any tasks | ||
800 | */ | ||
801 | heap.size = 0; | ||
802 | cgroup_iter_start(cgrp, &it); | ||
803 | while ((p = cgroup_iter_next(cgrp, &it))) { | ||
804 | /* Only affect tasks that don't have the right cpus_allowed */ | ||
805 | if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) | ||
806 | continue; | ||
807 | /* | ||
808 | * Only process tasks that started after the last task | ||
809 | * we processed | ||
810 | */ | ||
811 | if (!started_after_time(p, &latest_time, latest_task)) | ||
812 | continue; | ||
813 | dropped = heap_insert(&heap, p); | ||
814 | if (dropped == NULL) { | ||
815 | get_task_struct(p); | ||
816 | } else if (dropped != p) { | ||
817 | get_task_struct(p); | ||
818 | put_task_struct(dropped); | ||
819 | } | ||
820 | } | ||
821 | cgroup_iter_end(cgrp, &it); | ||
822 | if (heap.size) { | ||
823 | for (i = 0; i < heap.size; i++) { | ||
824 | struct task_struct *p = heap.ptrs[i]; | ||
825 | if (i == 0) { | ||
826 | latest_time = p->start_time; | ||
827 | latest_task = p; | ||
828 | } | ||
829 | set_cpus_allowed(p, cs->cpus_allowed); | ||
830 | put_task_struct(p); | ||
831 | } | ||
832 | /* | ||
833 | * If we had to process any tasks at all, scan again | ||
834 | * in case some of them were in the middle of forking | ||
835 | * children that didn't notice the new cpumask | ||
836 | * restriction. Not the most efficient way to do it, | ||
837 | * but it avoids having to take callback_mutex in the | ||
838 | * fork path | ||
839 | */ | ||
840 | goto again; | ||
841 | } | ||
842 | heap_free(&heap); | ||
843 | if (is_load_balanced) | ||
747 | rebuild_sched_domains(); | 844 | rebuild_sched_domains(); |
748 | 845 | ||
749 | return 0; | 846 | return 0; |
diff --git a/kernel/sched.c b/kernel/sched.c index 39d6354af489..72a809a54d5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4471 | 4471 | ||
4472 | cpus_allowed = cpuset_cpus_allowed(p); | 4472 | cpus_allowed = cpuset_cpus_allowed(p); |
4473 | cpus_and(new_mask, new_mask, cpus_allowed); | 4473 | cpus_and(new_mask, new_mask, cpus_allowed); |
4474 | again: | ||
4474 | retval = set_cpus_allowed(p, new_mask); | 4475 | retval = set_cpus_allowed(p, new_mask); |
4475 | 4476 | ||
4477 | if (!retval) { | ||
4478 | cpus_allowed = cpuset_cpus_allowed(p); | ||
4479 | if (!cpus_subset(new_mask, cpus_allowed)) { | ||
4480 | /* | ||
4481 | * We must have raced with a concurrent cpuset | ||
4482 | * update. Just reset the cpus_allowed to the | ||
4483 | * cpuset's cpus_allowed | ||
4484 | */ | ||
4485 | new_mask = cpus_allowed; | ||
4486 | goto again; | ||
4487 | } | ||
4488 | } | ||
4476 | out_unlock: | 4489 | out_unlock: |
4477 | put_task_struct(p); | 4490 | put_task_struct(p); |
4478 | mutex_unlock(&sched_hotcpu_mutex); | 4491 | mutex_unlock(&sched_hotcpu_mutex); |
diff --git a/lib/Makefile b/lib/Makefile index c5f215d509d3..3a0983b77412 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ | |||
6 | rbtree.o radix-tree.o dump_stack.o \ | 6 | rbtree.o radix-tree.o dump_stack.o \ |
7 | idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ | 7 | idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \ |
8 | sha1.o irq_regs.o reciprocal_div.o argv_split.o \ | 8 | sha1.o irq_regs.o reciprocal_div.o argv_split.o \ |
9 | proportions.o | 9 | proportions.o prio_heap.o |
10 | 10 | ||
11 | lib-$(CONFIG_MMU) += ioremap.o | 11 | lib-$(CONFIG_MMU) += ioremap.o |
12 | lib-$(CONFIG_SMP) += cpumask.o | 12 | lib-$(CONFIG_SMP) += cpumask.o |
diff --git a/lib/prio_heap.c b/lib/prio_heap.c new file mode 100644 index 000000000000..471944a54e23 --- /dev/null +++ b/lib/prio_heap.c | |||
@@ -0,0 +1,70 @@ | |||
1 | /* | ||
2 | * Simple insertion-only static-sized priority heap containing | ||
3 | * pointers, based on CLR, chapter 7 | ||
4 | */ | ||
5 | |||
6 | #include <linux/slab.h> | ||
7 | #include <linux/prio_heap.h> | ||
8 | |||
9 | int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask, | ||
10 | int (*gt)(void *, void *)) | ||
11 | { | ||
12 | heap->ptrs = kmalloc(size, gfp_mask); | ||
13 | if (!heap->ptrs) | ||
14 | return -ENOMEM; | ||
15 | heap->size = 0; | ||
16 | heap->max = size / sizeof(void *); | ||
17 | heap->gt = gt; | ||
18 | return 0; | ||
19 | } | ||
20 | |||
21 | void heap_free(struct ptr_heap *heap) | ||
22 | { | ||
23 | kfree(heap->ptrs); | ||
24 | } | ||
25 | |||
26 | void *heap_insert(struct ptr_heap *heap, void *p) | ||
27 | { | ||
28 | void *res; | ||
29 | void **ptrs = heap->ptrs; | ||
30 | int pos; | ||
31 | |||
32 | if (heap->size < heap->max) { | ||
33 | /* Heap insertion */ | ||
34 | int pos = heap->size++; | ||
35 | while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) { | ||
36 | ptrs[pos] = ptrs[(pos-1)/2]; | ||
37 | pos = (pos-1)/2; | ||
38 | } | ||
39 | ptrs[pos] = p; | ||
40 | return NULL; | ||
41 | } | ||
42 | |||
43 | /* The heap is full, so something will have to be dropped */ | ||
44 | |||
45 | /* If the new pointer is greater than the current max, drop it */ | ||
46 | if (heap->gt(p, ptrs[0])) | ||
47 | return p; | ||
48 | |||
49 | /* Replace the current max and heapify */ | ||
50 | res = ptrs[0]; | ||
51 | ptrs[0] = p; | ||
52 | pos = 0; | ||
53 | |||
54 | while (1) { | ||
55 | int left = 2 * pos + 1; | ||
56 | int right = 2 * pos + 2; | ||
57 | int largest = pos; | ||
58 | if (left < heap->size && heap->gt(ptrs[left], p)) | ||
59 | largest = left; | ||
60 | if (right < heap->size && heap->gt(ptrs[right], ptrs[largest])) | ||
61 | largest = right; | ||
62 | if (largest == pos) | ||
63 | break; | ||
64 | /* Push p down the heap one level and bump one up */ | ||
65 | ptrs[pos] = ptrs[largest]; | ||
66 | ptrs[largest] = p; | ||
67 | pos = largest; | ||
68 | } | ||
69 | return res; | ||
70 | } | ||