author	Paul Menage <menage@google.com>	2007-10-19 02:40:22 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:41 -0400
commit	8707d8b8c0cbdf4441507f8dded194167da896c7 (patch)
tree	1e9ac6b15027bd55263378e551c1595a937d66d6
parent	020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff)
Fix cpusets update_cpumask
Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call
  set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection
  of batches of tasks to be processed without duplicating or missing any
  tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed

- fix race between update_cpumask() and sched_setaffinity() by making
  sched_setaffinity() post-check that it's not running on any cpus outside
  cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/prio_heap.h	58
-rw-r--r--	kernel/cpuset.c	105
-rw-r--r--	kernel/sched.c	13
-rw-r--r--	lib/Makefile	2
-rw-r--r--	lib/prio_heap.c	70
5 files changed, 243 insertions, 5 deletions
diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h
new file mode 100644
index 000000000000..08094350f26a
--- /dev/null
+++ b/include/linux/prio_heap.h
@@ -0,0 +1,58 @@
+#ifndef _LINUX_PRIO_HEAP_H
+#define _LINUX_PRIO_HEAP_H
+
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/gfp.h>
+
+/**
+ * struct ptr_heap - simple static-sized priority heap
+ * @ptrs: pointer to data area
+ * @max: max number of elements that can be stored in @ptrs
+ * @size: current number of valid elements in @ptrs (in the range 0..@size-1)
+ * @gt: comparison operator, which should implement "greater than"
+ */
+struct ptr_heap {
+	void **ptrs;
+	int max;
+	int size;
+	int (*gt)(void *, void *);
+};
+
+/**
+ * heap_init - initialize an empty heap with a given memory size
+ * @heap: the heap structure to be initialized
+ * @size: amount of memory to use in bytes
+ * @gfp_mask: mask to pass to kmalloc()
+ * @gt: comparison operator, which should implement "greater than"
+ */
+extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+		     int (*gt)(void *, void *));
+
+/**
+ * heap_free - release a heap's storage
+ * @heap: the heap structure whose data should be released
+ */
+void heap_free(struct ptr_heap *heap);
+
+/**
+ * heap_insert - insert a value into the heap and return any overflowed value
+ * @heap: the heap to be operated on
+ * @p: the pointer to be inserted
+ *
+ * Attempts to insert the given value into the priority heap. If the
+ * heap is full prior to the insertion, then the resulting heap will
+ * consist of the smallest @max elements of the original heap and the
+ * new element; the greatest element will be removed from the heap and
+ * returned. Note that the returned element will be the new element
+ * (i.e. no change to the heap) if the new element is greater than all
+ * elements currently in the heap.
+ */
+extern void *heap_insert(struct ptr_heap *heap, void *p);
+
+
+
+#endif /* _LINUX_PRIO_HEAP_H */
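
The gt callback gives the heap its ordering: it must return nonzero exactly when its first argument should sort above its second. A minimal userspace sketch of that contract, with a hypothetical struct item payload standing in for the task pointers the cpuset code will store:

#include <stdio.h>

/* Hypothetical payload type; in the kernel the heap holds task pointers */
struct item {
	int key;
};

/* "Greater than" comparator matching the int (*gt)(void *, void *) slot */
static int item_gt(void *a, void *b)
{
	return ((struct item *)a)->key > ((struct item *)b)->key;
}

int main(void)
{
	struct item x = { .key = 3 }, y = { .key = 5 };

	printf("item_gt(x, y) = %d\n", item_gt(&x, &y));	/* prints 0 */
	printf("item_gt(y, x) = %d\n", item_gt(&y, &x));	/* prints 1 */
	return 0;
}

The cpuset patch below supplies started_after() in this slot, so the heap's root is always the latest-started task collected so far.
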
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively)
+		 * simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *t1 = p1;
+	struct task_struct *t2 = p2;
+	return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
  */
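
started_after_time() has to impose a total order on tasks even when two share a start time, hence the pointer-comparison tie-break. A userspace sketch of the same logic (a sketch only: struct task and the local timespec_compare() stand in for the kernel's task_struct and helper of the same name):

#include <assert.h>
#include <time.h>

struct task {
	struct timespec start_time;
};

/* Userspace stand-in for the kernel's timespec_compare() */
static int timespec_compare(const struct timespec *lhs,
			    const struct timespec *rhs)
{
	if (lhs->tv_sec < rhs->tv_sec)
		return -1;
	if (lhs->tv_sec > rhs->tv_sec)
		return 1;
	return (lhs->tv_nsec > rhs->tv_nsec) - (lhs->tv_nsec < rhs->tv_nsec);
}

static int started_after_time(struct task *t1, struct timespec *time,
			      struct task *t2)
{
	int start_diff = timespec_compare(&t1->start_time, time);
	if (start_diff > 0)
		return 1;
	if (start_diff < 0)
		return 0;
	/* Equal start times: fall back to pointer order for a total order */
	return t1 > t2;
}

int main(void)
{
	struct task a = { { 100, 0 } }, b = { { 100, 0 } }, c = { { 101, 0 } };

	assert(started_after_time(&c, &a.start_time, &a)); /* later time wins */
	/* a and b started simultaneously: exactly one is "after" the other */
	assert(started_after_time(&a, &b.start_time, &b) !=
	       started_after_time(&b, &a.start_time, &a));
	return 0;
}
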
@@ -708,8 +739,15 @@ done:
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
-	int cpus_changed, is_load_balanced;
+	int retval, i;
+	int is_load_balanced;
+	struct cgroup_iter it;
+	struct cgroup *cgrp = cs->css.cgroup;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap heap;
+	struct timespec latest_time = { 0, 0 };
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 
-	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	/* Nothing to do if the cpus didn't change */
+	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+		return 0;
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_changed && is_load_balanced)
+again:
+	/*
+	 * Scan tasks in the cpuset, and update the cpumasks of any
+	 * that need an update. Since we can't call set_cpus_allowed()
+	 * while holding tasklist_lock, gather tasks to be processed
+	 * in a heap structure. If the statically-sized heap fills up,
+	 * overflow tasks that started later, and in future iterations
+	 * only consider tasks that started after the latest task in
+	 * the previous pass. This guarantees forward progress and
+	 * that we don't miss any tasks
+	 */
+	heap.size = 0;
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		/* Only affect tasks that don't have the right cpus_allowed */
+		if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(&heap, p);
+		if (dropped == NULL) {
+			get_task_struct(p);
+		} else if (dropped != p) {
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+	}
+	cgroup_iter_end(cgrp, &it);
+	if (heap.size) {
+		for (i = 0; i < heap.size; i++) {
+			struct task_struct *p = heap.ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			set_cpus_allowed(p, cs->cpus_allowed);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't notice the new cpumask
+		 * restriction. Not the most efficient way to do it,
+		 * but it avoids having to take callback_mutex in the
+		 * fork path
+		 */
+		goto again;
+	}
+	heap_free(&heap);
+	if (is_load_balanced)
 		rebuild_sched_domains();
 
 	return 0;
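
The batching scheme above is what lets update_cpumask() drop tasklist_lock between passes without missing or double-updating tasks: each pass keeps only the earliest-started unseen tasks, and the next pass resumes strictly after the latest one processed. A userspace model of that invariant (a sketch only: integers stand in for tasks, their values for start times, and a sorted array of size BATCH for the PAGE_SIZE-limited ptr_heap):

#include <assert.h>
#include <stdlib.h>

#define NTASKS 100
#define BATCH  8	/* models the PAGE_SIZE-limited heap capacity */

static int cmp_int(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

int main(void)
{
	int start_time[NTASKS], processed[NTASKS] = { 0 };
	int latest = -1;	/* models latest_time/latest_task */
	int i;

	for (i = 0; i < NTASKS; i++)
		start_time[i] = i;	/* unique "start times" */

	for (;;) {		/* models the "again:" rescan loop */
		int batch[BATCH], n = 0;

		/*
		 * Gather up to BATCH unprocessed tasks newer than 'latest',
		 * keeping the BATCH oldest of them, which is what the
		 * max-heap's overflow rule achieves in the kernel.
		 */
		for (i = 0; i < NTASKS; i++) {
			if (start_time[i] <= latest)
				continue;
			if (n < BATCH) {
				batch[n++] = start_time[i];
				qsort(batch, n, sizeof(int), cmp_int);
			} else if (start_time[i] < batch[BATCH - 1]) {
				batch[BATCH - 1] = start_time[i];
				qsort(batch, n, sizeof(int), cmp_int);
			}
		}
		if (n == 0)
			break;
		/* "Process" the batch; remember the newest task seen */
		for (i = 0; i < n; i++)
			processed[batch[i]]++;
		latest = batch[n - 1];
	}

	for (i = 0; i < NTASKS; i++)
		assert(processed[i] == 1);	/* each task exactly once */
	return 0;
}

The final assertion is the forward-progress guarantee from the comment in the patch: every task is processed exactly once, however small the batch.
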
diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
 	cpus_allowed = cpuset_cpus_allowed(p);
 	cpus_and(new_mask, new_mask, cpus_allowed);
+again:
 	retval = set_cpus_allowed(p, new_mask);
 
+	if (!retval) {
+		cpus_allowed = cpuset_cpus_allowed(p);
+		if (!cpus_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			new_mask = cpus_allowed;
+			goto again;
+		}
+	}
 out_unlock:
 	put_task_struct(p);
 	mutex_unlock(&sched_hotcpu_mutex);
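
The sched_setaffinity() side of the race fix is an optimistic update: apply the new mask, then re-read the cpuset's mask and retry if a concurrent "cpus" write has made the new mask invalid. A rough userspace model of the pattern using pthreads (an unsigned long bitmask stands in for cpumask_t; this only sketches the retry shape, not the kernel's locking):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long cpuset_allowed = 0xff;	/* the cpuset's mask */
static unsigned long task_allowed;		/* the task's mask */

static unsigned long get_cpuset_allowed(void)
{
	unsigned long m;
	pthread_mutex_lock(&lock);
	m = cpuset_allowed;
	pthread_mutex_unlock(&lock);
	return m;
}

/* Models sched_setaffinity(): apply the mask, then post-check it */
static void set_affinity(unsigned long new_mask)
{
	unsigned long allowed = get_cpuset_allowed();
	new_mask &= allowed;			/* models cpus_and() */
again:
	task_allowed = new_mask;		/* models set_cpus_allowed() */
	allowed = get_cpuset_allowed();
	if (new_mask & ~allowed) {		/* models !cpus_subset() */
		/* Raced with a cpuset update; fall back to its mask */
		new_mask = allowed;
		goto again;
	}
}

static void *shrink_cpuset(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	cpuset_allowed = 0x0f;			/* concurrent "cpus" write */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, shrink_cpuset, NULL);
	set_affinity(0xf0);
	pthread_join(t, NULL);
	printf("task mask: %#lx, cpuset mask: %#lx\n",
	       task_allowed, get_cpuset_allowed());
	return 0;
}

On its own the post-check cannot close the window after its final read; in the kernel the race is closed from both sides, since update_cpumask() rescans member tasks as shown above.
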
diff --git a/lib/Makefile b/lib/Makefile
index c5f215d509d3..3a0983b77412 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o
+	 proportions.o prio_heap.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/prio_heap.c b/lib/prio_heap.c
new file mode 100644
index 000000000000..471944a54e23
--- /dev/null
+++ b/lib/prio_heap.c
@@ -0,0 +1,70 @@
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/slab.h>
+#include <linux/prio_heap.h>
+
+int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+	      int (*gt)(void *, void *))
+{
+	heap->ptrs = kmalloc(size, gfp_mask);
+	if (!heap->ptrs)
+		return -ENOMEM;
+	heap->size = 0;
+	heap->max = size / sizeof(void *);
+	heap->gt = gt;
+	return 0;
+}
+
+void heap_free(struct ptr_heap *heap)
+{
+	kfree(heap->ptrs);
+}
+
+void *heap_insert(struct ptr_heap *heap, void *p)
+{
+	void *res;
+	void **ptrs = heap->ptrs;
+	int pos;
+
+	if (heap->size < heap->max) {
+		/* Heap insertion */
+		pos = heap->size++;
+		while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
+			ptrs[pos] = ptrs[(pos-1)/2];
+			pos = (pos-1)/2;
+		}
+		ptrs[pos] = p;
+		return NULL;
+	}
+
+	/* The heap is full, so something will have to be dropped */
+
+	/* If the new pointer is greater than the current max, drop it */
+	if (heap->gt(p, ptrs[0]))
+		return p;
+
+	/* Replace the current max and heapify */
+	res = ptrs[0];
+	ptrs[0] = p;
+	pos = 0;
+
+	while (1) {
+		int left = 2 * pos + 1;
+		int right = 2 * pos + 2;
+		int largest = pos;
+		if (left < heap->size && heap->gt(ptrs[left], p))
+			largest = left;
+		if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
+			largest = right;
+		if (largest == pos)
+			break;
+		/* Push p down the heap one level and bump one up */
+		ptrs[pos] = ptrs[largest];
+		ptrs[largest] = p;
+		pos = largest;
+	}
+	return res;
+}
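
To see heap_insert()'s overflow rule concretely, the same algorithm compiles as userspace C; only kmalloc()/kfree() need swapping for malloc()/free(). A test harness (assumptions: a capacity of 8 pointers and int payloads, chosen only for the demo):

#include <assert.h>
#include <stdlib.h>

struct ptr_heap {
	void **ptrs;
	int max;
	int size;
	int (*gt)(void *, void *);
};

static int heap_init(struct ptr_heap *heap, size_t size,
		     int (*gt)(void *, void *))
{
	heap->ptrs = malloc(size);	/* kmalloc() in the kernel */
	if (!heap->ptrs)
		return -1;
	heap->size = 0;
	heap->max = size / sizeof(void *);
	heap->gt = gt;
	return 0;
}

/* Same algorithm as lib/prio_heap.c above, minus kernel headers */
static void *heap_insert(struct ptr_heap *heap, void *p)
{
	void *res;
	void **ptrs = heap->ptrs;
	int pos;

	if (heap->size < heap->max) {
		pos = heap->size++;
		while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
			ptrs[pos] = ptrs[(pos-1)/2];
			pos = (pos-1)/2;
		}
		ptrs[pos] = p;
		return NULL;
	}
	if (heap->gt(p, ptrs[0]))
		return p;	/* new element is the largest: drop it */
	res = ptrs[0];
	ptrs[0] = p;
	pos = 0;
	while (1) {
		int left = 2 * pos + 1, right = 2 * pos + 2, largest = pos;
		if (left < heap->size && heap->gt(ptrs[left], p))
			largest = left;
		if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
			largest = right;
		if (largest == pos)
			break;
		ptrs[pos] = ptrs[largest];
		ptrs[largest] = p;
		pos = largest;
	}
	return res;
}

static int int_gt(void *a, void *b)
{
	return *(int *)a > *(int *)b;
}

int main(void)
{
	int vals[16], i;
	struct ptr_heap heap;

	assert(heap_init(&heap, 8 * sizeof(void *), int_gt) == 0);
	for (i = 0; i < 16; i++)
		vals[i] = i;
	/* Insert 0..15 into a heap of capacity 8, in descending order */
	for (i = 15; i >= 0; i--) {
		void *dropped = heap_insert(&heap, &vals[i]);
		/* Once full, each smaller value evicts the current max */
		if (dropped)
			assert(*(int *)dropped >= 8);
	}
	/* Only the 8 smallest values (0..7) survive */
	for (i = 0; i < heap.size; i++)
		assert(*(int *)heap.ptrs[i] < 8);
	free(heap.ptrs);
	return 0;
}

After 16 insertions into a capacity-8 heap, exactly the 8 smallest values remain, which is the property update_cpumask() relies on to process the earliest-started tasks first.
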