author     Paul Menage <menage@google.com>                        2007-10-19 02:40:22 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-10-19 14:53:41 -0400
commit     8707d8b8c0cbdf4441507f8dded194167da896c7
tree       1e9ac6b15027bd55263378e551c1595a937d66d6
parent     020958b6272882c1a8bfbe5f3e0927f3845c2698
Fix cpusets update_cpumask
Cause writes to the cpuset "cpus" file to update cpus_allowed for member tasks:
- collect batches of tasks under tasklist_lock and then call
set_cpus_allowed() on them outside the lock (since this can sleep).
- add a simple generic priority heap type to allow efficient collection
of batches of tasks to be processed without duplicating or missing any
tasks in subsequent batches.
- make "cpus" file update a no-op if the mask hasn't changed
- fix race between update_cpumask() and sched_setaffinity() by making
sched_setaffinity() post-check that it's not running on any cpus outside
cpuset_cpus_allowed().
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 include/linux/prio_heap.h |  58
 kernel/cpuset.c           | 105
 kernel/sched.c            |  13
 lib/Makefile              |   2
 lib/prio_heap.c           |  70
 5 files changed, 243 insertions(+), 5 deletions(-)
diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h
new file mode 100644
index 000000000000..08094350f26a
--- /dev/null
+++ b/include/linux/prio_heap.h
@@ -0,0 +1,58 @@
+#ifndef _LINUX_PRIO_HEAP_H
+#define _LINUX_PRIO_HEAP_H
+
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/gfp.h>
+
+/**
+ * struct ptr_heap - simple static-sized priority heap
+ * @ptrs: pointer to data area
+ * @max: max number of elements that can be stored in @ptrs
+ * @size: current number of valid elements in @ptrs (in the range 0..@size-1)
+ * @gt: comparison operator, which should implement "greater than"
+ */
+struct ptr_heap {
+	void **ptrs;
+	int max;
+	int size;
+	int (*gt)(void *, void *);
+};
+
+/**
+ * heap_init - initialize an empty heap with a given memory size
+ * @heap: the heap structure to be initialized
+ * @size: amount of memory to use in bytes
+ * @gfp_mask: mask to pass to kmalloc()
+ * @gt: comparison operator, which should implement "greater than"
+ */
+extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+		     int (*gt)(void *, void *));
+
+/**
+ * heap_free - release a heap's storage
+ * @heap: the heap structure whose data should be released
+ */
+void heap_free(struct ptr_heap *heap);
+
+/**
+ * heap_insert - insert a value into the heap and return any overflowed value
+ * @heap: the heap to be operated on
+ * @p: the pointer to be inserted
+ *
+ * Attempts to insert the given value into the priority heap. If the
+ * heap is full prior to the insertion, then the resulting heap will
+ * consist of the smallest @max elements of the original heap and the
+ * new element; the greatest element will be removed from the heap and
+ * returned. Note that the returned element will be the new element
+ * (i.e. no change to the heap) if the new element is greater than all
+ * elements currently in the heap.
+ */
+extern void *heap_insert(struct ptr_heap *heap, void *p);
+
+
+
+#endif /* _LINUX_PRIO_HEAP_H */
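[Editorial note] The header above is the entire public API: three functions and an open struct. As a rough usage sketch (the caller, comparator, and data here are hypothetical, not code from this patch), a client that wants to retain the smallest elements of a stream in bounded memory might look like:

/*
 * Hypothetical usage sketch, not part of the patch: keep the
 * smallest values seen so far, relying on heap_insert() evicting
 * the greatest element once the heap is full.
 */
#include <linux/prio_heap.h>
#include <linux/slab.h>

static int int_gt(void *a, void *b)
{
	return *(int *)a > *(int *)b;
}

static int keep_smallest(int *vals, int n)
{
	struct ptr_heap heap;
	int i, err;

	/* PAGE_SIZE bytes gives room for PAGE_SIZE / sizeof(void *) slots */
	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &int_gt);
	if (err)
		return err;

	for (i = 0; i < n; i++) {
		/*
		 * Returns NULL while there is room; once full, returns
		 * the evicted greatest element, which may be &vals[i]
		 * itself if it is larger than everything retained.
		 */
		heap_insert(&heap, &vals[i]);
	}

	/* heap.ptrs[0] (the root) is the greatest retained element */
	heap_free(&heap);
	return 0;
}

The overflow contract is the point of the design: the heap bounds memory while telling the caller exactly what fell out, which is what the cpuset code below exploits.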
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively)
+		 * simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *t1 = p1;
+	struct task_struct *t2 = p2;
+	return started_after_time(t1, &t2->start_time, t2);
+}
+
 /*
  * Call with manage_mutex held. May take callback_mutex during call.
  */
@@ -708,8 +739,15 @@ done:
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
-	int retval;
-	int cpus_changed, is_load_balanced;
+	int retval, i;
+	int is_load_balanced;
+	struct cgroup_iter it;
+	struct cgroup *cgrp = cs->css.cgroup;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap heap;
+	struct timespec latest_time = { 0, 0 };
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 
-	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	/* Nothing to do if the cpus didn't change */
+	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+		return 0;
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_changed && is_load_balanced)
+ again:
+	/*
+	 * Scan tasks in the cpuset, and update the cpumasks of any
+	 * that need an update. Since we can't call set_cpus_allowed()
+	 * while holding tasklist_lock, gather tasks to be processed
+	 * in a heap structure. If the statically-sized heap fills up,
+	 * overflow tasks that started later, and in future iterations
+	 * only consider tasks that started after the latest task in
+	 * the previous pass. This guarantees forward progress and
+	 * that we don't miss any tasks
+	 */
+	heap.size = 0;
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		/* Only affect tasks that don't have the right cpus_allowed */
+		if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(&heap, p);
+		if (dropped == NULL) {
+			get_task_struct(p);
+		} else if (dropped != p) {
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+	}
+	cgroup_iter_end(cgrp, &it);
+	if (heap.size) {
+		for (i = 0; i < heap.size; i++) {
+			struct task_struct *p = heap.ptrs[i];
+			if (i == 0) {
+				latest_time = p->start_time;
+				latest_task = p;
+			}
+			set_cpus_allowed(p, cs->cpus_allowed);
+			put_task_struct(p);
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't notice the new cpumask
+		 * restriction. Not the most efficient way to do it,
+		 * but it avoids having to take callback_mutex in the
+		 * fork path
+		 */
+		goto again;
+	}
+	heap_free(&heap);
+	if (is_load_balanced)
 		rebuild_sched_domains();
 
 	return 0;
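[Editorial note] The subtle point in update_cpumask() above is forward progress: each pass retains only the earliest-started candidates (the heap evicts later starters), then resumes strictly after the latest task it handled, so no task is missed and none is processed twice. A minimal userspace sketch of that invariant, with plain integers standing in for start_time, a linear scan standing in for the heap, and all names hypothetical (the kernel additionally breaks start-time ties by pointer comparison, which this sketch sidesteps by using distinct keys):

#include <stdio.h>

#define BATCH 3			/* stands in for heap->max */

struct task {
	int start_time;
	int updated;
};

/* Index of the latest-started task in the batch (the "heap root") */
static int latest_idx(struct task **batch, int count)
{
	int i, max = 0;

	for (i = 1; i < count; i++)
		if (batch[i]->start_time > batch[max]->start_time)
			max = i;
	return max;
}

static void update_all(struct task *t, int n)
{
	int latest = -1;	/* start_time of the last task handled */

	for (;;) {
		struct task *batch[BATCH];
		int count = 0, i, max;

		/*
		 * Collect the BATCH earliest-started tasks not yet
		 * handled: keep the smallest start_times and evict
		 * later starters, as heap_insert() does when full.
		 */
		for (i = 0; i < n; i++) {
			if (t[i].updated || t[i].start_time <= latest)
				continue;
			if (count < BATCH) {
				batch[count++] = &t[i];
				continue;
			}
			max = latest_idx(batch, count);
			if (t[i].start_time < batch[max]->start_time)
				batch[max] = &t[i];
		}
		if (count == 0)
			break;	/* every task has been handled */

		/* Process the batch; resume after its latest starter */
		latest = batch[latest_idx(batch, count)]->start_time;
		for (i = 0; i < count; i++)
			batch[i]->updated = 1;
	}
}

int main(void)
{
	struct task t[] = {
		{5, 0}, {1, 0}, {9, 0}, {3, 0}, {7, 0}, {2, 0}, {8, 0},
	};
	int i, n = sizeof(t) / sizeof(t[0]);

	update_all(t, n);
	for (i = 0; i < n; i++)
		printf("start_time %d updated %d\n",
		       t[i].start_time, t[i].updated);
	return 0;
}

Every pass handles a strictly later slice of the start-time order, so the loop terminates even though new tasks may keep appearing between passes.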
diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 
 	cpus_allowed = cpuset_cpus_allowed(p);
 	cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
 	retval = set_cpus_allowed(p, new_mask);
 
+	if (!retval) {
+		cpus_allowed = cpuset_cpus_allowed(p);
+		if (!cpus_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			new_mask = cpus_allowed;
+			goto again;
+		}
+	}
 out_unlock:
 	put_task_struct(p);
 	mutex_unlock(&sched_hotcpu_mutex);
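[Editorial note] The change above closes the race by post-checking: after set_cpus_allowed() succeeds, re-read the cpuset's mask, and if a concurrent update_cpumask() has invalidated new_mask, fall back to the cpuset's mask and retry. A self-contained userspace sketch of that pattern (all names hypothetical; a staged read stands in for the concurrent cpuset update):

#include <stdio.h>

static unsigned cpuset_allowed = 0x0f;	/* updated concurrently in reality */
static unsigned task_mask;

static unsigned read_cpuset_allowed(void)
{
	/* Simulate one concurrent cpuset shrink on the second read */
	static int reads;

	if (++reads == 2)
		cpuset_allowed = 0x03;
	return cpuset_allowed;
}

static void set_affinity(unsigned new_mask)
{
	unsigned allowed;

	new_mask &= read_cpuset_allowed();
	for (;;) {
		task_mask = new_mask;		/* "set_cpus_allowed()" */
		allowed = read_cpuset_allowed();
		if ((new_mask & ~allowed) == 0)
			break;			/* still inside the cpuset */
		/* Raced with a cpuset update: reset to its mask, retry */
		new_mask = allowed;
	}
}

int main(void)
{
	set_affinity(0x0c);		/* wants CPUs 2-3 */
	printf("effective mask: 0x%x\n", task_mask);	/* 0x3 after the race */
	return 0;
}

Each retry adopts the cpuset's own mask, which is always a valid subset of itself, so the loop only repeats while updates keep arriving.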
diff --git a/lib/Makefile b/lib/Makefile
index c5f215d509d3..3a0983b77412 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o
+	 proportions.o prio_heap.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/prio_heap.c b/lib/prio_heap.c
new file mode 100644
index 000000000000..471944a54e23
--- /dev/null
+++ b/lib/prio_heap.c
@@ -0,0 +1,70 @@
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+
+#include <linux/slab.h>
+#include <linux/prio_heap.h>
+
+int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+	      int (*gt)(void *, void *))
+{
+	heap->ptrs = kmalloc(size, gfp_mask);
+	if (!heap->ptrs)
+		return -ENOMEM;
+	heap->size = 0;
+	heap->max = size / sizeof(void *);
+	heap->gt = gt;
+	return 0;
+}
+
+void heap_free(struct ptr_heap *heap)
+{
+	kfree(heap->ptrs);
+}
+
+void *heap_insert(struct ptr_heap *heap, void *p)
+{
+	void *res;
+	void **ptrs = heap->ptrs;
+	int pos;
+
+	if (heap->size < heap->max) {
+		/* Heap insertion */
+		int pos = heap->size++;
+		while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
+			ptrs[pos] = ptrs[(pos-1)/2];
+			pos = (pos-1)/2;
+		}
+		ptrs[pos] = p;
+		return NULL;
+	}
+
+	/* The heap is full, so something will have to be dropped */
+
+	/* If the new pointer is greater than the current max, drop it */
+	if (heap->gt(p, ptrs[0]))
+		return p;
+
+	/* Replace the current max and heapify */
+	res = ptrs[0];
+	ptrs[0] = p;
+	pos = 0;
+
+	while (1) {
+		int left = 2 * pos + 1;
+		int right = 2 * pos + 2;
+		int largest = pos;
+		if (left < heap->size && heap->gt(ptrs[left], p))
+			largest = left;
+		if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
+			largest = right;
+		if (largest == pos)
+			break;
+		/* Push p down the heap one level and bump one up */
+		ptrs[pos] = ptrs[largest];
+		ptrs[largest] = p;
+		pos = largest;
+	}
+	return res;
+}
