commit    a4ac01c36e286dd1b9a1d5cd7422c5af51dc55f8
tree      0c275d58a4835a3d604d9cac4e1dd7c25714e150
parent    aea25401c3347d9f3a64ebdc81043be246a9f631
author    Peter Williams <pwil3058@bigpond.net.au>  2007-08-09 05:16:46 -0400
committer Ingo Molnar <mingo@elte.hu>  2007-08-09 05:16:46 -0400
sched: fix bug in balance_tasks()
There are two problems with balance_tasks() and how it is used:
1. The variables best_prio and best_prio_seen (inherited from the old
move_tasks()) were only required to handle problems caused by the
active/expired arrays, the order in which they were processed and the
possibility that the task with the highest priority could be on either.
These issues are no longer present and the extra overhead associated
with their use is unnecessary (and possibly wrong).
2. When CONFIG_FAIR_GROUP_SCHED is not set, the same this_best_prio variable
needs to be used by all scheduling classes or there is a risk of moving too
much load. E.g. if the highest priority task on this_rq at the beginning is a
fairly low priority task and the rt class migrates a task during its turn,
then that moved task becomes the new highest priority task on this_rq. But
when the sched_fair class initializes its copy of this_best_prio, it will get
the priority of the original highest priority task because, with the run
queue locks held, the reschedule triggered by pull_task() will not yet have
taken place. This could result in skip_for_load being inappropriately
overridden and excessive load being moved (see the sketch below).
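As an illustration of the fix for the second problem, here is a minimal
user-space sketch (rt_turn() and fair_turn() are hypothetical stand-ins, not
kernel code) of threading one this_best_prio through each class's turn by
pointer, so a pull by the rt class is immediately visible to the fair class
within the same balancing pass:

#include <stdio.h>

/* Hypothetical stand-ins for the scheduling classes' load_balance() hooks.
 * Lower numeric value means higher priority, as in the kernel. */
static void rt_turn(int *this_best_prio)
{
	/* The rt class pulls a prio-10 task and records the new best prio. */
	if (10 < *this_best_prio)
		*this_best_prio = 10;
}

static void fair_turn(int *this_best_prio)
{
	/* Because the value is shared by reference, the fair class starts
	 * from 10 here and will not wrongly override skip_for_load. */
	printf("fair class starts with best prio %d\n", *this_best_prio);
}

int main(void)
{
	int this_best_prio = 120;	/* prio of this_rq->curr before the pass */

	rt_turn(&this_best_prio);
	fair_turn(&this_best_prio);	/* prints 10, not a stale 120 */
	return 0;
}

With a per-class copy, fair_turn() would have started from the stale value
120 and treated too many tasks as movable.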
The attached patch addresses these problems by deleting all references to
best_prio and best_prio_seen and by making this_best_prio a reference
parameter to the various functions involved.
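Condensed from the kernel/sched.c hunks below (surrounding declarations and
the actual pull omitted), the heart of the change in balance_tasks() is the
simplified skip test against the shared value:

	/* A task is now skipped only when moving it would overshoot the
	 * remaining load target and it is no better than the best priority
	 * already seen on this_rq during this balancing pass. */
	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
						SCHED_LOAD_SCALE_FUZZ;
	if ((skip_for_load && p->prio >= *this_best_prio) ||
	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
		p = iterator->next(iterator->arg);
		goto next;
	}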
load_balance_fair() has also been modified so that this_best_prio is
only reset (in the loop) if CONFIG_FAIR_GROUP_SCHED is set. This should
preserve the effect of helping spread groups' higher priority tasks
around the available CPUs while improving system performance when
CONFIG_FAIR_GROUP_SCHED isn't set.
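Condensed from the kernel/sched_fair.c hunks below (the per-group imbalance
arithmetic and the moved-load accounting are elided), the per-group structure
of load_balance_fair() becomes:

	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
		/* Group scheduling: reset *this_best_prio per group so each
		 * group's higher priority tasks still get spread around. */
		/* ... compute maxload from this group's imbalance ... */
		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
		/* No group scheduling: leave the shared value untouched. */
#define maxload rem_load_move
#endif
		cfs_rq_iterator.arg = busy_cfs_rq;
		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
				max_nr_move, maxload, sd, idle, all_pinned,
				&load_moved, this_best_prio, &cfs_rq_iterator);
		/* ... accounting and early exit once enough load is moved ... */
	}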
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
 include/linux/sched.h   |  2
 kernel/sched.c          | 26
 kernel/sched_fair.c     | 32
 kernel/sched_idletask.c |  2
 kernel/sched_rt.c       | 19
 5 files changed, 27 insertions(+), 54 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 24bce423f10d..513b81c60e87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,7 +870,7 @@ struct sched_class {
 			struct rq *busiest,
 			unsigned long max_nr_move, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *all_pinned);
+			int *all_pinned, int *this_best_prio);
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 85b93118d244..1fa07c14624e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -745,8 +745,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned, unsigned long *load_moved,
-		      int this_best_prio, int best_prio, int best_prio_seen,
-		      struct rq_iterator *iterator);
+		      int *this_best_prio, struct rq_iterator *iterator);
 
 #include "sched_stats.h"
 #include "sched_rt.c"
@@ -2165,8 +2164,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned, unsigned long *load_moved,
-		      int this_best_prio, int best_prio, int best_prio_seen,
-		      struct rq_iterator *iterator)
+		      int *this_best_prio, struct rq_iterator *iterator)
 {
 	int pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
@@ -2191,12 +2189,8 @@ next:
 	 */
 	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
 							 SCHED_LOAD_SCALE_FUZZ;
-	if (skip_for_load && p->prio < this_best_prio)
-		skip_for_load = !best_prio_seen && p->prio == best_prio;
-	if (skip_for_load ||
+	if ((skip_for_load && p->prio >= *this_best_prio) ||
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-
-		best_prio_seen |= p->prio == best_prio;
 		p = iterator->next(iterator->arg);
 		goto next;
 	}
@@ -2210,8 +2204,8 @@ next:
 	 * and the prescribed amount of weighted load.
 	 */
 	if (pulled < max_nr_move && rem_load_move > 0) {
-		if (p->prio < this_best_prio)
-			this_best_prio = p->prio;
+		if (p->prio < *this_best_prio)
+			*this_best_prio = p->prio;
 		p = iterator->next(iterator->arg);
 		goto next;
 	}
@@ -2243,12 +2237,13 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
 	struct sched_class *class = sched_class_highest;
 	unsigned long total_load_moved = 0;
+	int this_best_prio = this_rq->curr->prio;
 
 	do {
 		total_load_moved +=
 			class->load_balance(this_rq, this_cpu, busiest,
 				ULONG_MAX, max_load_move - total_load_moved,
-				sd, idle, all_pinned);
+				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 	} while (class && max_load_move > total_load_moved);
 
@@ -2266,10 +2261,12 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  struct sched_domain *sd, enum cpu_idle_type idle)
 {
 	struct sched_class *class;
+	int this_best_prio = MAX_PRIO;
 
 	for (class = sched_class_highest; class; class = class->next)
 		if (class->load_balance(this_rq, this_cpu, busiest,
-					1, ULONG_MAX, sd, idle, NULL))
+					1, ULONG_MAX, sd, idle, NULL,
+					&this_best_prio))
 			return 1;
 
 	return 0;
@@ -3184,8 +3181,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned, unsigned long *load_moved,
-		      int this_best_prio, int best_prio, int best_prio_seen,
-		      struct rq_iterator *iterator)
+		      int *this_best_prio, struct rq_iterator *iterator)
 {
 	*load_moved = 0;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 16511e9e5528..923bed0b0c42 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -929,6 +929,7 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr;
@@ -942,12 +943,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 
 	return p->prio;
 }
+#endif
 
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_nr_move, unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned)
+		  int *all_pinned, int *this_best_prio)
 {
 	struct cfs_rq *busy_cfs_rq;
 	unsigned long load_moved, total_nr_moved = 0, nr_moved;
@@ -958,10 +960,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 		struct cfs_rq *this_cfs_rq;
 		long imbalance;
 		unsigned long maxload;
-		int this_best_prio, best_prio, best_prio_seen = 0;
 
 		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
@@ -975,27 +977,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		imbalance /= 2;
 		maxload = min(rem_load_move, imbalance);
 
-		this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-		best_prio = cfs_rq_best_prio(busy_cfs_rq);
-
-		/*
-		 * Enable handling of the case where there is more than one task
-		 * with the best priority. If the current running task is one
-		 * of those with prio==best_prio we know it won't be moved
-		 * and therefore it's safe to override the skip (based on load)
-		 * of any task we find with that prio.
-		 */
-		if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
-			best_prio_seen = 1;
-
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+#define maxload rem_load_move
+#endif
 		/* pass busy_cfs_rq argument into
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
 		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
 				max_nr_move, maxload, sd, idle, all_pinned,
-				&load_moved, this_best_prio, best_prio,
-				best_prio_seen, &cfs_rq_iterator);
+				&load_moved, this_best_prio, &cfs_rq_iterator);
 
 		total_nr_moved += nr_moved;
 		max_nr_move -= nr_moved;
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 1d8d9e13d950..dc9e1068911f 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -41,7 +41,7 @@ static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_nr_move, unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned)
+		  int *all_pinned, int *this_best_prio)
 {
 	return 0;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2b0626a43cb8..5b559e8c8aa6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -176,26 +176,12 @@ static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_nr_move, unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
-		int *all_pinned)
+		int *all_pinned, int *this_best_prio)
 {
-	int this_best_prio, best_prio, best_prio_seen = 0;
 	int nr_moved;
 	struct rq_iterator rt_rq_iterator;
 	unsigned long load_moved;
 
-	best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
-	this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
-
-	/*
-	 * Enable handling of the case where there is more than one task
-	 * with the best priority. If the current running task is one
-	 * of those with prio==best_prio we know it won't be moved
-	 * and therefore it's safe to override the skip (based on load)
-	 * of any task we find with that prio.
-	 */
-	if (busiest->curr->prio == best_prio)
-		best_prio_seen = 1;
-
 	rt_rq_iterator.start = load_balance_start_rt;
 	rt_rq_iterator.next = load_balance_next_rt;
 	/* pass 'busiest' rq argument into
@@ -205,8 +191,7 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
 			max_load_move, sd, idle, all_pinned, &load_moved,
-			this_best_prio, best_prio, best_prio_seen,
-			&rt_rq_iterator);
+			this_best_prio, &rt_rq_iterator);
 
 	return load_moved;
 }