author	Peter Williams <pwil3058@bigpond.net.au>	2007-08-09 05:16:46 -0400
committer	Ingo Molnar <mingo@elte.hu>	2007-08-09 05:16:46 -0400
commit	a4ac01c36e286dd1b9a1d5cd7422c5af51dc55f8 (patch)
tree	0c275d58a4835a3d604d9cac4e1dd7c25714e150
parent	aea25401c3347d9f3a64ebdc81043be246a9f631 (diff)
sched: fix bug in balance_tasks()
There are two problems with balance_tasks() and how it is used:

1. The variables best_prio and best_prio_seen (inherited from the old
move_tasks()) were only required to handle problems caused by the
active/expired arrays, the order in which they were processed and the
possibility that the task with the highest priority could be on either.
These issues are no longer present and the extra overhead associated
with their use is unnecessary (and possibly wrong).

2. In the absence of CONFIG_FAIR_GROUP_SCHED being set, the same
this_best_prio variable needs to be used by all scheduling classes or
there is a risk of moving too much load.  E.g. if the highest priority
task on this_rq at the beginning is a fairly low priority task and the
rt class migrates a task (during its turn) then that moved task becomes
the new highest priority task on this_rq, but when the sched_fair class
initializes its copy of this_best_prio it will get the priority of the
original highest priority task because, due to the run queue locks
being held, the reschedule triggered by pull_task() will not have taken
place.  This could result in inappropriate overriding of skip_for_load
and excessive load being moved.

The attached patch addresses these problems by deleting all reference
to best_prio and best_prio_seen and making this_best_prio a reference
parameter to the various functions involved.

load_balance_fair() has also been modified so that this_best_prio is
only reset (in the loop) if CONFIG_FAIR_GROUP_SCHED is set.  This
should preserve the effect of helping spread groups' higher priority
tasks around the available CPUs while improving system performance
when CONFIG_FAIR_GROUP_SCHED isn't set.

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
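[Editor's illustration: the second problem is easier to see outside the
kernel.  Below is a minimal, hypothetical user-space C sketch, not
kernel code; fake_class, pull_one and the priority values are invented,
and lower numeric prio means higher priority, as in the kernel.]

/*
 * Illustrative sketch only, not kernel code: fake_class, pull_one and
 * the priority values below are made up.  Lower prio = higher priority.
 */
#include <stdio.h>

struct fake_class {
	const char *name;
	int task_prio;		/* prio of the task this class would move */
};

/*
 * One balancing step.  A task whose prio is not better than the bar
 * (*this_best_prio) may be skipped for load reasons, mirroring the
 * "p->prio >= *this_best_prio" test in the patched balance_tasks().
 */
static void pull_one(struct fake_class *c, int *this_best_prio)
{
	if (c->task_prio >= *this_best_prio) {
		printf("%s: prio %d task may be skipped (bar is %d)\n",
		       c->name, c->task_prio, *this_best_prio);
		return;
	}
	*this_best_prio = c->task_prio;	/* a moved task raises the bar */
	printf("%s: moved prio %d task, bar is now %d\n",
	       c->name, c->task_prio, *this_best_prio);
}

int main(void)
{
	struct fake_class classes[] = {
		{ "rt",   10 },		/* rt class gets its turn first */
		{ "fair", 120 },	/* fair class balances afterwards */
	};
	int this_best_prio = 130;	/* this_rq->curr->prio at the start */
	size_t i;

	/*
	 * All classes share one variable by reference: the reschedule that
	 * would make the moved task this_rq->curr cannot happen while the
	 * run queue locks are held, so re-reading the run queue between
	 * classes would see stale data.
	 */
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		pull_one(&classes[i], &this_best_prio);

	return 0;
}

[With one shared bar, the fair class correctly treats its prio-120 task
as skippable because the rt class already moved a prio-10 task.  If
load_balance_fair() instead initialized a private copy, it would start
from the stale value 130, override skip_for_load, and move excessive
load, which is exactly what the reference parameter prevents.]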
-rw-r--r--	include/linux/sched.h	2
-rw-r--r--	kernel/sched.c	26
-rw-r--r--	kernel/sched_fair.c	32
-rw-r--r--	kernel/sched_idletask.c	2
-rw-r--r--	kernel/sched_rt.c	19
5 files changed, 27 insertions(+), 54 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 24bce423f10d..513b81c60e87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,7 +870,7 @@ struct sched_class {
 			struct rq *busiest,
 			unsigned long max_nr_move, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *all_pinned);
+			int *all_pinned, int *this_best_prio);
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 85b93118d244..1fa07c14624e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -745,8 +745,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned, unsigned long *load_moved,
-		      int this_best_prio, int best_prio, int best_prio_seen,
-		      struct rq_iterator *iterator);
+		      int *this_best_prio, struct rq_iterator *iterator);
 
 #include "sched_stats.h"
 #include "sched_rt.c"
@@ -2165,8 +2164,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned, unsigned long *load_moved,
-		      int this_best_prio, int best_prio, int best_prio_seen,
-		      struct rq_iterator *iterator)
+		      int *this_best_prio, struct rq_iterator *iterator)
 {
 	int pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
@@ -2191,12 +2189,8 @@ next:
 	 */
 	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
 						 SCHED_LOAD_SCALE_FUZZ;
-	if (skip_for_load && p->prio < this_best_prio)
-		skip_for_load = !best_prio_seen && p->prio == best_prio;
-	if (skip_for_load ||
+	if ((skip_for_load && p->prio >= *this_best_prio) ||
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-
-		best_prio_seen |= p->prio == best_prio;
 		p = iterator->next(iterator->arg);
 		goto next;
 	}
@@ -2210,8 +2204,8 @@ next:
 	 * and the prescribed amount of weighted load.
 	 */
 	if (pulled < max_nr_move && rem_load_move > 0) {
-		if (p->prio < this_best_prio)
-			this_best_prio = p->prio;
+		if (p->prio < *this_best_prio)
+			*this_best_prio = p->prio;
 		p = iterator->next(iterator->arg);
 		goto next;
 	}
@@ -2243,12 +2237,13 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
 	struct sched_class *class = sched_class_highest;
 	unsigned long total_load_moved = 0;
+	int this_best_prio = this_rq->curr->prio;
 
 	do {
 		total_load_moved +=
 			class->load_balance(this_rq, this_cpu, busiest,
 				ULONG_MAX, max_load_move - total_load_moved,
-				sd, idle, all_pinned);
+				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 	} while (class && max_load_move > total_load_moved);
 
@@ -2266,10 +2261,12 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 			 struct sched_domain *sd, enum cpu_idle_type idle)
 {
 	struct sched_class *class;
+	int this_best_prio = MAX_PRIO;
 
 	for (class = sched_class_highest; class; class = class->next)
 		if (class->load_balance(this_rq, this_cpu, busiest,
-					1, ULONG_MAX, sd, idle, NULL))
+					1, ULONG_MAX, sd, idle, NULL,
+					&this_best_prio))
 			return 1;
 
 	return 0;
@@ -3184,8 +3181,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
 		      int *all_pinned, unsigned long *load_moved,
-		      int this_best_prio, int best_prio, int best_prio_seen,
-		      struct rq_iterator *iterator)
+		      int *this_best_prio, struct rq_iterator *iterator)
 {
 	*load_moved = 0;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 16511e9e5528..923bed0b0c42 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -929,6 +929,7 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr;
@@ -942,12 +943,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 
 	return p->prio;
 }
+#endif
 
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_nr_move, unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned)
+		  int *all_pinned, int *this_best_prio)
 {
 	struct cfs_rq *busy_cfs_rq;
 	unsigned long load_moved, total_nr_moved = 0, nr_moved;
@@ -958,10 +960,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 		struct cfs_rq *this_cfs_rq;
 		long imbalance;
 		unsigned long maxload;
-		int this_best_prio, best_prio, best_prio_seen = 0;
 
 		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
@@ -975,27 +977,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		imbalance /= 2;
 		maxload = min(rem_load_move, imbalance);
 
-		this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-		best_prio = cfs_rq_best_prio(busy_cfs_rq);
-
-		/*
-		 * Enable handling of the case where there is more than one task
-		 * with the best priority. If the current running task is one
-		 * of those with prio==best_prio we know it won't be moved
-		 * and therefore it's safe to override the skip (based on load)
-		 * of any task we find with that prio.
-		 */
-		if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
-			best_prio_seen = 1;
-
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+#define maxload rem_load_move
+#endif
 		/* pass busy_cfs_rq argument into
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
 		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
 				max_nr_move, maxload, sd, idle, all_pinned,
-				&load_moved, this_best_prio, best_prio,
-				best_prio_seen, &cfs_rq_iterator);
+				&load_moved, this_best_prio, &cfs_rq_iterator);
 
 		total_nr_moved += nr_moved;
 		max_nr_move -= nr_moved;
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 1d8d9e13d950..dc9e1068911f 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -41,7 +41,7 @@ static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_nr_move, unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned)
+		  int *all_pinned, int *this_best_prio)
 {
 	return 0;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2b0626a43cb8..5b559e8c8aa6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -176,26 +176,12 @@ static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_nr_move, unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
-		int *all_pinned)
+		int *all_pinned, int *this_best_prio)
 {
-	int this_best_prio, best_prio, best_prio_seen = 0;
 	int nr_moved;
 	struct rq_iterator rt_rq_iterator;
 	unsigned long load_moved;
 
-	best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
-	this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
-
-	/*
-	 * Enable handling of the case where there is more than one task
-	 * with the best priority. If the current running task is one
-	 * of those with prio==best_prio we know it won't be moved
-	 * and therefore it's safe to override the skip (based on load)
-	 * of any task we find with that prio.
-	 */
-	if (busiest->curr->prio == best_prio)
-		best_prio_seen = 1;
-
 	rt_rq_iterator.start = load_balance_start_rt;
 	rt_rq_iterator.next = load_balance_next_rt;
 	/* pass 'busiest' rq argument into
@@ -205,8 +191,7 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
 			max_load_move, sd, idle, all_pinned, &load_moved,
-			this_best_prio, best_prio, best_prio_seen,
-			&rt_rq_iterator);
+			this_best_prio, &rt_rq_iterator);
 
 	return load_moved;
 }