author    Ingo Molnar <mingo@elte.hu>	2008-07-15 18:29:07 -0400
committer Ingo Molnar <mingo@elte.hu>	2008-07-15 18:29:07 -0400
commit    82638844d9a8581bbf33201cc209a14876eca167 (patch)
tree      961d7f9360194421a71aa644a9d0c176a960ce49 /kernel/sched_fair.c
parent    9982fbface82893e77d211fbabfbd229da6bdde6 (diff)
parent    63cf13b77ab785e87c867defa8545e6d4a989774 (diff)

Merge branch 'linus' into cpus4096

Conflicts:
	arch/x86/xen/smp.c
	kernel/sched_rt.c
	net/iucv/iucv.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>

Diffstat (limited to 'kernel/sched_fair.c'):

 kernel/sched_fair.c | 413 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 290 insertions(+), 123 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 74774bde5264..bb61fe26b62c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
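The updated comment halves the base value but keeps the boot-time CPU scaling: the effective default is 5 msec multiplied by (1 + ilog(ncpus)). A standalone sketch of that scaling, assuming ilog() means the integer base-2 logarithm (illustrative user-space C, not the kernel's tuning code):

#include <stdio.h>

/* user-space illustration only; assumes ilog() == integer log2 */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

int main(void)
{
	unsigned long base = 5000000UL;	/* new default: 5 msec in ns */
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 16; ncpus *= 2)
		printf("%2u cpus -> %lu ns\n", ncpus, base * (1 + ilog2_u(ncpus)));
	return 0;
}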
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				se->load.weight, &cfs_rq_of(se)->load);
+	}
+
+	return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
+
+	return delta;
+}
+
+/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
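Both new helpers iterate calc_delta_mine() up the entity hierarchy: calc_delta_weight() turns a period into this entity's wall-clock share (delta * w / rw), calc_delta_fair() converts wall-clock time into virtual time (delta * rw / w). calc_delta_mine() itself multiplies by a cached fixed-point inverse weight instead of dividing on every call. A simplified sketch of that fixed-point trick (single level, no inv_weight caching or overflow handling, made-up constants):

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT 32

/* sketch: delta * weight / lw_weight via a fixed-point inverse; the real
 * calc_delta_mine() caches inv_weight and guards against overflow */
static uint64_t calc_delta(uint64_t delta, uint32_t weight, uint32_t lw_weight)
{
	uint64_t inv_weight = ((uint64_t)1 << WMULT_SHIFT) / lw_weight;

	return (delta * weight * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	/* delta *= w / rw: a nice-0 task sharing the queue with a nice-0 peer */
	printf("slice : %llu ns\n",
	       (unsigned long long)calc_delta(20000000ULL, 1024, 2048));
	/* delta *= rw / w: that slice converted back to virtual time */
	printf("vdelta: %llu ns\n",
	       (unsigned long long)calc_delta(10000000ULL, 2048, 1024));
	return 0;
}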
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-
-		slice *= se->load.weight;
-		do_div(slice, cfs_rq->load.weight);
-	}
-
-
-	return slice;
+	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
-	unsigned long weight;
-	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	vslice = __sched_period(nr_running);
+	return __sched_period(nr_running);
+}
+
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
 
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
+		struct load_weight *se_lw = &se->load;
+		unsigned long rw = cfs_rq_of(se)->load.weight;
+
+#ifdef CONFIG_FAIR_SCHED_GROUP
+		struct cfs_rq *cfs_rq = se->my_q;
+		struct task_group *tg = NULL
+
+		if (cfs_rq)
+			tg = cfs_rq->tg;
+
+		if (tg && tg->shares < NICE_0_LOAD) {
+			/*
+			 * scale shares to what it would have been had
+			 * tg->weight been NICE_0_LOAD:
+			 *
+			 *   weight = 1024 * shares / tg->weight
+			 */
+			lw.weight *= se->load.weight;
+			lw.weight /= tg->shares;
+
+			lw.inv_weight = 0;
+
+			se_lw = &lw;
+			rw += lw.weight - se->load.weight;
+		} else
+#endif
 
-		weight = cfs_rq->load.weight;
-		if (!se->on_rq)
-			weight += se->load.weight;
+		if (se->load.weight < NICE_0_LOAD) {
+			se_lw = &lw;
+			rw += NICE_0_LOAD - se->load.weight;
+		}
 
-		vslice *= NICE_0_LOAD;
-		do_div(vslice, weight);
+		delta = calc_delta_mine(delta, rw, se_lw);
 	}
 
-	return vslice;
+	return delta;
 }
 
 /*
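With these two pieces, the identity in the rewritten comment holds: sched_slice() hands out s = p*w/rw of wall-clock time, and scaling that by rw/w returns the full period p, so sched_vslice_add() can simply return __sched_period(). A quick numeric check (1024 and 335 are the standard nice-0 and nice-5 weights; the 20 ms period is made up):

#include <stdio.h>

int main(void)
{
	unsigned long long period = 20000000ULL;	/* 20 ms, invented */
	unsigned long w[] = { 1024, 335 };		/* nice 0, nice 5 */
	unsigned long rw = w[0] + w[1];
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long long s = period * w[i] / rw;	/* p*w/rw */
		unsigned long long vs = s * rw / w[i];		/* s*rw/w */

		/* vs comes back to ~p for both weights (minus rounding) */
		printf("w=%4lu: slice=%8llu ns, vslice=%8llu ns\n", w[i], s, vs);
	}
	return 0;
}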
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 }
 
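The removed fast path existed because the old calc_delta_fair() scaled by NICE_0_LOAD/w, which is the identity for nice-0 tasks. Under the new rw/w scheme there is no weight for which the conversion is a no-op on a shared queue, so the call becomes unconditional. A before/after comparison with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long delta = 1000000ULL;	/* 1 ms of runtime, invented */
	unsigned long w = 1024;			/* the running nice-0 task */
	unsigned long rw = 1024 + 335;		/* plus one nice-5 peer */

	/* old: delta * NICE_0_LOAD / w, an identity at nice 0 */
	printf("old vruntime delta: %llu ns\n", delta * 1024 / w);
	/* new: delta * rw / w, faster than wall time on any shared queue */
	printf("new vruntime delta: %llu ns\n", delta * rw / w);
	return 0;
}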
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -523,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
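cfs_rq->task_weight deliberately counts only task entities, unlike load.weight, which also includes child groups' entities; the group-aware balancer later in this diff skips any group runqueue whose task_weight is zero because nothing on it can be migrated. A toy model of the two counters (hypothetical struct, not the kernel's):

#include <stdio.h>

/* hypothetical mock of the two counters, not the kernel structure */
struct rq_acct {
	unsigned long load_weight;	/* tasks + child group entities */
	unsigned long task_weight;	/* tasks only: migratable weight */
};

static void enqueue(struct rq_acct *q, unsigned long w, int is_task)
{
	q->load_weight += w;
	if (is_task)
		q->task_weight += w;
}

int main(void)
{
	struct rq_acct q = { 0, 0 };

	enqueue(&q, 1024, 1);	/* a nice-0 task */
	enqueue(&q, 2048, 0);	/* a child group's sched_entity */
	printf("load=%lu task=%lu -> %s\n", q.load_weight, q.task_weight,
	       q.task_weight ? "balanceable" : "skip");
	return 0;
}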
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			unsigned long thresh = sysctl_sched_latency;
+
+			/*
+			 * convert the sleeper threshold into virtual time
+			 */
+			if (sched_feat(NORMALIZED_SLEEPER))
+				thresh = calc_delta_fair(thresh, se);
+
+			vruntime -= thresh;
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
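Because virtual time now advances at rw/w times wall speed, a sleeper bonus fixed in wall time would shrink in virtual terms as the queue gets busier; NORMALIZED_SLEEPER therefore converts the threshold with calc_delta_fair(). A single-level illustration (all numbers invented):

#include <stdio.h>

int main(void)
{
	unsigned long long latency = 20000000ULL; /* sysctl_sched_latency, say */
	unsigned long w = 1024;			  /* the waking nice-0 task */
	unsigned long rw = 4 * 1024;		  /* three nice-0 peers queued */

	/* NEW_FAIR_SLEEPERS alone: the credit stays in wall time */
	printf("thresh (wall)    : %llu ns\n", latency);
	/* NORMALIZED_SLEEPER: scaled by rw/w into virtual time */
	printf("thresh (virtual) : %llu ns\n", latency * rw / w);
	return 0;
}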
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	__enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-	s64 diff = sample - *avg;
-	*avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	if (!se->last_wakeup)
-		return;
-
-	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-	se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (!cfs_rq->next)
-		return se;
+	struct rq *rq = rq_of(cfs_rq);
+	u64 pair_slice = rq->clock - cfs_rq->pair_start;
 
-	if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+		cfs_rq->pair_start = rq->clock;
 		return se;
+	}
 
 	return cfs_rq->next;
 }
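Instead of asking wakeup_preempt_entity() whether the buddy may keep running, pick_next() now lets cfs_rq->next run for at most one sched_slice() of wall time, measured from the new pair_start stamp. A user-space mock-up of that control flow (invented types, fixed 4 ms slice):

#include <stdio.h>

/* invented stand-in for the runqueue state used by the new check */
struct mock_rq {
	unsigned long long clock, pair_start, next_slice;
	int has_next;
};

static const char *pick_next(struct mock_rq *rq)
{
	unsigned long long pair_slice = rq->clock - rq->pair_start;

	if (!rq->has_next || pair_slice > rq->next_slice) {
		rq->pair_start = rq->clock;
		return "leftmost entity";
	}
	return "cfs_rq->next (buddy)";
}

int main(void)
{
	struct mock_rq rq = { 0, 0, 4000000ULL, 1 };	/* 4 ms slice */

	for (rq.clock = 0; rq.clock <= 10000000ULL; rq.clock += 2500000ULL)
		printf("t=%8llu ns -> %s\n", rq.clock, pick_next(&rq));
	return 0;
}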
@@ -835,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		hrtick_start(rq, delta, requeue);
 	}
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -976,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 	}
 	return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
 	return cpu;
@@ -987,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 static const struct sched_class fair_sched_class;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip, some tracing learned us that between
+ * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
+ * significantly. Therefore try to bias the error in direction of failing
+ * the affine wakeup.
+ *
+ */
+static long effective_load(struct task_group *tg, int cpu,
+		long wl, long wg)
+{
+	struct sched_entity *se = tg->se[cpu];
+	long more_w;
+
+	if (!tg->parent)
+		return wl;
+
+	/*
+	 * By not taking the decrease of shares on the other cpu into
+	 * account our error leans towards reducing the affine wakeups.
+	 */
+	if (!wl && sched_feat(ASYM_EFF_LOAD))
+		return wl;
+
+	/*
+	 * Instead of using this increment, also add the difference
+	 * between when the shares were last updated and now.
+	 */
+	more_w = se->my_q->load.weight - se->my_q->rq_weight;
+	wl += more_w;
+	wg += more_w;
+
+	for_each_sched_entity(se) {
+#define D(n) (likely(n) ? (n) : 1)
+
+		long S, rw, s, a, b;
+
+		S = se->my_q->tg->shares;
+		s = se->my_q->shares;
+		rw = se->my_q->rq_weight;
+
+		a = S*(rw + wl);
+		b = S*rw + s*wg;
+
+		wl = s*(a-b)/D(b);
+		/*
+		 * Assume the group is already running and will
+		 * thus already be accounted for in the weight.
+		 *
+		 * That is, moving shares between CPUs, does not
+		 * alter the group weight.
+		 */
+		wg = 0;
+#undef D
+	}
+
+	return wl;
+}
+
+#else
+
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
+{
+	return wl;
+}
+
+#endif
+
 static int
 wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
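Each iteration of the loop maps a weight delta one level up the hierarchy: with S = tg->shares, s = this cpu's current shares and rw = this cpu's group runqueue weight, it forms a = S*(rw + wl) and b = S*rw + s*wg and propagates wl = s*(a - b)/b. One step of that recurrence with made-up numbers (a group holding half its shares on this cpu sees a 1024-weight wakeup as roughly 102 at the root):

#include <stdio.h>

/* one step of the recurrence; the (b ? b : 1) mirrors the D() guard */
static long eff_step(long S, long s, long rw, long wl, long wg)
{
	long a = S * (rw + wl);
	long b = S * rw + s * wg;

	return s * (a - b) / (b ? b : 1);
}

int main(void)
{
	long S = 1024;	/* group shares */
	long s = 512;	/* shares currently granted on this cpu */
	long rw = 2048;	/* group runqueue weight on this cpu */
	long w = 1024;	/* the waking nice-0 task */

	printf("root-visible change: %ld of %ld\n", eff_step(S, s, rw, w, w), w);
	return 0;
}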
@@ -994,8 +1147,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    unsigned int imbalance)
 {
 	struct task_struct *curr = this_rq->curr;
+	struct task_group *tg;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
+	unsigned long weight;
 	int balanced;
 
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1006,19 +1161,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	if (sync)
-		tl -= current->se.load.weight;
+	if (sync) {
+		tg = task_group(current);
+		weight = current->se.load.weight;
+
+		tl += effective_load(tg, this_cpu, -weight, -weight);
+		load += effective_load(tg, prev_cpu, 0, -weight);
+	}
 
-	balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+	tg = task_group(p);
+	weight = p->se.load.weight;
+
+	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
 	 * If the currently running task will sleep within
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced && curr->sched_class == &fair_sched_class) {
+	if (sync && balanced) {
 		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
 		    p->se.avg_overlap < sysctl_sched_migration_cost)
 			return 1;
 	}
 
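For a task in the root group effective_load() returns its wl argument unchanged, so the this_cpu term contributes the raw task weight and the prev_cpu term contributes zero, collapsing the new test to the old 100*(tl + weight) <= imbalance*load. A worked instance of the ungrouped case (loads invented, 125 is a typical imbalance_pct):

#include <stdio.h>

int main(void)
{
	unsigned long imbalance = 125;	/* sd imbalance_pct, typically ~125 */
	unsigned long tl = 2048;	/* this_cpu load (minus current if sync) */
	unsigned long load = 3072;	/* prev_cpu load */
	unsigned long weight = 1024;	/* the waking task, nice 0 */

	int balanced = 100 * (tl + weight) <= imbalance * load;

	printf("%lu <= %lu -> %s\n", 100 * (tl + weight), imbalance * load,
	       balanced ? "affine wakeup allowed" : "stay on prev_cpu");
	return 0;
}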
@@ -1111,11 +1275,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
 	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
+	if (sched_feat(ASYM_GRAN))
+		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	else
+		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
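The two branches differ only for tasks lighter than nice 0: calc_delta_fair() scales the granularity by rw/w for everyone, while calc_delta_asym() evaluates +nice entities as if they carried nice-0 weight, so their granularity stops growing. A single-level comparison (no groups; 3121/1024/335 are the standard nice -5/0/+5 weights, the single nice-0 peer is made up):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

static unsigned long long fair(unsigned long long d, unsigned long w,
			       unsigned long rw)
{
	return d * rw / w;
}

static unsigned long long asym(unsigned long long d, unsigned long w,
			       unsigned long rw)
{
	/* light entities are charged as if they weighed NICE_0_LOAD */
	if (w < NICE_0_LOAD) {
		rw += NICE_0_LOAD - w;
		w = NICE_0_LOAD;
	}
	return d * rw / w;
}

int main(void)
{
	unsigned long long gran = 5000000ULL;		/* 5 ms default */
	unsigned long peer = 1024;			/* one nice-0 peer */
	unsigned long w[] = { 3121, 1024, 335 };	/* nice -5, 0, +5 */
	int i;

	for (i = 0; i < 3; i++)
		printf("w=%4lu: fair=%8llu ns, asym=%8llu ns\n", w[i],
		       fair(gran, w[i], w[i] + peer),
		       asym(gran, w[i], w[i] + peer));
	return 0;
}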
@@ -1177,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
-	se->last_wakeup = se->sum_exec_runtime;
 	if (unlikely(se == pse))
 		return;
 
@@ -1275,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	struct task_struct *p = NULL;
 	struct sched_entity *se;
 
-	if (next == &cfs_rq->tasks)
-		return NULL;
-
-	/* Skip over entities that are not tasks */
-	do {
+	while (next != &cfs_rq->tasks) {
 		se = list_entry(next, struct sched_entity, group_node);
 		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-	if (next == &cfs_rq->tasks)
-		return NULL;
+		/* Skip over entities that are not tasks */
+		if (entity_is_task(se)) {
+			p = task_of(se);
+			break;
+		}
+	}
 
 	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
-
 	return p;
 }
 
@@ -1309,75 +1469,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
-		long imbalance;
-		unsigned long maxload;
+	rcu_read_lock();
+	update_h_load(busiest_cpu);
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+	list_for_each_entry(tg, &task_groups, list) {
+		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+		u64 rem_load, moved_load;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		/*
+		 * empty group
+		 */
+		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		rem_load = (u64)rem_load_move * busiest_weight;
+		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				rem_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
+			continue;
+
+		moved_load *= busiest_h_load;
+		moved_load = div_u64(moved_load, busiest_weight + 1);
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
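The rewritten balancer works in two units: rem_load_move is expressed in root-level (hierarchical) load, while balance_tasks() moves group-local weight. h_load, the group's contribution to the root load, converts between them: scale up by weight/h_load going in, scale back by h_load/weight coming out, with the +1 guarding the divisors. A worked conversion with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* a group queue weighing 2048 locally but worth only h_load = 512
	 * at the root, because its shares are spread over several cpus */
	uint64_t busiest_weight = 2048, busiest_h_load = 512;
	uint64_t rem_load_move = 1024;	/* root-level load left to move */

	/* root units -> group-local units */
	uint64_t rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1);
	/* say balance_tasks() then moved one 1024-weight task ... */
	uint64_t moved = 1024;
	/* ... group-local units -> root units */
	uint64_t moved_load = moved * busiest_h_load / (busiest_weight + 1);

	printf("may pull %llu local weight; counted as %llu at the root\n",
	       (unsigned long long)rem_load, (unsigned long long)moved_load);
	return 0;
}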
@@ -1402,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * scheduler tick hitting a task of our scheduling class: