path: root/kernel/sched_fair.c
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 423
1 files changed, 296 insertions(+), 127 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..cf2cd6ce4cb2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
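The comment above already gives the scaling rule: the 5 msec base is multiplied by (1 + ilog(ncpus)) at boot. A small user-space sketch of that arithmetic, assuming ilog() here means the integer base-2 logarithm used for the other granularity sysctls (an assumption, not part of this diff):

    /* Illustrative user-space sketch, not kernel code: evaluate the
     * "5 msec * (1 + ilog(ncpus))" default documented above, taking
     * ilog() to be the integer base-2 log. */
    #include <stdio.h>

    static unsigned int ilog2_uint(unsigned int n)
    {
            unsigned int l = 0;

            while (n >>= 1)
                    l++;
            return l;
    }

    int main(void)
    {
            unsigned int ncpus;

            for (ncpus = 1; ncpus <= 16; ncpus *= 2)
                    printf("%2u cpus -> %u ns\n",
                           ncpus, 5000000u * (1 + ilog2_uint(ncpus)));
            return 0;
    }

Under that reading, the effective default would be about 15 ms on a 4-CPU machine and about 25 ms on 16 CPUs.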
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 
 /*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                delta = calc_delta_mine(delta,
+                                se->load.weight, &cfs_rq_of(se)->load);
+        }
+
+        return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+        for_each_sched_entity(se) {
+                delta = calc_delta_mine(delta,
+                                cfs_rq_of(se)->load.weight, &se->load);
+        }
+
+        return delta;
+}
+
+/*
  * The idea is to set a period in which each task runs once.
  *
  * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
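Both helpers added above funnel into calc_delta_mine(), which scales its first argument by weight / lw->weight (the kernel does this with a cached fixed-point inverse). A minimal user-space sketch of the resulting arithmetic, with made-up weights, showing the two directions named in the comments (delta *= w / rw versus delta *= rw / w):

    /* Sketch only: plain-integer equivalent of what the two helpers
     * compute at a single hierarchy level. Weights are invented:
     * a nice-0 entity (w = 1024) on a runqueue of total weight 3072. */
    #include <stdio.h>

    static unsigned long long
    scale(unsigned long long delta, unsigned long w, unsigned long lw)
    {
            return delta * w / lw;  /* roughly calc_delta_mine(delta, w, &lw) */
    }

    int main(void)
    {
            unsigned long long delta = 6000000ULL;  /* 6 ms */
            unsigned long w = 1024, rw = 3072;

            printf("calc_delta_weight: %llu ns\n", scale(delta, w, rw)); /* 2 ms */
            printf("calc_delta_fair:   %llu ns\n", scale(delta, rw, w)); /* 18 ms */
            return 0;
    }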
@@ -362,47 +390,80 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        u64 slice = __sched_period(cfs_rq->nr_running);
-
-        for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
-
-                slice *= se->load.weight;
-                do_div(slice, cfs_rq->load.weight);
-        }
-
-
-        return slice;
+        return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         unsigned long nr_running = cfs_rq->nr_running;
-        unsigned long weight;
-        u64 vslice;
 
         if (!se->on_rq)
                 nr_running++;
 
-        vslice = __sched_period(nr_running);
+        return __sched_period(nr_running);
+}
+
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+        struct load_weight lw = {
+                .weight = NICE_0_LOAD,
+                .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+        };
 
         for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
+                struct load_weight *se_lw = &se->load;
+                unsigned long rw = cfs_rq_of(se)->load.weight;
+
+#ifdef CONFIG_FAIR_SCHED_GROUP
+                struct cfs_rq *cfs_rq = se->my_q;
+                struct task_group *tg = NULL
+
+                if (cfs_rq)
+                        tg = cfs_rq->tg;
+
+                if (tg && tg->shares < NICE_0_LOAD) {
+                        /*
+                         * scale shares to what it would have been had
+                         * tg->weight been NICE_0_LOAD:
+                         *
+                         *   weight = 1024 * shares / tg->weight
+                         */
+                        lw.weight *= se->load.weight;
+                        lw.weight /= tg->shares;
+
+                        lw.inv_weight = 0;
+
+                        se_lw = &lw;
+                        rw += lw.weight - se->load.weight;
+                } else
+#endif
 
-                weight = cfs_rq->load.weight;
-                if (!se->on_rq)
-                        weight += se->load.weight;
+                if (se->load.weight < NICE_0_LOAD) {
+                        se_lw = &lw;
+                        rw += NICE_0_LOAD - se->load.weight;
+                }
 
-                vslice *= NICE_0_LOAD;
-                do_div(vslice, weight);
+                delta = calc_delta_mine(delta, rw, se_lw);
         }
 
-        return vslice;
+        return delta;
 }
 
 /*
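With the rewrite above, sched_slice() is simply the latency period scaled by w/rw at every level of the hierarchy, and sched_vslice_add() now returns the whole period. A flat, single-level example with invented weights:

    /* Sketch with made-up numbers: a 20 ms period split between two
     * nice-0 tasks (weight 1024) and one heavier task (weight 2048),
     * following the "delta *= w / rw" rule used by sched_slice(). */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long period = 20000000ULL;        /* 20 ms */
            unsigned long w[] = { 1024, 1024, 2048 };
            unsigned long rw = w[0] + w[1] + w[2];
            int i;

            for (i = 0; i < 3; i++)
                    printf("task %d slice: %llu ns\n",
                           i, period * w[i] / rw);          /* 5, 5, 10 ms */
            return 0;
    }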
@@ -419,11 +480,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
         curr->sum_exec_runtime += delta_exec;
         schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = delta_exec;
-        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-                                                        &curr->load);
-        }
+        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
         curr->vruntime += delta_exec_weighted;
 }
 
@@ -510,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+        cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_add(&cfs_rq->load, se->load.weight);
+        if (!parent_entity(se))
+                inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+        if (entity_is_task(se))
+                add_cfs_task_weight(cfs_rq, se->load.weight);
         cfs_rq->nr_running++;
         se->on_rq = 1;
         list_add(&se->group_node, &cfs_rq->tasks);
@@ -523,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_sub(&cfs_rq->load, se->load.weight);
+        if (!parent_entity(se))
+                dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+        if (entity_is_task(se))
+                add_cfs_task_weight(cfs_rq, -se->load.weight);
         cfs_rq->nr_running--;
         se->on_rq = 0;
         list_del_init(&se->group_node);
@@ -609,8 +687,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
         if (!initial) {
                 /* sleeps upto a single latency don't count. */
-                if (sched_feat(NEW_FAIR_SLEEPERS))
-                        vruntime -= sysctl_sched_latency;
+                if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                        unsigned long thresh = sysctl_sched_latency;
+
+                        /*
+                         * convert the sleeper threshold into virtual time
+                         */
+                        if (sched_feat(NORMALIZED_SLEEPER))
+                                thresh = calc_delta_fair(thresh, se);
+
+                        vruntime -= thresh;
+                }
 
                 /* ensure we never gain time by being placed backwards. */
                 vruntime = max_vruntime(se->vruntime, vruntime);
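A rough feel for the NORMALIZED_SLEEPER case: taking sysctl_sched_latency to be 20 ms for illustration, and a runqueue whose total weight is three times the waking entity's, calc_delta_fair() (delta *= rw / w, per the helper added earlier) turns the 20 ms wall-clock threshold into roughly 60 ms of that entity's virtual time, so the sleeper credit keeps its intended size on a loaded queue instead of being subtracted in raw nanoseconds.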
@@ -639,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                 __enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-        s64 diff = sample - *avg;
-        *avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-        if (!se->last_wakeup)
-                return;
-
-        update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-        se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -664,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
         update_stats_dequeue(cfs_rq, se);
         if (sleep) {
-                update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
                 if (entity_is_task(se)) {
                         struct task_struct *tsk = task_of(se);
@@ -726,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        if (!cfs_rq->next)
-                return se;
+        struct rq *rq = rq_of(cfs_rq);
+        u64 pair_slice = rq->clock - cfs_rq->pair_start;
 
-        if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+        if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+                cfs_rq->pair_start = rq->clock;
                 return se;
+        }
 
         return cfs_rq->next;
 }
@@ -808,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-        int requeue = rq->curr == p;
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
@@ -829,13 +898,13 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
                  * Don't schedule slices shorter than 10000ns, that just
                  * doesn't make sense. Rely on vruntime for fairness.
                  */
-                if (!requeue)
+                if (rq->curr != p)
                         delta = max(10000LL, delta);
 
-                hrtick_start(rq, delta, requeue);
+                hrtick_start(rq, delta);
         }
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -934,6 +1003,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available. The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -961,7 +1032,8 @@ static int wake_idle(int cpu, struct task_struct *p)
                     || ((sd->flags & SD_WAKE_IDLE_FAR)
                         && !task_hot(p, task_rq(p)->clock, sd))) {
                         cpus_and(tmp, sd->span, p->cpus_allowed);
-                        for_each_cpu_mask(i, tmp) {
+                        cpus_and(tmp, tmp, cpu_active_map);
+                        for_each_cpu_mask_nr(i, tmp) {
                                 if (idle_cpu(i)) {
                                         if (i != task_cpu(p)) {
                                                 schedstat_inc(p,
@@ -976,7 +1048,7 @@ static int wake_idle(int cpu, struct task_struct *p)
         }
         return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
         return cpu;
@@ -987,6 +1059,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 static const struct sched_class fair_sched_class;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip, some tracing learned us that between
+ * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
+ * significantly. Therefore try to bias the error in direction of failing
+ * the affine wakeup.
+ *
+ */
+static long effective_load(struct task_group *tg, int cpu,
+                long wl, long wg)
+{
+        struct sched_entity *se = tg->se[cpu];
+        long more_w;
+
+        if (!tg->parent)
+                return wl;
+
+        /*
+         * By not taking the decrease of shares on the other cpu into
+         * account our error leans towards reducing the affine wakeups.
+         */
+        if (!wl && sched_feat(ASYM_EFF_LOAD))
+                return wl;
+
+        /*
+         * Instead of using this increment, also add the difference
+         * between when the shares were last updated and now.
+         */
+        more_w = se->my_q->load.weight - se->my_q->rq_weight;
+        wl += more_w;
+        wg += more_w;
+
+        for_each_sched_entity(se) {
+#define D(n) (likely(n) ? (n) : 1)
+
+                long S, rw, s, a, b;
+
+                S = se->my_q->tg->shares;
+                s = se->my_q->shares;
+                rw = se->my_q->rq_weight;
+
+                a = S*(rw + wl);
+                b = S*rw + s*wg;
+
+                wl = s*(a-b)/D(b);
+                /*
+                 * Assume the group is already running and will
+                 * thus already be accounted for in the weight.
+                 *
+                 * That is, moving shares between CPUs, does not
+                 * alter the group weight.
+                 */
+                wg = 0;
+#undef D
+        }
+
+        return wl;
+}
+
+#else
+
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+                unsigned long wl, unsigned long wg)
+{
+        return wl;
+}
+
+#endif
+
 static int
 wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
             struct task_struct *p, int prev_cpu, int this_cpu, int sync,
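A worked instance of one round of the effective_load() loop above, with invented numbers, to show why a cgroup task contributes far less than its raw weight at the root: take S = 1024 (tg->shares), s = 512 (this cpu's share), rw = 2048 (this cpu's group runqueue weight) and a woken task of weight 1024 (wl = wg = 1024):

    /* User-space sketch of one level of the effective_load() recurrence
     * above, with made-up numbers. The D() macro is mirrored by the
     * (b ? b : 1) guard. */
    #include <stdio.h>

    int main(void)
    {
            long S = 1024, s = 512, rw = 2048;
            long wl = 1024, wg = 1024;

            long a = S * (rw + wl);
            long b = S * rw + s * wg;

            /* load change as seen one level up the hierarchy */
            long up = s * (a - b) / (b ? b : 1);

            printf("root-visible load delta: %ld (raw task weight was %ld)\n",
                   up, wl);
            return 0;
    }

With these numbers roughly 102 units of load become visible one level up rather than 1024, and iterating the loop shrinks the contribution further at each level toward the root.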
@@ -994,8 +1149,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
             unsigned int imbalance)
 {
         struct task_struct *curr = this_rq->curr;
+        struct task_group *tg;
         unsigned long tl = this_load;
         unsigned long tl_per_task;
+        unsigned long weight;
         int balanced;
 
         if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1006,19 +1163,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
          * effect of the currently running task from the load
          * of the current CPU:
          */
-        if (sync)
-                tl -= current->se.load.weight;
+        if (sync) {
+                tg = task_group(current);
+                weight = current->se.load.weight;
+
+                tl += effective_load(tg, this_cpu, -weight, -weight);
+                load += effective_load(tg, prev_cpu, 0, -weight);
+        }
 
-        balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+        tg = task_group(p);
+        weight = p->se.load.weight;
+
+        balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
         /*
          * If the currently running task will sleep within
          * a reasonable amount of time then attract this newly
          * woken task:
          */
-        if (sync && balanced && curr->sched_class == &fair_sched_class) {
+        if (sync && balanced) {
                 if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
                     p->se.avg_overlap < sysctl_sched_migration_cost)
                         return 1;
         }
 
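Tying this to the effective_load() sketch earlier: with those invented numbers, the woken cgroup task inflates the this_cpu side of the balanced test by only about 102 instead of its raw weight of 1024, while the prev_cpu side (wl = 0 there, so only the share shift is accounted) gets a similarly small correction. Group tasks therefore sway the affine-wakeup decision by their root-visible load rather than their group-local weight, which is the bias described in the effective_load() comment.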
@@ -1111,11 +1277,13 @@ static unsigned long wakeup_gran(struct sched_entity *se)
         unsigned long gran = sysctl_sched_wakeup_granularity;
 
         /*
-         * More easily preempt - nice tasks, while not making
-         * it harder for + nice tasks.
+         * More easily preempt - nice tasks, while not making it harder for
+         * + nice tasks.
          */
-        if (unlikely(se->load.weight > NICE_0_LOAD))
-                gran = calc_delta_fair(gran, &se->load);
+        if (sched_feat(ASYM_GRAN))
+                gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+        else
+                gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
         return gran;
 }
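The asymmetry here comes purely from the clamp in calc_delta_asym(): the scaling is rw/w just as in calc_delta_fair(), but when the entity's weight is below NICE_0_LOAD the divisor is raised to NICE_0_LOAD and rw is adjusted to match. For instance, taking the 5 ms default above and ignoring the boot-time CPU scaling, a nice-19 entity of weight 15 sharing a runqueue with one nice-0 task (rw = 1039) gets gran = 5 ms * 2048 / 1024 = 10 ms once the clamp bumps rw to 2048, rather than the roughly 346 ms that a raw 5 ms * 1039 / 15 would give.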
@@ -1177,7 +1345,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
                 return;
         }
 
-        se->last_wakeup = se->sum_exec_runtime;
         if (unlikely(se == pse))
                 return;
 
@@ -1275,23 +1442,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
         struct task_struct *p = NULL;
         struct sched_entity *se;
 
-        if (next == &cfs_rq->tasks)
-                return NULL;
-
-        /* Skip over entities that are not tasks */
-        do {
+        while (next != &cfs_rq->tasks) {
                 se = list_entry(next, struct sched_entity, group_node);
                 next = next->next;
-        } while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-        if (next == &cfs_rq->tasks)
-                return NULL;
+                /* Skip over entities that are not tasks */
+                if (entity_is_task(se)) {
+                        p = task_of(se);
+                        break;
+                }
+        }
 
         cfs_rq->balance_iterator = next;
-
-        if (entity_is_task(se))
-                p = task_of(se);
-
         return p;
 }
 
@@ -1309,75 +1471,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                unsigned long max_load_move, struct sched_domain *sd,
+                enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+                struct cfs_rq *cfs_rq)
 {
-        struct sched_entity *curr;
-        struct task_struct *p;
-
-        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-                return MAX_PRIO;
-
-        curr = cfs_rq->curr;
-        if (!curr)
-                curr = __pick_next_entity(cfs_rq);
+        struct rq_iterator cfs_rq_iterator;
 
-        p = task_of(curr);
+        cfs_rq_iterator.start = load_balance_start_fair;
+        cfs_rq_iterator.next = load_balance_next_fair;
+        cfs_rq_iterator.arg = cfs_rq;
 
-        return p->prio;
+        return balance_tasks(this_rq, this_cpu, busiest,
+                        max_load_move, sd, idle, all_pinned,
+                        this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
                   struct sched_domain *sd, enum cpu_idle_type idle,
                   int *all_pinned, int *this_best_prio)
 {
-        struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
-        struct rq_iterator cfs_rq_iterator;
-
-        cfs_rq_iterator.start = load_balance_start_fair;
-        cfs_rq_iterator.next = load_balance_next_fair;
+        int busiest_cpu = cpu_of(busiest);
+        struct task_group *tg;
 
-        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-                struct cfs_rq *this_cfs_rq;
-                long imbalance;
-                unsigned long maxload;
+        rcu_read_lock();
+        update_h_load(busiest_cpu);
 
-                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+        list_for_each_entry(tg, &task_groups, list) {
+                struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+                unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+                unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+                u64 rem_load, moved_load;
 
-                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-                if (imbalance <= 0)
+                /*
+                 * empty group
+                 */
+                if (!busiest_cfs_rq->task_weight)
                         continue;
 
-                /* Don't pull more than imbalance/2 */
-                imbalance /= 2;
-                maxload = min(rem_load_move, imbalance);
+                rem_load = (u64)rem_load_move * busiest_weight;
+                rem_load = div_u64(rem_load, busiest_h_load + 1);
 
-                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-                /*
-                 * pass busy_cfs_rq argument into
-                 * load_balance_[start|next]_fair iterators
-                 */
-                cfs_rq_iterator.arg = busy_cfs_rq;
-                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-                                               maxload, sd, idle, all_pinned,
-                                               this_best_prio,
-                                               &cfs_rq_iterator);
+                moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+                                rem_load, sd, idle, all_pinned, this_best_prio,
+                                tg->cfs_rq[busiest_cpu]);
+
+                if (!moved_load)
+                        continue;
+
+                moved_load *= busiest_h_load;
+                moved_load = div_u64(moved_load, busiest_weight + 1);
 
-                if (rem_load_move <= 0)
+                rem_load_move -= moved_load;
+                if (rem_load_move < 0)
                         break;
         }
+        rcu_read_unlock();
 
         return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                unsigned long max_load_move,
+                struct sched_domain *sd, enum cpu_idle_type idle,
+                int *all_pinned, int *this_best_prio)
+{
+        return __load_balance_fair(this_rq, this_cpu, busiest,
+                        max_load_move, sd, idle, all_pinned,
+                        this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
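The per-group scaling in the loop above can be sanity-checked with a small user-space sketch (numbers invented): rem_load translates the root-level budget into the group's local weight units before calling balance_tasks(), and moved_load translates the result back via h_load so that rem_load_move is charged in root units:

    /* Sketch of the rem_load / moved_load conversions above, with
     * made-up values: a group runqueue on the busiest cpu with local
     * load.weight = 2048 but a hierarchical load (h_load) of only 512,
     * and 1000 units of root-level load left to move. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t rem_load_move = 1000;
            uint64_t busiest_weight = 2048;     /* group-local weight */
            uint64_t busiest_h_load = 512;      /* contribution seen at the root */

            /* root-level budget expressed in group-local units */
            uint64_t rem_load = rem_load_move * busiest_weight /
                                            (busiest_h_load + 1);

            /* pretend balance_tasks() moved one 1024-weight task */
            uint64_t moved_local = 1024;

            /* group-local result converted back to root-level units */
            uint64_t moved_load = moved_local * busiest_h_load /
                                            (busiest_weight + 1);

            printf("rem_load handed to balance_tasks(): %llu\n",
                   (unsigned long long)rem_load);       /* 3992 */
            printf("moved_load charged to rem_load_move: %llu\n",
                   (unsigned long long)moved_load);     /* 255 */
            return 0;
    }

So a 1000-unit root budget becomes about 3992 local units to pull from, and moving a 1024-weight task inside that group consumes only about 255 units of the root budget.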
@@ -1402,7 +1571,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
         return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * scheduler tick hitting a task of our scheduling class: