Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  370
1 files changed, 299 insertions, 71 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->on_list) {
+		/*
+		 * Ensure we either appear before our parent (if already
+		 * enqueued) or force our parent to appear after us when it is
+		 * enqueued. The fact that we always enqueue bottom-up
+		 * reduces this to two cases.
+		 */
+		if (cfs_rq->tg->parent &&
+		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		} else {
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		}
+
+		cfs_rq->on_list = 1;
+	}
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->on_list) {
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		cfs_rq->on_list = 0;
+	}
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+	u64 period = sysctl_sched_shares_window;
+	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
+	cfs_rq->load_period += delta;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
+
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
+		account_entity_dequeue(cfs_rq, se);
+	}
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
 
 	if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);
 
+	/*
+	 * Update share accounting for long-running entities.
+	 */
+	update_entity_shares_tick(cfs_rq);
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -1654,12 +1816,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-	if (unlikely(rt_prio(p->prio)))
-		goto preempt;
-
-	if (unlikely(p->sched_class != &fair_sched_class))
-		return;
-
 	if (unlikely(se == pse))
 		return;
 
@@ -1764,10 +1920,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
-
-	/* re-arm NEWIDLE balancing when moving tasks */
-	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
-	this_rq->idle_stamp = 0;
 }
 
 /*
@@ -1919,6 +2071,48 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq, 0);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -1966,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -2035,13 +2233,16 @@ struct sd_lb_stats {
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
 	unsigned long this_has_capacity;
+	unsigned int  this_idle_cpus;
 
 	/* Statistics of the busiest group */
+	unsigned int  busiest_idle_cpus;
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
 	unsigned long busiest_has_capacity;
+	unsigned int  busiest_group_weight;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2264,8 @@ struct sg_lb_stats {
 	unsigned long sum_nr_running; /* Nr tasks running in the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
+	unsigned long idle_cpus;
+	unsigned long group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 };
@@ -2431,7 +2634,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 		sgs->group_load += load;
 		sgs->sum_nr_running += rq->nr_running;
 		sgs->sum_weighted_load += weighted_cpuload(i);
-
+		if (idle_cpu(i))
+			sgs->idle_cpus++;
 	}
 
 	/*
@@ -2469,6 +2673,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
 		sgs->group_has_capacity = 1;
@@ -2576,13 +2781,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		sds->this_nr_running = sgs.sum_nr_running;
 		sds->this_load_per_task = sgs.sum_weighted_load;
 		sds->this_has_capacity = sgs.group_has_capacity;
+		sds->this_idle_cpus = sgs.idle_cpus;
 	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 		sds->max_load = sgs.avg_load;
 		sds->busiest = sg;
 		sds->busiest_nr_running = sgs.sum_nr_running;
+		sds->busiest_idle_cpus = sgs.idle_cpus;
 		sds->busiest_group_capacity = sgs.group_capacity;
 		sds->busiest_load_per_task = sgs.sum_weighted_load;
 		sds->busiest_has_capacity = sgs.group_has_capacity;
+		sds->busiest_group_weight = sgs.group_weight;
 		sds->group_imb = sgs.group_imb;
 	}
 
@@ -2860,8 +3068,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.this_load >= sds.avg_load)
 		goto out_balanced;
 
-	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-		goto out_balanced;
+	/*
+	 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+	 * And to check for busy balance use !idle_cpu instead of
+	 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+	 * even when they are idle.
+	 */
+	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+		if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+			goto out_balanced;
+	} else {
+		/*
+		 * This cpu is idle. If the busiest group load doesn't
+		 * have more tasks than the number of available cpu's and
+		 * there is no imbalance between this and busiest group
+		 * wrt to idle cpu's, it is balanced.
+		 */
+		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+		    sds.busiest_nr_running <= sds.busiest_group_weight)
+			goto out_balanced;
+	}
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
@@ -3014,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3156,8 +3381,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3181,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3197,8 +3421,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		if (pulled_task) {
+			this_rq->idle_stamp = 0;
 			break;
+		}
 	}
 
 	raw_spin_lock(&this_rq->lock);
@@ -3549,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;