Diffstat (limited to 'kernel/sched_fair.c')
 -rw-r--r--  kernel/sched_fair.c | 305
 1 file changed, 248 insertions(+), 57 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd768667..c88671718bc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
 static const struct sched_class fair_sched_class;
 
 /**************************************************************
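
[Illustrative aside, not part of the patch] The new tunable is expressed in nanoseconds, so 10000000UL is the 10 msec mentioned in the comment. As a rough picture of what "exponential sliding window" means here, the stand-alone user-space sketch below accumulates weight * time and halves both the covered period and the accumulated sum whenever a full window has elapsed, so older windows contribute with geometrically decaying weight. The struct, function names and sample numbers are made up for the example; only the fold-and-halve and avg/(period+1) ideas mirror the update_cfs_load()/update_cfs_rq_load_contribution() code added later in this patch.

/* toy_window.c - sketch of the exponential sliding window (illustration only) */
#include <stdio.h>
#include <stdint.h>

#define WINDOW_NS 10000000ULL		/* mirrors the 10 msec default */

struct toy_avg {
	uint64_t period;		/* time covered by 'avg', in ns */
	uint64_t avg;			/* sum of weight * delta over 'period' */
};

/* Account 'delta' ns during which the queue carried 'load' weight. */
static void toy_account(struct toy_avg *a, uint64_t delta, unsigned long load)
{
	a->period += delta;
	a->avg += delta * load;

	/*
	 * Fold completed windows: halving both keeps avg/period stable
	 * while weighting older history by 1/2, 1/4, 1/8, ...
	 */
	while (a->period > WINDOW_NS) {
		a->period /= 2;
		a->avg /= 2;
	}
}

int main(void)
{
	struct toy_avg a = { 0, 0 };
	int i;

	/* 5 ms at weight 1024, then 20 ms idle (weight 0). */
	toy_account(&a, 5000000ULL, 1024);
	for (i = 0; i < 4; i++)
		toy_account(&a, 5000000ULL, 0);

	printf("effective load ~ %llu\n",
	       (unsigned long long)(a.avg / (a.period + 1)));
	return 0;
}
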
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return cfs_rq->tg->cfs_rq[this_cpu];
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->on_list) {
+		/*
+		 * Ensure we either appear before our parent (if already
+		 * enqueued) or force our parent to appear after us when it is
+		 * enqueued. The fact that we always enqueue bottom-up
+		 * reduces this to two cases.
+		 */
+		if (cfs_rq->tg->parent &&
+		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		} else {
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+		}
+
+		cfs_rq->on_list = 1;
+	}
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->on_list) {
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		cfs_rq->on_list = 0;
+	}
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 	return &cpu_rq(this_cpu)->cfs;
 }
 
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -514,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->vruntime += delta_exec_weighted;
 	update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->load_unacc_exec_time += delta_exec;
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +688,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +701,124 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+					    int global_update)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long load_avg;
+
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
+
+	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+		atomic_add(load_avg, &tg->load_weight);
+		cfs_rq->load_contribution += load_avg;
+	}
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+	u64 period = sysctl_sched_shares_window;
+	u64 now, delta;
+	unsigned long load = cfs_rq->load.weight;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	/* truncate load history at 4 idle periods */
+	if (cfs_rq->load_stamp > cfs_rq->load_last &&
+	    now - cfs_rq->load_last > 4 * period) {
+		cfs_rq->load_period = 0;
+		cfs_rq->load_avg = 0;
+	}
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_unacc_exec_time = 0;
+	cfs_rq->load_period += delta;
+	if (load) {
+		cfs_rq->load_last = now;
+		cfs_rq->load_avg += delta * load;
+	}
+
+	/* consider updating load contribution on each fold or truncate */
+	if (global_update || cfs_rq->load_period > period
+	    || !cfs_rq->load_period)
+		update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+
+	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+		list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq)
+		account_entity_dequeue(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
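
[Illustrative aside, not part of the patch] A worked example of the arithmetic in update_cfs_shares() above may help: the per-cpu group entity's weight is tg->shares scaled by this cpu's portion of the group load, where the group load is approximated by replacing this cpu's stale load_contribution with its current queue weight, and the result is clamped to [MIN_SHARES, tg->shares]. The user-space sketch below mirrors that calculation only; TOY_MIN_SHARES and the sample weights are assumptions for illustration, not values taken from this patch.

/* toy_shares.c - sketch of the update_cfs_shares() arithmetic (illustration only) */
#include <stdio.h>

#define TOY_MIN_SHARES 2	/* assumed lower clamp, not taken from the patch */

static long toy_shares(long tg_shares, long global_weight,
		       long local_contribution, long local_load)
{
	/* Replace this cpu's stale contribution with its current load. */
	long load_weight = global_weight - local_contribution + local_load;
	long shares = tg_shares * local_load;

	if (load_weight)
		shares /= load_weight;

	if (shares < TOY_MIN_SHARES)
		shares = TOY_MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* A 1024-share group whose load is split 3:1 across two cpus. */
	printf("cpu0: %ld\n", toy_shares(1024, 4096, 3072, 3072)); /* -> 768 */
	printf("cpu1: %ld\n", toy_shares(1024, 4096, 1024, 1024)); /* -> 256 */
	return 0;
}
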
@@ -771,6 +940,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq, 0);
+	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
 
 	if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +953,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
+
+	if (cfs_rq->nr_running == 1)
+		list_add_leaf_cfs_rq(cfs_rq);
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1000,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq, 0);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1233,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1256,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1336,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
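
[Illustrative aside, not part of the patch] The retained header comment ("Assuming the shares were perfectly aligned one can calculate the shift in shares") can be made concrete with hypothetical numbers. The hunk above only shows effective_load() setting up a = S*(rw + wl) and b = S*rw + s*wg; the remainder of the function (not shown in this hunk) uses them to estimate the shift computed directly below. All names and values in the sketch are invented for illustration.

/* toy_effective_load.c - aligned-shares shift example (illustration only) */
#include <stdio.h>

int main(void)
{
	long S = 1024;			/* tg->shares */
	long W = 4096;			/* total group runqueue weight (assumed) */
	long rw = 2048;			/* this cpu's runqueue weight (assumed) */
	long wl = 1024, wg = 1024;	/* weight added locally and group-wide */

	/* Perfectly aligned share of this cpu, before and after the wakeup. */
	long before = S * rw / W;
	long after  = S * (rw + wl) / (W + wg);

	printf("aligned share: %ld -> %ld (delta %ld)\n",
	       before, after, after - before);	/* 512 -> 614 (delta 102) */
	return 0;
}
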
@@ -1508,23 +1670,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -1909,6 +2054,48 @@ out:
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (!tg->se[cpu])
+		return 0;
+
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq, 1);
+
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq, 0);
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return 0;
+}
+
+static void update_shares(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(cpu);
+
+	rcu_read_lock();
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		update_shares_cpu(cfs_rq->tg, cpu);
+	rcu_read_unlock();
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
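
[Illustrative aside, not part of the patch] The comment on update_shares_cpu() ("update tg->load_weight by folding this cpu's load_avg") refers to the delta publishing done by update_cfs_rq_load_contribution() earlier in this patch: the global tg->load_weight stays the sum of per-cpu contributions, and each cpu only adds the difference between its current average and what it last contributed. The toy model below shows that folding in isolation; the struct, the sample figures and the forced update are assumptions, and only the avg/(period+1) and contribution/8 expressions mirror the patch.

/* toy_fold.c - sketch of folding per-cpu averages into a group sum (illustration only) */
#include <stdio.h>
#include <stdlib.h>

struct toy_cpu {
	long long load_avg;		/* accumulated weight * time */
	long long load_period;		/* time covered by load_avg */
	long contribution;		/* what we last added to the global sum */
};

static void fold(struct toy_cpu *c, long *global, int force)
{
	long avg = (long)(c->load_avg / (c->load_period + 1));
	long delta = avg - c->contribution;

	/* Skip small deltas unless a global update is forced. */
	if (force || labs(delta) > c->contribution / 8) {
		*global += delta;
		c->contribution += delta;
	}
}

int main(void)
{
	long tg_load_weight = 0;
	struct toy_cpu c0 = { 5000000LL * 1024, 5000000, 0 };
	struct toy_cpu c1 = { 2000000LL * 1024, 8000000, 0 };

	fold(&c0, &tg_load_weight, 1);
	fold(&c1, &tg_load_weight, 1);
	printf("tg load_weight = %ld\n", tg_load_weight);
	return 0;
}
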
@@ -1956,6 +2143,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return max_load_move - rem_load_move;
 }
 #else
+static inline void update_shares(int cpu)
+{
+}
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -3032,7 +3223,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3174,8 +3364,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3199,6 +3387,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	 */
 	raw_spin_unlock(&this_rq->lock);
 
+	update_shares(this_cpu);
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int balance = 1;
@@ -3569,6 +3758,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;