author    Peter Zijlstra <a.p.zijlstra@chello.nl>  2010-11-15 18:47:00 -0500
committer Ingo Molnar <mingo@elte.hu>  2010-11-18 07:27:46 -0500
commit    2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch)
tree      c221747420e47b194a2a634024438a55420224d5 /kernel/sched_fair.c
parent    48c5ccae88dcd989d9de507e8510313c6cbd352b (diff)
sched: Rewrite tg_shares_up()
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.

This should improve cpu-cgroup performance for smp systems
significantly.

[ Paul: changed to use queueing cfs_rq + bug fixes ]

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
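The core of the change, roughly: each cfs_rq accumulates a local load average, folds its contribution into the group-wide tg->load_weight, and each cpu can then recompute its group entity's weight locally instead of walking every cpu as tg_shares_up() used to. A minimal user-space sketch of that shares arithmetic follows; it mirrors the shape of update_cfs_shares() in the diff below, but the function and parameter names are illustrative, not kernel symbols, and the MIN_SHARES floor of 2 is an assumption.

#include <stdio.h>

/*
 * Illustrative stand-in for the per-cpu shares calculation:
 *   tg_shares      - the group's configured cpu.shares value
 *   tg_load_weight - globally summed per-cpu load contributions
 *   old_contrib    - this cpu's previously folded-in contribution
 *   local_load     - this cpu's current cfs_rq load
 */
static long per_cpu_shares(long tg_shares, long tg_load_weight,
                           long old_contrib, long local_load)
{
        const long MIN_SHARES = 2;      /* assumed floor */
        long load_weight = tg_load_weight - old_contrib + local_load;
        long shares = tg_shares * local_load;

        if (load_weight)
                shares /= load_weight;

        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        /* group with 1024 shares, total load 3072, this cpu carrying 1024 */
        printf("%ld\n", per_cpu_shares(1024, 3072, 1024, 1024));  /* ~341 */
        /* same group, this cpu idle: entity falls back to the floor */
        printf("%ld\n", per_cpu_shares(1024, 3072, 1024, 0));     /* 2 */
        return 0;
}

A cpu carrying all of the group's load would get the full tg->shares; an idle cpu's entity drops to the floor, and all of this is computed from local state plus one shared sum, which is what makes the rewrite strictly per-cpu.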
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  164
1 file changed, 107 insertions(+), 57 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..d86544b4151c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+	u64 period = sched_avg_period();
+	u64 now, delta;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_period += delta;
+	cfs_rq->load_avg += delta * cfs_rq->load.weight;
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq)
+		account_entity_dequeue(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
  */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 			sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3156,8 +3206,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
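A note on the averaging added in update_cfs_load() above: rather than dividing the accumulated (delta * weight) sum by the number of elapsed periods in one step, both the accumulated window and the sum are halved once per elapsed period, which yields an exponentially decayed average using only cheap shifts (the in-kernel asm("") barrier just stops the compiler from folding the loop back into a divmod). A rough user-space sketch of that decay, under assumed field names and a 1-second window (the kernel reads rq->clock and sched_avg_period() instead):

#include <stdio.h>
#include <stdint.h>

/* illustrative stand-ins for the cfs_rq fields used by update_cfs_load() */
struct load_track {
        uint64_t load_stamp;    /* last time the average was updated */
        uint64_t load_period;   /* time accumulated in the current window */
        uint64_t load_avg;      /* decayed sum of delta * weight */
};

#define AVG_PERIOD_NS (1000ULL * 1000 * 1000)   /* assumed 1s window */

static void update_load(struct load_track *lt, uint64_t now, unsigned long weight)
{
        uint64_t delta = now - lt->load_stamp;

        lt->load_stamp = now;
        lt->load_period += delta;
        lt->load_avg += delta * weight;

        /* halve window and average once per elapsed period */
        while (lt->load_period > AVG_PERIOD_NS) {
                lt->load_period /= 2;
                lt->load_avg /= 2;
        }
}

int main(void)
{
        struct load_track lt = { 0, 0, 0 };

        update_load(&lt, 500000000ULL, 1024);   /* 0.5s at weight 1024 */
        update_load(&lt, 1500000000ULL, 2048);  /* another 1s at weight 2048 */
        printf("period=%llu avg=%llu\n",
               (unsigned long long)lt.load_period,
               (unsigned long long)lt.load_avg);
        return 0;
}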