author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2010-11-15 18:47:00 -0500
committer  Ingo Molnar <mingo@elte.hu>                2010-11-18 07:27:46 -0500
commit     2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch)
tree       c221747420e47b194a2a634024438a55420224d5 /kernel/sched_fair.c
parent     48c5ccae88dcd989d9de507e8510313c6cbd352b (diff)
sched: Rewrite tg_shares_up
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick, we can rework tg_shares_up to be
strictly per-cpu.
This should improve cpu-cgroup performance for smp systems
significantly.
[ Paul: changed to use queueing cfs_rq + bug fixes ]
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
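Before reading the diff, a rough standalone sketch of the mechanism the changelog describes: each cfs_rq keeps a decaying load average that is folded forward on every update, and a cpu's share of the group is its contribution relative to the group-wide load, clamped to [MIN_SHARES, tg->shares]. This is illustrative only, not the kernel code: the names demo_cfs_rq, demo_update_load(), demo_cpu_shares() and DEMO_PERIOD_NS are invented here, simplified stand-ins for the update_cfs_load()/update_cfs_shares()/sched_avg_period() added or used by this patch.

	/* Standalone sketch (not kernel code): per-cpu load averaging and
	 * per-cpu share computation, simplified from this patch. */
	#include <stdint.h>
	#include <stdio.h>

	#define DEMO_PERIOD_NS 1000000000ULL	/* assumed ~1s averaging period */

	struct demo_cfs_rq {
		uint64_t load_weight;	/* current weight of this per-cpu queue */
		uint64_t load_stamp;	/* time of last update, in ns */
		uint64_t load_period;	/* time accumulated in the current window */
		uint64_t load_avg;	/* sum of weight * runtime over the window */
	};

	/* Fold the time since the last update into the running per-cpu load
	 * average, halving (decaying) it once per elapsed period. */
	static void demo_update_load(struct demo_cfs_rq *cfs, uint64_t now)
	{
		uint64_t delta = now - cfs->load_stamp;

		cfs->load_stamp = now;
		cfs->load_period += delta;
		cfs->load_avg += delta * cfs->load_weight;

		while (cfs->load_period > DEMO_PERIOD_NS) {
			cfs->load_period /= 2;
			cfs->load_avg /= 2;
		}
	}

	/* A cpu's share of the group: the group's configured shares scaled by
	 * this queue's weight relative to the group-wide load, clamped. */
	static uint64_t demo_cpu_shares(uint64_t tg_shares, uint64_t tg_load,
					uint64_t cfs_load, uint64_t min_shares)
	{
		uint64_t shares = tg_shares * cfs_load;

		if (tg_load)
			shares /= tg_load;
		if (shares < min_shares)
			shares = min_shares;
		if (shares > tg_shares)
			shares = tg_shares;
		return shares;
	}

	int main(void)
	{
		struct demo_cfs_rq cfs = { .load_weight = 2048 };

		demo_update_load(&cfs, 1500000000ULL);	/* 1.5s of weight-2048 load */
		printf("avg=%llu shares=%llu\n",
		       (unsigned long long)cfs.load_avg,
		       (unsigned long long)demo_cpu_shares(1024, 4096, 2048, 2));
		return 0;
	}

The halving once per elapsed period is what keeps the running average bounded, mirroring the decay loop in update_cfs_load() in the diff below.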
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c  164
1 file changed, 107 insertions(+), 57 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..d86544b4151c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 	WRT_SYSCTL(sched_min_granularity);
 	WRT_SYSCTL(sched_latency);
 	WRT_SYSCTL(sched_wakeup_granularity);
-	WRT_SYSCTL(sched_shares_ratelimit);
 #undef WRT_SYSCTL
 
 	return 0;
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &cfs_rq->tasks);
 	}
 	cfs_rq->nr_running++;
-	se->on_rq = 1;
 }
 
 static void
@@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_del_init(&se->group_node);
 	}
 	cfs_rq->nr_running--;
-	se->on_rq = 0;
 }
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+	u64 period = sched_avg_period();
+	u64 now, delta;
+
+	if (!cfs_rq)
+		return;
+
+	now = rq_of(cfs_rq)->clock;
+	delta = now - cfs_rq->load_stamp;
+
+	cfs_rq->load_stamp = now;
+	cfs_rq->load_period += delta;
+	cfs_rq->load_avg += delta * cfs_rq->load.weight;
+
+	while (cfs_rq->load_period > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (cfs_rq->load_period));
+		cfs_rq->load_period /= 2;
+		cfs_rq->load_avg /= 2;
+	}
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq)
+		account_entity_dequeue(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+	if (se->on_rq)
+		account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	long load_weight, load, shares;
+
+	if (!cfs_rq)
+		return;
+
+	tg = cfs_rq->tg;
+	se = tg->se[cpu_of(rq_of(cfs_rq))];
+	if (!se)
+		return;
+
+	load = cfs_rq->load.weight;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_cfs_load(cfs_rq);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	check_spread(cfs_rq, se);
 	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
+	se->on_rq = 1;
 }
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
+	se->on_rq = 0;
+	update_cfs_load(cfs_rq);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		flags = ENQUEUE_WAKEUP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
 		flags |= DEQUEUE_SLEEP;
 	}
 
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		update_cfs_load(cfs_rq);
+		update_cfs_shares(cfs_rq);
+	}
+
 	hrtick_update(rq);
 }
 
@@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
  * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
 */
-static long effective_load(struct task_group *tg, int cpu,
-		long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
 	if (!tg->parent)
 		return wl;
 
-	/*
-	 * By not taking the decrease of shares on the other cpu into
-	 * account our error leans towards reducing the affine wakeups.
-	 */
-	if (!wl && sched_feat(ASYM_EFF_LOAD))
-		return wl;
-
 	for_each_sched_entity(se) {
 		long S, rw, s, a, b;
-		long more_w;
-
-		/*
-		 * Instead of using this increment, also add the difference
-		 * between when the shares were last updated and now.
-		 */
-		more_w = se->my_q->load.weight - se->my_q->rq_weight;
-		wl += more_w;
-		wg += more_w;
 
 		S = se->my_q->tg->shares;
-		s = se->my_q->shares;
-		rw = se->my_q->rq_weight;
+		s = se->load.weight;
+		rw = se->my_q->load.weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
@@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 			sd = tmp;
 	}
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (sched_feat(LB_SHARES_UPDATE)) {
-		/*
-		 * Pick the largest domain to update shares over
-		 */
-		tmp = sd;
-		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-			tmp = affine_sd;
-
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
-			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-#endif
-
 	if (affine_sd) {
 		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
 			return select_idle_sibling(p, cpu);
@@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3156,8 +3206,6 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (ld_moved)
-		update_shares(sd);
 	return ld_moved;
 }
 
@@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int update_next_balance = 0;
 	int need_serialize;
 
+	update_shares(cpu);
+
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;