author    Suresh Siddha <suresh.b.siddha@intel.com>  2010-03-11 03:45:44 -0500
committer Greg Kroah-Hartman <gregkh@suse.de>        2010-04-01 19:01:23 -0400
commit    f140b5c7d133655ae9c541f51218a278d24bba60 (patch)
tree      c8c9400680851847d2cba9424fd28827436e3ec9 /kernel
parent    84303658a8fd2bed8e27dacc25643a69dc7426fb (diff)
sched: Fix SCHED_MC regression caused by change in sched cpu_power
commit dd5feea14a7de4edbd9f36db1a2db785de91b88d upstream
On platforms like a dual-socket quad-core system, the scheduler load
balancer fails to detect load imbalances in certain scenarios. This
leads to situations where one socket is completely busy (all 4 cores
running 4 tasks) while the other socket is left completely idle. That
hurts performance, as those 4 tasks share the memory controller and
last-level cache bandwidth etc. We also don't take advantage of
turbo mode as much as we would like.
Some comparisons in the scheduler load-balancing code compare the
"weighted cpu load scaled wrt the sched_group's cpu_power" against the
"weighted average load per task, which is not scaled wrt the
sched_group's cpu_power". While this has probably been broken for
longer (for multi-socket NUMA nodes etc.), the problem was aggravated
by this recent change (a small sketch of the mismatch follows the
quoted commit below):
|
| commit f93e65c186ab3c05ce2068733ca10e34fd00125e
| Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
| Date: Tue Sep 1 10:34:32 2009 +0200
|
| sched: Restore __cpu_power to a straight sum of power
|
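To make the unit mismatch concrete, here is a small standalone
user-space sketch (not kernel code; the dual-socket quad-core numbers
are made up, mirroring the scenario above). It mimics the arithmetic
of the old find_busiest_group() path, where max_load is scaled by the
group's cpu_power while busiest_load_per_task is not:

/* Standalone sketch of the pre-patch comparison; not kernel code. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* weight of one nice-0 task */

int main(void)
{
	/* Hypothetical busiest group: one quad-core socket, 4 nice-0 tasks. */
	unsigned long group_load = 4 * SCHED_LOAD_SCALE; /* sum of weighted cpu loads */
	unsigned long cpu_power  = 4 * SCHED_LOAD_SCALE; /* straight sum after f93e65c */
	unsigned long nr_running = 4;

	/* Scaled wrt cpu_power, as update_sg_lb_stats() computes avg_load. */
	unsigned long max_load = group_load * SCHED_LOAD_SCALE / cpu_power;

	/* Plain weighted load per task, not scaled by cpu_power. */
	unsigned long busiest_load_per_task = group_load / nr_running;

	printf("max_load (scaled)             = %lu\n", max_load);              /* 1024 */
	printf("busiest_load_per_task (plain) = %lu\n", busiest_load_per_task); /* 1024 */

	/*
	 * The old check in find_busiest_group() (removed by this patch)
	 * compared these two directly, so the fully busy socket looked
	 * "balanced" even though the other socket was idle.
	 */
	if (max_load <= busiest_load_per_task)
		printf("old code: out_balanced, nothing is pulled\n");

	return 0;
}

With the patch that check is gone, and fix_small_imbalance() instead
compares against a scaled_busy_load_per_task that has been scaled by
the busiest group's cpu_power, so both sides of the comparison use the
same units.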
Also, with this change the sched group's cpu_power alone no longer
reflects the group capacity that is needed to implement the MC/MT
performance (default) and power-savings (user-selectable) policies.
We need to use the computed group capacity (sgs.group_capacity,
computed via the SD_PREFER_SIBLING logic in update_sd_lb_stats()) to
determine whether the group with the max load is above its capacity
and how much load to move.
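As a rough illustration of how the patched calculate_imbalance() uses
that capacity (again a standalone sketch with hypothetical numbers,
not kernel code): for a busiest group with 6 running nice-0 tasks, a
group capacity of 4 and a cpu_power of 4096, the load corresponding to
the two excess tasks bounds how much the balancer tries to pull:

/* Sketch of the load_above_capacity arithmetic this patch adds;
 * all numbers below are hypothetical. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	/* Hypothetical busiest group: capacity for 4 tasks, running 6. */
	unsigned long busiest_nr_running     = 6;
	unsigned long busiest_group_capacity = 4;	/* sgs.group_capacity */
	unsigned long busiest_cpu_power      = 4 * SCHED_LOAD_SCALE;

	/* Domain-wide loads, already scaled by cpu_power (other socket idle). */
	unsigned long max_load = 1536;	/* busiest group's avg_load */
	unsigned long avg_load = 768;	/* domain average */

	/* Excess tasks expressed in the same scaled units as max_load. */
	unsigned long load_above_capacity =
		busiest_nr_running - busiest_group_capacity;
	load_above_capacity *= SCHED_LOAD_SCALE * SCHED_LOAD_SCALE;
	load_above_capacity /= busiest_cpu_power;	/* (6-4)*1024*1024/4096 = 512 */

	/*
	 * Pull no more than what brings the busiest group down to the
	 * domain average, and no more than its load above capacity, so the
	 * group is not drained below the capacity the MC/power-savings
	 * policies rely on.
	 */
	unsigned long max_pull = (load_above_capacity < (max_load - avg_load)) ?
				 load_above_capacity : (max_load - avg_load);

	printf("load_above_capacity = %lu\n", load_above_capacity);	/* 512 */
	printf("max_pull            = %lu\n", max_pull);		/* 512, not 768 */
	return 0;
}

The hunks below wire this bound into calculate_imbalance() (guarded by
!sds->group_imb) via the new busiest_group_capacity field.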
Reported-by: Ma Ling <ling.ma@intel.com>
Initial-Analysis-by: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
[ -v2: build fix ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1266970432.11588.22.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched.c | 76
1 file changed, 43 insertions(+), 33 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 00a59b090e6f..7ca934588ec4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3423,6 +3423,7 @@ struct sd_lb_stats {
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
+	unsigned long busiest_group_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3742,8 +3743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	unsigned long load, max_cpu_load, min_cpu_load;
 	int i;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
-	unsigned long sum_avg_load_per_task;
-	unsigned long avg_load_per_task;
+	unsigned long avg_load_per_task = 0;
 
 	if (local_group) {
 		balance_cpu = group_first_cpu(group);
@@ -3752,7 +3752,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	}
 
 	/* Tally up the load of all CPUs in the group */
-	sum_avg_load_per_task = avg_load_per_task = 0;
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
 
@@ -3782,7 +3781,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 		sgs->sum_nr_running += rq->nr_running;
 		sgs->sum_weighted_load += weighted_cpuload(i);
 
-		sum_avg_load_per_task += cpu_avg_load_per_task(i);
 	}
 
 	/*
@@ -3800,7 +3798,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	/* Adjust by relative CPU power of the group */
 	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
-
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
 	 * than the average weight of two tasks.
@@ -3810,8 +3807,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 * normalized nr_running number somewhere that negates
 	 * the hierarchy?
 	 */
-	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
-		group->cpu_power;
+	if (sgs->sum_nr_running)
+		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
 	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 		sgs->group_imb = 1;
@@ -3880,6 +3877,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->max_load = sgs.avg_load;
 			sds->busiest = group;
 			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
 			sds->group_imb = sgs.group_imb;
 		}
@@ -3902,6 +3900,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 {
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
+	unsigned long scaled_busy_load_per_task;
 
 	if (sds->this_nr_running) {
 		sds->this_load_per_task /= sds->this_nr_running;
@@ -3912,8 +3911,12 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 		sds->this_load_per_task =
 			cpu_avg_load_per_task(this_cpu);
 
-	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
-			sds->busiest_load_per_task * imbn) {
+	scaled_busy_load_per_task = sds->busiest_load_per_task
+						 * SCHED_LOAD_SCALE;
+	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+
+	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+			(scaled_busy_load_per_task * imbn)) {
 		*imbalance = sds->busiest_load_per_task;
 		return;
 	}
@@ -3964,7 +3967,14 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, int this_cpu,
 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		unsigned long *imbalance)
 {
-	unsigned long max_pull;
+	unsigned long max_pull, load_above_capacity = ~0UL;
+
+	sds->busiest_load_per_task /= sds->busiest_nr_running;
+	if (sds->group_imb) {
+		sds->busiest_load_per_task =
+			min(sds->busiest_load_per_task, sds->avg_load);
+	}
+
 	/*
 	 * In the presence of smp nice balancing, certain scenarios can have
 	 * max load less than avg load(as we skip the groups at or below
@@ -3975,9 +3985,29 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 	}
 
-	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(sds->max_load - sds->avg_load,
-			sds->max_load - sds->busiest_load_per_task);
+	if (!sds->group_imb) {
+		/*
+		 * Don't want to pull so many tasks that a group would go idle.
+		 */
+		load_above_capacity = (sds->busiest_nr_running -
+						sds->busiest_group_capacity);
+
+		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+
+		load_above_capacity /= sds->busiest->cpu_power;
+	}
+
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load. At the same time,
+	 * we also don't want to reduce the group load below the group capacity
+	 * (so that we can implement power-savings policies etc). Thus we look
+	 * for the minimum possible imbalance.
+	 * Be careful of negative numbers as they'll appear as very large values
+	 * with unsigned longs.
+	 */
+	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * sds->busiest->cpu_power,
@@ -4045,7 +4075,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 *    4) This group is more busy than the avg busieness at this
 	 *       sched_domain.
 	 *    5) The imbalance is within the specified limit.
-	 *    6) Any rebalance would lead to ping-pong
 	 */
 	if (balance && !(*balance))
 		goto ret;
@@ -4064,25 +4093,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
-	sds.busiest_load_per_task /= sds.busiest_nr_running;
-	if (sds.group_imb)
-		sds.busiest_load_per_task =
-			min(sds.busiest_load_per_task, sds.avg_load);
-
-	/*
-	 * We're trying to get all the cpus to the average_load, so we don't
-	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded cpu below the average load, as either of these
-	 * actions would just result in more rebalancing later, and ping-pong
-	 * tasks around. Thus we look for the minimum possible imbalance.
-	 * Negative imbalances (*we* are more loaded than anyone else) will
-	 * be counted as no imbalance for these purposes -- we can't fix that
-	 * by pulling tasks to us. Be careful of negative numbers as they'll
-	 * appear as very large values with unsigned longs.
-	 */
-	if (sds.max_load <= sds.busiest_load_per_task)
-		goto out_balanced;
-
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;