author     Suresh Siddha <suresh.b.siddha@intel.com>    2010-02-23 19:13:52 -0500
committer  Ingo Molnar <mingo@elte.hu>                  2010-02-26 09:45:13 -0500
commit     dd5feea14a7de4edbd9f36db1a2db785de91b88d (patch)
tree       924eb9f44a03011cfc85479495a7cb68ebd62517 /kernel/sched_fair.c
parent     83ab0aa0d5623d823444db82c3b3c34d7ec364ae (diff)
sched: Fix SCHED_MC regression caused by change in sched cpu_power
On platforms like a dual-socket quad-core system, the scheduler load
balancer does not detect load imbalances in certain scenarios. This can
leave one socket completely busy (with all 4 cores running 4 tasks)
while the other socket sits completely idle. That hurts performance, as
those 4 tasks share the memory controller, last-level cache bandwidth
etc., and we also don't take advantage of turbo mode as much as we
would like.

Some of the comparisons in the scheduler load-balancing code compare
the "weighted cpu load scaled by the sched_group's cpu_power" with the
"weighted average load per task, which is not scaled by the
sched_group's cpu_power". While this has probably been broken for a
longer time (for multi-socket NUMA nodes etc.), the problem was
aggravated by this recent change:
|
| commit f93e65c186ab3c05ce2068733ca10e34fd00125e
| Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
| Date: Tue Sep 1 10:34:32 2009 +0200
|
| sched: Restore __cpu_power to a straight sum of power
|
Also with that change, the sched group cpu_power alone no longer
reflects the group capacity that is needed to implement the MC/MT
performance (default) and power-savings (user-selectable) policies.

We need to use the computed group capacity (sgs.group_capacity,
computed using the SD_PREFER_SIBLING logic in update_sd_lb_stats()) to
find out whether the group with the max load is above its capacity and
how much load to move.
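
To see the scaling mismatch concretely, here is a small stand-alone
sketch (plain user-space C, not part of the patch; the group size and
task weight below are assumed example values) comparing a per-task load
before and after it is normalized by the group's cpu_power:

  /* Stand-alone illustration of the scaling applied in
   * fix_small_imbalance(): a per-task weighted load is normalized by
   * the group's cpu_power before it is compared with avg_load values
   * that are already scaled the same way. All numbers are assumed. */
  #include <stdio.h>

  #define SCHED_LOAD_SCALE 1024UL

  int main(void)
  {
          /* Assumed: a quad-core group whose cpu_power is the straight
           * sum of its CPUs' power (4 * SCHED_LOAD_SCALE). */
          unsigned long group_cpu_power = 4 * SCHED_LOAD_SCALE;
          unsigned long busiest_load_per_task = 1024;  /* one nice-0 task */

          unsigned long scaled = busiest_load_per_task * SCHED_LOAD_SCALE
                                        / group_cpu_power;

          printf("unscaled per-task load: %lu\n", busiest_load_per_task); /* 1024 */
          printf("scaled per-task load:   %lu\n", scaled);                /*  256 */
          return 0;
  }

The unscaled per-task value is larger than its cpu_power-scaled
counterpart by roughly the group size, so mixing the two in a single
comparison skews the imbalance checks.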
Reported-by: Ma Ling <ling.ma@intel.com>
Initial-Analysis-by: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
[ -v2: build fix ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org> # [2.6.32.x, 2.6.33.x]
LKML-Reference: <1266970432.11588.22.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++---------------------------------
 1 file changed, 43 insertions(+), 33 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ff7692ccda89..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2097,6 +2097,7 @@ struct sd_lb_stats {
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
+	unsigned long busiest_group_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2416,14 +2417,12 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	unsigned long load, max_cpu_load, min_cpu_load;
 	int i;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
-	unsigned long sum_avg_load_per_task;
-	unsigned long avg_load_per_task;
+	unsigned long avg_load_per_task = 0;
 
 	if (local_group)
 		balance_cpu = group_first_cpu(group);
 
 	/* Tally up the load of all CPUs in the group */
-	sum_avg_load_per_task = avg_load_per_task = 0;
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
 
@@ -2453,7 +2452,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 		sgs->sum_nr_running += rq->nr_running;
 		sgs->sum_weighted_load += weighted_cpuload(i);
 
-		sum_avg_load_per_task += cpu_avg_load_per_task(i);
 	}
 
 	/*
@@ -2473,7 +2471,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	/* Adjust by relative CPU power of the group */
 	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
-
 	/*
 	 * Consider the group unbalanced when the imbalance is larger
 	 * than the average weight of two tasks.
@@ -2483,8 +2480,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	 *      normalized nr_running number somewhere that negates
 	 *      the hierarchy?
 	 */
-	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
-		group->cpu_power;
+	if (sgs->sum_nr_running)
+		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
 	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 		sgs->group_imb = 1;
@@ -2553,6 +2550,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		sds->max_load = sgs.avg_load;
 		sds->busiest = group;
 		sds->busiest_nr_running = sgs.sum_nr_running;
+		sds->busiest_group_capacity = sgs.group_capacity;
 		sds->busiest_load_per_task = sgs.sum_weighted_load;
 		sds->group_imb = sgs.group_imb;
 	}
@@ -2575,6 +2573,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 {
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
+	unsigned long scaled_busy_load_per_task;
 
 	if (sds->this_nr_running) {
 		sds->this_load_per_task /= sds->this_nr_running;
@@ -2585,8 +2584,12 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 		sds->this_load_per_task =
 			cpu_avg_load_per_task(this_cpu);
 
-	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
-			sds->busiest_load_per_task * imbn) {
+	scaled_busy_load_per_task = sds->busiest_load_per_task
+						 * SCHED_LOAD_SCALE;
+	scaled_busy_load_per_task /= sds->busiest->cpu_power;
+
+	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+			(scaled_busy_load_per_task * imbn)) {
 		*imbalance = sds->busiest_load_per_task;
 		return;
 	}
@@ -2637,7 +2640,14 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		unsigned long *imbalance)
 {
-	unsigned long max_pull;
+	unsigned long max_pull, load_above_capacity = ~0UL;
+
+	sds->busiest_load_per_task /= sds->busiest_nr_running;
+	if (sds->group_imb) {
+		sds->busiest_load_per_task =
+			min(sds->busiest_load_per_task, sds->avg_load);
+	}
+
 	/*
 	 * In the presence of smp nice balancing, certain scenarios can have
 	 * max load less than avg load(as we skip the groups at or below
@@ -2648,9 +2658,29 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 	}
 
-	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(sds->max_load - sds->avg_load,
-			sds->max_load - sds->busiest_load_per_task);
+	if (!sds->group_imb) {
+		/*
+		 * Don't want to pull so many tasks that a group would go idle.
+		 */
+		load_above_capacity = (sds->busiest_nr_running -
+						sds->busiest_group_capacity);
+
+		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+
+		load_above_capacity /= sds->busiest->cpu_power;
+	}
+
+	/*
+	 * We're trying to get all the cpus to the average_load, so we don't
+	 * want to push ourselves above the average load, nor do we wish to
+	 * reduce the max loaded cpu below the average load. At the same time,
+	 * we also don't want to reduce the group load below the group capacity
+	 * (so that we can implement power-savings policies etc). Thus we look
+	 * for the minimum possible imbalance.
+	 * Be careful of negative numbers as they'll appear as very large values
+	 * with unsigned longs.
+	 */
+	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
 	*imbalance = min(max_pull * sds->busiest->cpu_power,
@@ -2718,7 +2748,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 *   4) This group is more busy than the avg busieness at this
 	 *      sched_domain.
 	 *   5) The imbalance is within the specified limit.
-	 *   6) Any rebalance would lead to ping-pong
 	 */
 	if (!(*balance))
 		goto ret;
@@ -2737,25 +2766,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
-	sds.busiest_load_per_task /= sds.busiest_nr_running;
-	if (sds.group_imb)
-		sds.busiest_load_per_task =
-			min(sds.busiest_load_per_task, sds.avg_load);
-
-	/*
-	 * We're trying to get all the cpus to the average_load, so we don't
-	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded cpu below the average load, as either of these
-	 * actions would just result in more rebalancing later, and ping-pong
-	 * tasks around. Thus we look for the minimum possible imbalance.
-	 * Negative imbalances (*we* are more loaded than anyone else) will
-	 * be counted as no imbalance for these purposes -- we can't fix that
-	 * by pulling tasks to us. Be careful of negative numbers as they'll
-	 * appear as very large values with unsigned longs.
-	 */
-	if (sds.max_load <= sds.busiest_load_per_task)
-		goto out_balanced;
-
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
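
As a usage-style illustration of the new imbalance math (again a
stand-alone user-space sketch with assumed values, not part of the
patch), the snippet below mirrors the load_above_capacity / max_pull
computation that calculate_imbalance() now performs for a busiest group
that is not flagged group_imb:

  /* Stand-alone sketch of the new max_pull arithmetic. All values are
   * assumed: a 4-CPU busiest group running 5 nice-0 tasks with a group
   * capacity of 4, and cpu_power-scaled loads for the group and the
   * sched domain. */
  #include <stdio.h>

  #define SCHED_LOAD_SCALE 1024UL

  static unsigned long min_ul(unsigned long a, unsigned long b)
  {
          return a < b ? a : b;
  }

  int main(void)
  {
          unsigned long busiest_cpu_power      = 4 * SCHED_LOAD_SCALE;
          unsigned long busiest_nr_running     = 5;
          unsigned long busiest_group_capacity = 4;
          unsigned long max_load               = 1280; /* busiest group, scaled */
          unsigned long avg_load               = 768;  /* domain average, scaled */

          /* Load carried beyond the group's capacity, scaled by cpu_power. */
          unsigned long load_above_capacity =
                  (busiest_nr_running - busiest_group_capacity)
                  * SCHED_LOAD_SCALE * SCHED_LOAD_SCALE / busiest_cpu_power;

          /* Pull no more than needed to reach avg_load, and no more than
           * what exceeds the group's capacity. */
          unsigned long max_pull = min_ul(max_load - avg_load,
                                          load_above_capacity);

          printf("load_above_capacity = %lu\n", load_above_capacity); /* 256 */
          printf("max_pull            = %lu\n", max_pull);            /* 256 */
          return 0;
  }

With these assumed numbers the imbalance is capped at one task's worth
of cpu_power-scaled load (256): enough to bring the group back to its
capacity without pushing the local group above the domain average.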