diff options
| author | Suresh Siddha <suresh.b.siddha@intel.com> | 2010-09-17 18:02:32 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2010-11-10 17:13:56 -0500 |
| commit | aae6d3ddd8b90f5b2c8d79a2b914d1706d124193 (patch) | |
| tree | b993f929f4b1cc38ef01094ff4504eaf358adb31 /kernel | |
| parent | f6614b7bb405a9b35dd28baea989a749492c46b2 (diff) | |
sched: Use group weight, idle cpu metrics to fix imbalances during idle
Currently we consider a sched domain to be well balanced when the imbalance
is less than the domain's imbalance_pct. As the number of cores and threads
is increasing, current values of imbalance_pct (for example 25% for a
NUMA domain) are not enough to detect imbalances like:
a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads),
24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on another
socket, leading to an idle HT cpu.
b) On a hypothetical 2 socket NHM-EX system (each socket having 8 cores and
16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one
socket and 7 on another socket, leaving one core in a socket idle
whereas in another socket we have a core having both its HT siblings busy.
While this issue can be fixed by decreasing the domain's imbalance_pct
(by making it a function of number of logical cpus in the domain), it
can potentially cause more task migrations across sched groups in an
overloaded case.
Fix this by using imbalance_pct only during newly_idle and busy
load balancing. And during idle load balancing, check if there
is an imbalance in the number of idle cpus across the busiest and this
sched_group or if the busiest group has more tasks than its weight that
the idle cpu in this_group can pull.
Reported-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched.c | 2 | ||||
| -rw-r--r-- | kernel/sched_fair.c | 34 |
2 files changed, 33 insertions, 3 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index aa14a56f9d03..36a088018fe0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -6960,6 +6960,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6960 | if (cpu != group_first_cpu(sd->groups)) | 6960 | if (cpu != group_first_cpu(sd->groups)) |
| 6961 | return; | 6961 | return; |
| 6962 | 6962 | ||
| 6963 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | ||
| 6964 | |||
| 6963 | child = sd->child; | 6965 | child = sd->child; |
| 6964 | 6966 | ||
| 6965 | sd->groups->cpu_power = 0; | 6967 | sd->groups->cpu_power = 0; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..034c4f410b36 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -2035,13 +2035,16 @@ struct sd_lb_stats { | |||
| 2035 | unsigned long this_load_per_task; | 2035 | unsigned long this_load_per_task; |
| 2036 | unsigned long this_nr_running; | 2036 | unsigned long this_nr_running; |
| 2037 | unsigned long this_has_capacity; | 2037 | unsigned long this_has_capacity; |
| 2038 | unsigned int this_idle_cpus; | ||
| 2038 | 2039 | ||
| 2039 | /* Statistics of the busiest group */ | 2040 | /* Statistics of the busiest group */ |
| 2041 | unsigned int busiest_idle_cpus; | ||
| 2040 | unsigned long max_load; | 2042 | unsigned long max_load; |
| 2041 | unsigned long busiest_load_per_task; | 2043 | unsigned long busiest_load_per_task; |
| 2042 | unsigned long busiest_nr_running; | 2044 | unsigned long busiest_nr_running; |
| 2043 | unsigned long busiest_group_capacity; | 2045 | unsigned long busiest_group_capacity; |
| 2044 | unsigned long busiest_has_capacity; | 2046 | unsigned long busiest_has_capacity; |
| 2047 | unsigned int busiest_group_weight; | ||
| 2045 | 2048 | ||
| 2046 | int group_imb; /* Is there imbalance in this sd */ | 2049 | int group_imb; /* Is there imbalance in this sd */ |
| 2047 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2050 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| @@ -2063,6 +2066,8 @@ struct sg_lb_stats { | |||
| 2063 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | 2066 | unsigned long sum_nr_running; /* Nr tasks running in the group */ |
| 2064 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 2067 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 2065 | unsigned long group_capacity; | 2068 | unsigned long group_capacity; |
| 2069 | unsigned long idle_cpus; | ||
| 2070 | unsigned long group_weight; | ||
| 2066 | int group_imb; /* Is there an imbalance in the group ? */ | 2071 | int group_imb; /* Is there an imbalance in the group ? */ |
| 2067 | int group_has_capacity; /* Is there extra capacity in the group? */ | 2072 | int group_has_capacity; /* Is there extra capacity in the group? */ |
| 2068 | }; | 2073 | }; |
| @@ -2431,7 +2436,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2431 | sgs->group_load += load; | 2436 | sgs->group_load += load; |
| 2432 | sgs->sum_nr_running += rq->nr_running; | 2437 | sgs->sum_nr_running += rq->nr_running; |
| 2433 | sgs->sum_weighted_load += weighted_cpuload(i); | 2438 | sgs->sum_weighted_load += weighted_cpuload(i); |
| 2434 | 2439 | if (idle_cpu(i)) | |
| 2440 | sgs->idle_cpus++; | ||
| 2435 | } | 2441 | } |
| 2436 | 2442 | ||
| 2437 | /* | 2443 | /* |
| @@ -2469,6 +2475,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2469 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2475 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
| 2470 | if (!sgs->group_capacity) | 2476 | if (!sgs->group_capacity) |
| 2471 | sgs->group_capacity = fix_small_capacity(sd, group); | 2477 | sgs->group_capacity = fix_small_capacity(sd, group); |
| 2478 | sgs->group_weight = group->group_weight; | ||
| 2472 | 2479 | ||
| 2473 | if (sgs->group_capacity > sgs->sum_nr_running) | 2480 | if (sgs->group_capacity > sgs->sum_nr_running) |
| 2474 | sgs->group_has_capacity = 1; | 2481 | sgs->group_has_capacity = 1; |
| @@ -2576,13 +2583,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2576 | sds->this_nr_running = sgs.sum_nr_running; | 2583 | sds->this_nr_running = sgs.sum_nr_running; |
| 2577 | sds->this_load_per_task = sgs.sum_weighted_load; | 2584 | sds->this_load_per_task = sgs.sum_weighted_load; |
| 2578 | sds->this_has_capacity = sgs.group_has_capacity; | 2585 | sds->this_has_capacity = sgs.group_has_capacity; |
| 2586 | sds->this_idle_cpus = sgs.idle_cpus; | ||
| 2579 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { | 2587 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
| 2580 | sds->max_load = sgs.avg_load; | 2588 | sds->max_load = sgs.avg_load; |
| 2581 | sds->busiest = sg; | 2589 | sds->busiest = sg; |
| 2582 | sds->busiest_nr_running = sgs.sum_nr_running; | 2590 | sds->busiest_nr_running = sgs.sum_nr_running; |
| 2591 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
| 2583 | sds->busiest_group_capacity = sgs.group_capacity; | 2592 | sds->busiest_group_capacity = sgs.group_capacity; |
| 2584 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2593 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
| 2585 | sds->busiest_has_capacity = sgs.group_has_capacity; | 2594 | sds->busiest_has_capacity = sgs.group_has_capacity; |
| 2595 | sds->busiest_group_weight = sgs.group_weight; | ||
| 2586 | sds->group_imb = sgs.group_imb; | 2596 | sds->group_imb = sgs.group_imb; |
| 2587 | } | 2597 | } |
| 2588 | 2598 | ||
| @@ -2860,8 +2870,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2860 | if (sds.this_load >= sds.avg_load) | 2870 | if (sds.this_load >= sds.avg_load) |
| 2861 | goto out_balanced; | 2871 | goto out_balanced; |
| 2862 | 2872 | ||
| 2863 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | 2873 | /* |
| 2864 | goto out_balanced; | 2874 | * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. |
| 2875 | * And to check for busy balance use !idle_cpu instead of | ||
| 2876 | * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE | ||
| 2877 | * even when they are idle. | ||
| 2878 | */ | ||
| 2879 | if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { | ||
| 2880 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
| 2881 | goto out_balanced; | ||
| 2882 | } else { | ||
| 2883 | /* | ||
| 2884 | * This cpu is idle. If the busiest group load doesn't | ||
| 2885 | * have more tasks than the number of available cpu's and | ||
| 2886 | * there is no imbalance between this and busiest group | ||
| 2887 | * wrt to idle cpu's, it is balanced. | ||
| 2888 | */ | ||
| 2889 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | ||
| 2890 | sds.busiest_nr_running <= sds.busiest_group_weight) | ||
| 2891 | goto out_balanced; | ||
| 2892 | } | ||
| 2865 | 2893 | ||
| 2866 | force_balance: | 2894 | force_balance: |
| 2867 | /* Looks like there is an imbalance. Compute it */ | 2895 | /* Looks like there is an imbalance. Compute it */ |
