author     Nikhil Rao <ncrao@google.com>    2010-10-15 16:12:29 -0400
committer  Ingo Molnar <mingo@elte.hu>      2010-10-18 14:52:18 -0400
commit     fab476228ba37907ad75216d0fd9732ada9c119e (patch)
tree       51246ee236cbaae94e18213e5419f9e9f244d69d
parent     2582f0eba54066b5e98ff2b27ef0cfa833b59f54 (diff)
sched: Force balancing on newidle balance if local group has capacity
This patch forces a load balance on a newly idle cpu when the local group
has extra capacity and the busiest group does not have any. It improves
system utilization when balancing tasks with a large weight differential.

Under certain situations, such as a niced down task (i.e. nice = -15) in
the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group
and kicks away other tasks because of its large weight. This leads to
sub-optimal utilization of the machine. Even though the sched group has
capacity, it does not pull tasks because sds.this_load >> sds.max_load,
and f_b_g() returns NULL.

With this patch, if the local group has extra capacity, we shortcut the
checks in f_b_g() and try to pull a task over. A sched group has extra
capacity if the group capacity is greater than the number of running
tasks in that group.

Thanks to Mike Galbraith for discussions leading to this patch and for
the insight to reuse SD_NEWIDLE_BALANCE.

Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1287173550-30365-4-git-send-email-ncrao@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  kernel/sched_fair.c  28
1 file changed, 25 insertions, 3 deletions
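
Before the diff itself, a minimal sketch of the decision this change introduces may help. This is not the kernel implementation: the group_stats struct and the should_force_newidle_balance() helper below are simplified, hypothetical stand-ins for the sd_lb_stats/sg_lb_stats fields and the shortcut added to find_busiest_group(). The idea, taken from the commit message, is that a group "has capacity" when its group capacity exceeds its number of running tasks, and a newly idle cpu forces a pull when the local group has such spare capacity while the busiest group has none.

/*
 * Standalone sketch of the capacity shortcut (not kernel code).
 * Build with: cc -std=c99 -Wall sketch.c
 */
#include <stdio.h>
#include <stdbool.h>

struct group_stats {
	unsigned long group_capacity;	/* roughly, how many tasks the group can run at full weight */
	unsigned long sum_nr_running;	/* tasks currently running in the group */
};

/* A group has extra capacity if it can run more tasks than it currently does. */
static bool group_has_capacity(const struct group_stats *g)
{
	return g->group_capacity > g->sum_nr_running;
}

/*
 * On a newly idle cpu, skip the usual load comparisons and force a pull
 * when the local group has spare capacity and the busiest group has none.
 */
static bool should_force_newidle_balance(bool newly_idle,
					 const struct group_stats *local,
					 const struct group_stats *busiest)
{
	return newly_idle &&
	       group_has_capacity(local) &&
	       !group_has_capacity(busiest);
}

int main(void)
{
	/* e.g. a heavily niced task crowding a 2-cpu group while the local group sits idle */
	struct group_stats local   = { .group_capacity = 2, .sum_nr_running = 0 };
	struct group_stats busiest = { .group_capacity = 2, .sum_nr_running = 3 };

	printf("force balance: %s\n",
	       should_force_newidle_balance(true, &local, &busiest) ? "yes" : "no");
	return 0;
}

In the patch below, the capacity test corresponds to the new group_has_capacity flag set in update_sg_lb_stats(), and the newly-idle shortcut corresponds to the CPU_NEWLY_IDLE check that jumps to the new force_balance label in find_busiest_group().
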
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3656480e0f79..032b548be0fc 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
+
+	/* re-arm NEWIDLE balancing when moving tasks */
+	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+	this_rq->idle_stamp = 0;
 }
 
 /*
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
 	unsigned long this_load;
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
+	unsigned long this_has_capacity;
 
 	/* Statistics of the busiest group */
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
+	unsigned long busiest_has_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
 	int group_imb; /* Is there an imbalance in the group ? */
+	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2456,6 +2463,9 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+
+	if (sgs->group_capacity > sgs->sum_nr_running)
+		sgs->group_has_capacity = 1;
 }
 
 /**
@@ -2554,12 +2564,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		sds->this = sg;
 		sds->this_nr_running = sgs.sum_nr_running;
 		sds->this_load_per_task = sgs.sum_weighted_load;
+		sds->this_has_capacity = sgs.group_has_capacity;
 	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 		sds->max_load = sgs.avg_load;
 		sds->busiest = sg;
 		sds->busiest_nr_running = sgs.sum_nr_running;
 		sds->busiest_group_capacity = sgs.group_capacity;
 		sds->busiest_load_per_task = sgs.sum_weighted_load;
+		sds->busiest_has_capacity = sgs.group_has_capacity;
 		sds->group_imb = sgs.group_imb;
 	}
 
@@ -2756,6 +2768,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2807,6 +2820,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * 4) This group is more busy than the avg busieness at this
 	 *    sched_domain.
 	 * 5) The imbalance is within the specified limit.
+	 *
+	 * Note: when doing newidle balance, if the local group has excess
+	 * capacity (i.e. nr_running < group_capacity) and the busiest group
+	 * does not have any capacity, we force a load balance to pull tasks
+	 * to the local group. In this case, we skip past checks 3, 4 and 5.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -2818,6 +2836,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+			!sds.busiest_has_capacity)
+		goto force_balance;
+
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
@@ -2829,6 +2852,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
+force_balance:
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
@@ -3162,10 +3186,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task) {
-			this_rq->idle_stamp = 0;
+		if (pulled_task)
 			break;
-		}
 	}
 
 	raw_spin_lock(&this_rq->lock);