sched: Use group weight, idle cpu metrics to fix imbalances during idle

Currently we consider a sched domain to be well balanced when the imbalance is less than the domain's imbalance_pct. As the number of cores and threads is increasing, current values of imbalance_pct (for example 25% for a NUMA domain) are not enough to detect imbalances like: a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads), 24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on another socket, leading to an idle HT cpu. b) On a hypothetical 2-socket NHM-EX system (each socket having 8 cores and 16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one socket and 7 on another socket, leaving one core in a socket idle whereas in another socket we have a core having both its HT siblings busy. While this issue can be fixed by decreasing the domain's imbalance_pct (by making it a function of the number of logical cpus in the domain), it can potentially cause more task migrations across sched groups in an overloaded case. Fix this by using imbalance_pct only during newly_idle and busy load balancing. During idle load balancing, instead check whether there is an imbalance in the number of idle cpus between the busiest group and this sched_group, or whether the busiest group has more tasks than its weight that the idle cpu in this_group can pull. Reported-by: Nikhil Rao <ncrao@google.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Suresh Siddha <suresh.b.siddha@intel.com> 2010-09-17 18:02:32 -0400
committer: Ingo Molnar <mingo@elte.hu> 2010-11-10 17:13:56 -0500
commit: aae6d3ddd8b90f5b2c8d79a2b914d1706d124193 (patch)
tree: b993f929f4b1cc38ef01094ff4504eaf358adb31
parent: f6614b7bb405a9b35dd28baea989a749492c46b2 (diff)
3 files changed, 34 insertions, 3 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0036e52a24a..2c79e921a68b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -862,6 +862,7 @@ struct sched_group {
         * single CPU.
         */
        unsigned int cpu_power, cpu_power_orig;
+        unsigned int group_weight;
        /*
         * The CPUs this group covers.
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..36a088018fe0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6960,6 +6960,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        if (cpu != group_first_cpu(sd->groups))
                return;
+        sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
        child = sd->child;
        sd->groups->cpu_power = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..034c4f410b36 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2035,13 +2035,16 @@ struct sd_lb_stats {
        unsigned long this_load_per_task;
        unsigned long this_nr_running;
        unsigned long this_has_capacity;
+        unsigned int  this_idle_cpus;
        /* Statistics of the busiest group */
+        unsigned int  busiest_idle_cpus;
        unsigned long max_load;
        unsigned long busiest_load_per_task;
        unsigned long busiest_nr_running;
        unsigned long busiest_group_capacity;
        unsigned long busiest_has_capacity;
+        unsigned int  busiest_group_weight;
        int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2066,8 @@ struct sg_lb_stats {
        unsigned long sum_nr_running; /* Nr tasks running in the group */
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long group_capacity;
+        unsigned long idle_cpus;
+        unsigned long group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
 };
@@ -2431,7 +2436,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                sgs->group_load += load;
                sgs->sum_nr_running += rq->nr_running;
                sgs->sum_weighted_load += weighted_cpuload(i);
+                if (idle_cpu(i))
+                        sgs->idle_cpus++;
        }
        /*
@@ -2469,6 +2475,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
        if (!sgs->group_capacity)
                sgs->group_capacity = fix_small_capacity(sd, group);
+        sgs->group_weight = group->group_weight;
        if (sgs->group_capacity > sgs->sum_nr_running)
                sgs->group_has_capacity = 1;
@@ -2576,13 +2583,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        sds->this_nr_running = sgs.sum_nr_running;
                        sds->this_load_per_task = sgs.sum_weighted_load;
                        sds->this_has_capacity = sgs.group_has_capacity;
+                        sds->this_idle_cpus = sgs.idle_cpus;
                } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                        sds->max_load = sgs.avg_load;
                        sds->busiest = sg;
                        sds->busiest_nr_running = sgs.sum_nr_running;
+                        sds->busiest_idle_cpus = sgs.idle_cpus;
                        sds->busiest_group_capacity = sgs.group_capacity;
                        sds->busiest_load_per_task = sgs.sum_weighted_load;
                        sds->busiest_has_capacity = sgs.group_has_capacity;
+                        sds->busiest_group_weight = sgs.group_weight;
                        sds->group_imb = sgs.group_imb;
                }
@@ -2860,8 +2870,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        if (sds.this_load >= sds.avg_load)
                goto out_balanced;
-        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+        /*
-                goto out_balanced;
+         * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+         * And to check for busy balance use !idle_cpu instead of
+         * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+         * even when they are idle.
+         */
+        if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+                if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                        goto out_balanced;
+        } else {
+                /*
+                 * This cpu is idle. If the busiest group load doesn't
+                 * have more tasks than the number of available cpu's and
+                 * there is no imbalance between this and busiest group
+                 * wrt to idle cpu's, it is balanced.
+                 */
+                if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+                    sds.busiest_nr_running <= sds.busiest_group_weight)
+                        goto out_balanced;
+        }
 force_balance:
        /* Looks like there is an imbalance. Compute it */
author	Suresh Siddha <suresh.b.siddha@intel.com>	2010-09-17 18:02:32 -0400
committer	Ingo Molnar <mingo@elte.hu>	2010-11-10 17:13:56 -0500
commit	aae6d3ddd8b90f5b2c8d79a2b914d1706d124193 (patch)
tree	b993f929f4b1cc38ef01094ff4504eaf358adb31
parent	f6614b7bb405a9b35dd28baea989a749492c46b2 (diff)

diff --git a/include/linux/sched.h b/include/linux/sched.h index d0036e52a24a..2c79e921a68b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h
@@ -862,6 +862,7 @@ struct sched_group {
862	* single CPU.	862	* single CPU.
863	*/	863	*/
864	unsigned int cpu_power, cpu_power_orig;	864	unsigned int cpu_power, cpu_power_orig;
		865	unsigned int group_weight;
865		866
866	/*	867	/*
867	* The CPUs this group covers.	868	* The CPUs this group covers.


diff --git a/kernel/sched.c b/kernel/sched.c index aa14a56f9d03..36a088018fe0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -6960,6 +6960,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960	if (cpu != group_first_cpu(sd->groups))	6960	if (cpu != group_first_cpu(sd->groups))
6961	return;	6961	return;
6962		6962
		6963	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
		6964
6963	child = sd->child;	6965	child = sd->child;
6964		6966
6965	sd->groups->cpu_power = 0;	6967	sd->groups->cpu_power = 0;


diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..034c4f410b36 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -2035,13 +2035,16 @@ struct sd_lb_stats {
2035	unsigned long this_load_per_task;	2035	unsigned long this_load_per_task;
2036	unsigned long this_nr_running;	2036	unsigned long this_nr_running;
2037	unsigned long this_has_capacity;	2037	unsigned long this_has_capacity;
		2038	unsigned int this_idle_cpus;
2038		2039
2039	/* Statistics of the busiest group */	2040	/* Statistics of the busiest group */
		2041	unsigned int busiest_idle_cpus;
2040	unsigned long max_load;	2042	unsigned long max_load;
2041	unsigned long busiest_load_per_task;	2043	unsigned long busiest_load_per_task;
2042	unsigned long busiest_nr_running;	2044	unsigned long busiest_nr_running;
2043	unsigned long busiest_group_capacity;	2045	unsigned long busiest_group_capacity;
2044	unsigned long busiest_has_capacity;	2046	unsigned long busiest_has_capacity;
		2047	unsigned int busiest_group_weight;
2045		2048
2046	int group_imb; /* Is there imbalance in this sd */	2049	int group_imb; /* Is there imbalance in this sd */
2047	#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)	2050	#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2066,8 @@ struct sg_lb_stats {
2063	unsigned long sum_nr_running; /* Nr tasks running in the group */	2066	unsigned long sum_nr_running; /* Nr tasks running in the group */
2064	unsigned long sum_weighted_load; /* Weighted load of group's tasks */	2067	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065	unsigned long group_capacity;	2068	unsigned long group_capacity;
		2069	unsigned long idle_cpus;
		2070	unsigned long group_weight;
2066	int group_imb; /* Is there an imbalance in the group ? */	2071	int group_imb; /* Is there an imbalance in the group ? */
2067	int group_has_capacity; /* Is there extra capacity in the group? */	2072	int group_has_capacity; /* Is there extra capacity in the group? */
2068	};	2073	};
@@ -2431,7 +2436,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431	sgs->group_load += load;	2436	sgs->group_load += load;
2432	sgs->sum_nr_running += rq->nr_running;	2437	sgs->sum_nr_running += rq->nr_running;
2433	sgs->sum_weighted_load += weighted_cpuload(i);	2438	sgs->sum_weighted_load += weighted_cpuload(i);
2434		2439	if (idle_cpu(i))
		2440	sgs->idle_cpus++;
2435	}	2441	}
2436		2442
2437	/*	2443	/*
@@ -2469,6 +2475,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);	2475	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470	if (!sgs->group_capacity)	2476	if (!sgs->group_capacity)
2471	sgs->group_capacity = fix_small_capacity(sd, group);	2477	sgs->group_capacity = fix_small_capacity(sd, group);
		2478	sgs->group_weight = group->group_weight;
2472		2479
2473	if (sgs->group_capacity > sgs->sum_nr_running)	2480	if (sgs->group_capacity > sgs->sum_nr_running)
2474	sgs->group_has_capacity = 1;	2481	sgs->group_has_capacity = 1;
@@ -2576,13 +2583,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576	sds->this_nr_running = sgs.sum_nr_running;	2583	sds->this_nr_running = sgs.sum_nr_running;
2577	sds->this_load_per_task = sgs.sum_weighted_load;	2584	sds->this_load_per_task = sgs.sum_weighted_load;
2578	sds->this_has_capacity = sgs.group_has_capacity;	2585	sds->this_has_capacity = sgs.group_has_capacity;
		2586	sds->this_idle_cpus = sgs.idle_cpus;
2579	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {	2587	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580	sds->max_load = sgs.avg_load;	2588	sds->max_load = sgs.avg_load;
2581	sds->busiest = sg;	2589	sds->busiest = sg;
2582	sds->busiest_nr_running = sgs.sum_nr_running;	2590	sds->busiest_nr_running = sgs.sum_nr_running;
		2591	sds->busiest_idle_cpus = sgs.idle_cpus;
2583	sds->busiest_group_capacity = sgs.group_capacity;	2592	sds->busiest_group_capacity = sgs.group_capacity;
2584	sds->busiest_load_per_task = sgs.sum_weighted_load;	2593	sds->busiest_load_per_task = sgs.sum_weighted_load;
2585	sds->busiest_has_capacity = sgs.group_has_capacity;	2594	sds->busiest_has_capacity = sgs.group_has_capacity;
		2595	sds->busiest_group_weight = sgs.group_weight;
2586	sds->group_imb = sgs.group_imb;	2596	sds->group_imb = sgs.group_imb;
2587	}	2597	}
2588		2598
@@ -2860,8 +2870,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860	if (sds.this_load >= sds.avg_load)	2870	if (sds.this_load >= sds.avg_load)
2861	goto out_balanced;	2871	goto out_balanced;
2862		2872
2863	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)	2873	/*
2864	goto out_balanced;	2874	* In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
		2875	* And to check for busy balance use !idle_cpu instead of
		2876	* CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
		2877	* even when they are idle.
		2878	*/
		2879	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
		2880	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		2881	goto out_balanced;
		2882	} else {
		2883	/*
		2884	* This cpu is idle. If the busiest group load doesn't
		2885	* have more tasks than the number of available cpu's and
		2886	* there is no imbalance between this and busiest group
		2887	* wrt to idle cpu's, it is balanced.
		2888	*/
		2889	if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
		2890	sds.busiest_nr_running <= sds.busiest_group_weight)
		2891	goto out_balanced;
		2892	}
2865		2893
2866	force_balance:	2894	force_balance:
2867	/* Looks like there is an imbalance. Compute it */	2895	/* Looks like there is an imbalance. Compute it */