author	Mel Gorman <mgorman@suse.de>	2013-10-07 06:29:10 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 06:40:39 -0400
commit	58d081b5082dd85e02ac9a1fb151d97395340a09 (patch)
tree	5bfc7ac630ba62d898dfb860e63f118079cba57d /kernel/sched
parent	fc3147245d193bd0f57307859c698fa28a20b0fe (diff)
sched/numa: Avoid overloading CPUs on a preferred NUMA node
This patch replaces find_idlest_cpu_node with task_numa_find_cpu. find_idlest_cpu_node has two critical limitations: it does not take the scheduling class into account when calculating the load, and it is unsuitable for comparing loads between NUMA nodes.

task_numa_find_cpu uses load calculations similar to wake_affine() when selecting the least loaded CPU within a scheduling domain common to the source and destination nodes. It avoids causing CPU load imbalances in the machine by refusing to migrate if the relative load on the target CPU is higher than on the source CPU.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-33-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
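Note: the load comparison introduced by this patch mirrors wake_affine(): both sides are scaled by CPU power, the source side is additionally biased by half of the domain's imbalance_pct, and migration is refused unless the destination's effective load stays at or below the source's. The following is a minimal standalone userspace sketch of that arithmetic, not kernel code: the static values are made-up stand-ins for source_load()/target_load()/power_of()/sd->imbalance_pct, and the cgroup-aware effective_load() contribution is simplified to the non-cgroup case (plus or minus the task's weight).

#include <stdio.h>

/* Illustrative stand-ins for the kernel helpers used in the patch. */
static unsigned long src_load = 3072;	/* source_load(src_cpu, idx) */
static unsigned long dst_load = 512;	/* target_load(dst_cpu, idx) */
static unsigned long src_power = 1024;	/* power_of(src_cpu)         */
static unsigned long dst_power = 1024;	/* power_of(dst_cpu)         */
static int imbalance_pct = 125;		/* sd->imbalance_pct         */
static long weight = 1024;		/* p->se.load.weight         */

int main(void)
{
	long long src_eff, dst_eff;

	/* Source side: bias by half the imbalance percentage, scale by CPU
	 * power, and account for the task leaving (-weight). */
	src_eff = 100 + (imbalance_pct - 100) / 2;
	src_eff *= src_power;
	src_eff *= src_load - weight;

	/* Destination side: no bias, scale by CPU power, and account for
	 * the task arriving (+weight). */
	dst_eff = 100;
	dst_eff *= dst_power;
	dst_eff *= dst_load + weight;

	/* Migrate only if the destination would not end up more loaded
	 * than the (biased) source. */
	if (dst_eff <= src_eff)
		printf("balanced: migrate to preferred node\n");
	else
		printf("imbalanced: stay on source CPU\n");

	return 0;
}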
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/fair.c	131
1 file changed, 102 insertions(+), 29 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d98175d5c2c6..51a760081193 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -901,28 +901,114 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
 }
 
 static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
+struct numa_stats {
+	unsigned long load;
+	s64 eff_load;
+	unsigned long faults;
+};
 
-static int
-find_idlest_cpu_node(int this_cpu, int nid)
-{
-	unsigned long load, min_load = ULONG_MAX;
-	int i, idlest_cpu = this_cpu;
+struct task_numa_env {
+	struct task_struct *p;
 
-	BUG_ON(cpu_to_node(this_cpu) == nid);
+	int src_cpu, src_nid;
+	int dst_cpu, dst_nid;
 
-	rcu_read_lock();
-	for_each_cpu(i, cpumask_of_node(nid)) {
-		load = weighted_cpuload(i);
+	struct numa_stats src_stats, dst_stats;
 
-		if (load < min_load) {
-			min_load = load;
-			idlest_cpu = i;
+	unsigned long best_load;
+	int best_cpu;
+};
+
+static int task_numa_migrate(struct task_struct *p)
+{
+	int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
+	struct task_numa_env env = {
+		.p = p,
+		.src_cpu = task_cpu(p),
+		.src_nid = cpu_to_node(task_cpu(p)),
+		.dst_cpu = node_cpu,
+		.dst_nid = p->numa_preferred_nid,
+		.best_load = ULONG_MAX,
+		.best_cpu = task_cpu(p),
+	};
+	struct sched_domain *sd;
+	int cpu;
+	struct task_group *tg = task_group(p);
+	unsigned long weight;
+	bool balanced;
+	int imbalance_pct, idx = -1;
+
+	/*
+	 * Find the lowest common scheduling domain covering the nodes of both
+	 * the CPU the task is currently running on and the target NUMA node.
+	 */
+	rcu_read_lock();
+	for_each_domain(env.src_cpu, sd) {
+		if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
+			/*
+			 * busy_idx is used for the load decision as it is the
+			 * same index used by the regular load balancer for an
+			 * active cpu.
+			 */
+			idx = sd->busy_idx;
+			imbalance_pct = sd->imbalance_pct;
+			break;
 		}
 	}
 	rcu_read_unlock();
 
-	return idlest_cpu;
+	if (WARN_ON_ONCE(idx == -1))
+		return 0;
+
+	/*
+	 * XXX the below is mostly nicked from wake_affine(); we should
+	 * see about sharing a bit if at all possible; also it might want
+	 * some per entity weight love.
+	 */
+	weight = p->se.load.weight;
+	env.src_stats.load = source_load(env.src_cpu, idx);
+	env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
+	env.src_stats.eff_load *= power_of(env.src_cpu);
+	env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
+
+	for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
+		env.dst_cpu = cpu;
+		env.dst_stats.load = target_load(cpu, idx);
+
+		/* If the CPU is idle, use it */
+		if (!env.dst_stats.load) {
+			env.best_cpu = cpu;
+			goto migrate;
+		}
+
+		/* Otherwise check the target CPU load */
+		env.dst_stats.eff_load = 100;
+		env.dst_stats.eff_load *= power_of(cpu);
+		env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+
+		/*
+		 * Destination is considered balanced if the destination CPU is
+		 * less loaded than the source CPU. Unfortunately there is a
+		 * risk that a task running on a lightly loaded CPU will not
+		 * migrate to its preferred node due to load imbalances.
+		 */
+		balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
+		if (!balanced)
+			continue;
+
+		if (env.dst_stats.eff_load < env.best_load) {
+			env.best_load = env.dst_stats.eff_load;
+			env.best_cpu = cpu;
+		}
+	}
+
+migrate:
+	return migrate_task_to(p, env.best_cpu);
 }
 
 static void task_numa_placement(struct task_struct *p)
@@ -966,22 +1052,10 @@ static void task_numa_placement(struct task_struct *p)
 	 * the working set placement.
 	 */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
-		int preferred_cpu;
-
-		/*
-		 * If the task is not on the preferred node then find the most
-		 * idle CPU to migrate to.
-		 */
-		preferred_cpu = task_cpu(p);
-		if (cpu_to_node(preferred_cpu) != max_nid) {
-			preferred_cpu = find_idlest_cpu_node(preferred_cpu,
-							     max_nid);
-		}
-
 		/* Update the preferred nid and migrate task if possible */
 		p->numa_preferred_nid = max_nid;
 		p->numa_migrate_seq = 1;
-		migrate_task_to(p, preferred_cpu);
+		task_numa_migrate(p);
 	}
 }
 
@@ -3292,7 +3366,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent)	/* the trivial, non-cgroup case */
+	if (!tg->parent || !wl)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
@@ -3345,8 +3419,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-		unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	return wl;
 }