path: root/kernel/sched/fair.c
author     Mel Gorman <mgorman@suse.de>      2013-10-07 06:29:00 -0400
committer  Ingo Molnar <mingo@kernel.org>    2013-10-09 06:40:26 -0400
commit     3a7053b3224f4a8b0e8184166190076593621617 (patch)
tree       dfe404bfbc1306fccbc00f2177becf1482504e45 /kernel/sched/fair.c
parent     745d61476ddb737aad3495fa6d9a8f8c2ee59f86 (diff)
sched/numa: Favour moving tasks towards the preferred node
This patch favours moving tasks towards the NUMA node that recorded a higher
number of NUMA faults during active load balancing. Ideally this is
self-reinforcing: the longer the task runs on that node, the more faults it
should incur there, causing task_numa_placement to keep the task running on
that node. In reality a big weakness is that the node's CPUs can be
overloaded and it would be more efficient to queue tasks on an idle node and
migrate them to the new node later. This would require additional smarts in
the balancer, so for now the balancer simply prefers to place the task on the
preferred node for a number of PTE scans, controlled by the
numa_balancing_settle_count sysctl. Once settle_count scans have completed,
the scheduler is free to place the task on an alternative node if the load is
imbalanced.

[srikar@linux.vnet.ibm.com: Fixed statistics]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
[ Tunable and use higher faults instead of preferred. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-23-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
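For illustration, a minimal user-space sketch of the decision rule this patch
adds is shown below. The type and function names in the sketch are invented;
the real implementation is migrate_improves_locality() in the diff that
follows, which additionally checks sched_feat(NUMA_FAVOUR_HIGHER) and the
SD_NUMA domain flag.

#include <stdbool.h>

/* Mirrors sysctl_numa_balancing_settle_count (default 3 in this patch). */
#define SETTLE_COUNT 3

struct task_sketch {
        int preferred_nid;                /* node with the most recorded faults */
        int migrate_seq;                  /* PTE scans since the preference last changed */
        const unsigned long *numa_faults; /* per-node NUMA fault counts */
};

/* Should the balancer favour moving the task from src_nid to dst_nid? */
static bool improves_locality(const struct task_sketch *t, int src_nid, int dst_nid)
{
        if (!t->numa_faults || src_nid == dst_nid)
                return false;

        /* After settle_count scans the balancer is free to place the task anywhere. */
        if (t->migrate_seq >= SETTLE_COUNT)
                return false;

        /* Favour the preferred node, or any node that recorded more faults. */
        return dst_nid == t->preferred_nid ||
               t->numa_faults[dst_nid] > t->numa_faults[src_nid];
}

In the patch itself, numa_migrate_seq is incremented once per completed PTE
scan in task_numa_placement() and reset to zero whenever the preferred node
changes, so the settle window re-opens each time a new preferred node is
chosen.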
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 63
1 file changed, 59 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3abc651bc38a..6ffddca687fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -877,6 +877,15 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
 }
 
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the nodes CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+
 static void task_numa_placement(struct task_struct *p)
 {
         int seq, nid, max_nid = -1;
@@ -888,6 +897,7 @@ static void task_numa_placement(struct task_struct *p)
         if (p->numa_scan_seq == seq)
                 return;
         p->numa_scan_seq = seq;
+        p->numa_migrate_seq++;
         p->numa_scan_period_max = task_scan_max(p);
 
         /* Find the node with the highest number of faults */
@@ -907,8 +917,10 @@ static void task_numa_placement(struct task_struct *p)
         }
 
         /* Update the tasks preferred node if necessary */
-        if (max_faults && max_nid != p->numa_preferred_nid)
+        if (max_faults && max_nid != p->numa_preferred_nid) {
                 p->numa_preferred_nid = max_nid;
+                p->numa_migrate_seq = 0;
+        }
 }
 
 /*
@@ -4071,6 +4083,38 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         return delta < (s64)sysctl_sched_migration_cost;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+        int src_nid, dst_nid;
+
+        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+            !(env->sd->flags & SD_NUMA)) {
+                return false;
+        }
+
+        src_nid = cpu_to_node(env->src_cpu);
+        dst_nid = cpu_to_node(env->dst_cpu);
+
+        if (src_nid == dst_nid ||
+            p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+                return false;
+
+        if (dst_nid == p->numa_preferred_nid ||
+            p->numa_faults[dst_nid] > p->numa_faults[src_nid])
+                return true;
+
+        return false;
+}
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                             struct lb_env *env)
+{
+        return false;
+}
+#endif
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -4128,11 +4172,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
         /*
          * Aggressive migration if:
-         * 1) task is cache cold, or
-         * 2) too many balance attempts have failed.
+         * 1) destination numa is preferred
+         * 2) task is cache cold, or
+         * 3) too many balance attempts have failed.
          */
-
         tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+
+        if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+                if (tsk_cache_hot) {
+                        schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                        schedstat_inc(p, se.statistics.nr_forced_migrations);
+                }
+#endif
+                return 1;
+        }
+
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 