path: root/kernel/sched/fair.c
author     Mel Gorman <mgorman@suse.de>      2013-10-07 06:29:00 -0400
committer  Ingo Molnar <mingo@kernel.org>    2013-10-09 06:40:26 -0400
commit     3a7053b3224f4a8b0e8184166190076593621617 (patch)
tree       dfe404bfbc1306fccbc00f2177becf1482504e45 /kernel/sched/fair.c
parent     745d61476ddb737aad3495fa6d9a8f8c2ee59f86 (diff)
sched/numa: Favour moving tasks towards the preferred node
This patch favours moving tasks towards the NUMA node that recorded a higher
number of NUMA faults during active load balancing. Ideally this is
self-reinforcing: the longer the task runs on that node, the more faults it
should incur there, causing task_numa_placement to keep the task running on
that node. In reality a big weakness is that the node's CPUs can be
overloaded and it would be more efficient to queue tasks on an idle node and
migrate them to the new node later. This would require additional smarts in
the balancer, so for now the balancer simply prefers to place the task on the
preferred node for a number of PTE scans, controlled by the
numa_balancing_settle_count sysctl. Once settle_count scans have completed,
the scheduler is free to place the task on an alternative node if the load is
imbalanced.

[srikar@linux.vnet.ibm.com: Fixed statistics]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
[ Tunable and use higher faults instead of preferred. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-23-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
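For illustration, a minimal user-space sketch of the decision rule this patch
adds is shown below. The type and function names in the sketch are invented;
the real implementation is migrate_improves_locality() in the diff that
follows, which additionally checks sched_feat(NUMA_FAVOUR_HIGHER) and the
SD_NUMA domain flag.

#include <stdbool.h>

/* Mirrors sysctl_numa_balancing_settle_count (default 3 in this patch). */
#define SETTLE_COUNT 3

struct task_sketch {
        int preferred_nid;                /* node with the most recorded faults */
        int migrate_seq;                  /* PTE scans since the preference last changed */
        const unsigned long *numa_faults; /* per-node NUMA fault counts */
};

/* Should the balancer favour moving the task from src_nid to dst_nid? */
static bool improves_locality(const struct task_sketch *t, int src_nid, int dst_nid)
{
        if (!t->numa_faults || src_nid == dst_nid)
                return false;

        /* After settle_count scans the balancer is free to place the task anywhere. */
        if (t->migrate_seq >= SETTLE_COUNT)
                return false;

        /* Favour the preferred node, or any node that recorded more faults. */
        return dst_nid == t->preferred_nid ||
               t->numa_faults[dst_nid] > t->numa_faults[src_nid];
}

In the patch itself, numa_migrate_seq is incremented once per completed PTE
scan in task_numa_placement() and reset to zero whenever the preferred node
changes, so the settle window re-opens each time a new preferred node is
chosen.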
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 63
1 file changed, 59 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3abc651bc38a..6ffddca687fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -877,6 +877,15 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
 }
 
+/*
+ * Once a preferred node is selected the scheduler balancer will prefer moving
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
+ * scans. This will give the process the chance to accumulate more faults on
+ * the preferred node but still allow the scheduler to move the task again if
+ * the nodes CPUs are overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
+
 static void task_numa_placement(struct task_struct *p)
 {
         int seq, nid, max_nid = -1;
@@ -888,6 +897,7 @@ static void task_numa_placement(struct task_struct *p)
         if (p->numa_scan_seq == seq)
                 return;
         p->numa_scan_seq = seq;
+        p->numa_migrate_seq++;
         p->numa_scan_period_max = task_scan_max(p);
 
         /* Find the node with the highest number of faults */
@@ -907,8 +917,10 @@ static void task_numa_placement(struct task_struct *p)
         }
 
         /* Update the tasks preferred node if necessary */
-        if (max_faults && max_nid != p->numa_preferred_nid)
+        if (max_faults && max_nid != p->numa_preferred_nid) {
                 p->numa_preferred_nid = max_nid;
+                p->numa_migrate_seq = 0;
+        }
 }
 
 /*
@@ -4071,6 +4083,38 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         return delta < (s64)sysctl_sched_migration_cost;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+        int src_nid, dst_nid;
+
+        if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+            !(env->sd->flags & SD_NUMA)) {
+                return false;
+        }
+
+        src_nid = cpu_to_node(env->src_cpu);
+        dst_nid = cpu_to_node(env->dst_cpu);
+
+        if (src_nid == dst_nid ||
+            p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+                return false;
+
+        if (dst_nid == p->numa_preferred_nid ||
+            p->numa_faults[dst_nid] > p->numa_faults[src_nid])
+                return true;
+
+        return false;
+}
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                             struct lb_env *env)
+{
+        return false;
+}
+#endif
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -4128,11 +4172,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
         /*
          * Aggressive migration if:
-         * 1) task is cache cold, or
-         * 2) too many balance attempts have failed.
+         * 1) destination numa is preferred
+         * 2) task is cache cold, or
+         * 3) too many balance attempts have failed.
          */
-
         tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+
+        if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+                if (tsk_cache_hot) {
+                        schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                        schedstat_inc(p, se.statistics.nr_forced_migrations);
+                }
+#endif
+                return 1;
+        }
+
         if (!tsk_cache_hot ||
                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 