author	Mel Gorman <mgorman@suse.de>	2013-10-07 06:29:10 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-10-09 06:40:39 -0400
commit	58d081b5082dd85e02ac9a1fb151d97395340a09 (patch)
tree	5bfc7ac630ba62d898dfb860e63f118079cba57d /kernel/sched
parent	fc3147245d193bd0f57307859c698fa28a20b0fe (diff)
sched/numa: Avoid overloading CPUs on a preferred NUMA node
This patch replaces find_idlest_cpu_node with task_numa_find_cpu. find_idlest_cpu_node has two critical limitations: it does not take the scheduling class into account when calculating the load, and it is unsuitable for comparing loads between NUMA nodes.

task_numa_find_cpu uses load calculations similar to wake_affine() when selecting the least loaded CPU within a scheduling domain common to the source and destination nodes. It avoids causing CPU load imbalances in the machine by refusing to migrate if the relative load on the target CPU is higher than on the source CPU.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-33-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
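Note: the load comparison introduced by this patch mirrors wake_affine(): both sides are scaled by CPU power, the source side is additionally biased by half of the domain's imbalance_pct, and migration is refused unless the destination's effective load stays at or below the source's. The following is a minimal standalone userspace sketch of that arithmetic, not kernel code: the static values are made-up stand-ins for source_load()/target_load()/power_of()/sd->imbalance_pct, and the cgroup-aware effective_load() contribution is simplified to the non-cgroup case (plus or minus the task's weight).

#include <stdio.h>

/* Illustrative stand-ins for the kernel helpers used in the patch. */
static unsigned long src_load = 3072;	/* source_load(src_cpu, idx) */
static unsigned long dst_load = 512;	/* target_load(dst_cpu, idx) */
static unsigned long src_power = 1024;	/* power_of(src_cpu)         */
static unsigned long dst_power = 1024;	/* power_of(dst_cpu)         */
static int imbalance_pct = 125;		/* sd->imbalance_pct         */
static long weight = 1024;		/* p->se.load.weight         */

int main(void)
{
	long long src_eff, dst_eff;

	/* Source side: bias by half the imbalance percentage, scale by CPU
	 * power, and account for the task leaving (-weight). */
	src_eff = 100 + (imbalance_pct - 100) / 2;
	src_eff *= src_power;
	src_eff *= src_load - weight;

	/* Destination side: no bias, scale by CPU power, and account for
	 * the task arriving (+weight). */
	dst_eff = 100;
	dst_eff *= dst_power;
	dst_eff *= dst_load + weight;

	/* Migrate only if the destination would not end up more loaded
	 * than the (biased) source. */
	if (dst_eff <= src_eff)
		printf("balanced: migrate to preferred node\n");
	else
		printf("imbalanced: stay on source CPU\n");

	return 0;
}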
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/fair.c	131
1 file changed, 102 insertions(+), 29 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d98175d5c2c6..51a760081193 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -901,28 +901,114 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
 }
 
 static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
+struct numa_stats {
+	unsigned long load;
+	s64 eff_load;
+	unsigned long faults;
+};
 
-static int
-find_idlest_cpu_node(int this_cpu, int nid)
-{
-	unsigned long load, min_load = ULONG_MAX;
-	int i, idlest_cpu = this_cpu;
+struct task_numa_env {
+	struct task_struct *p;
 
-	BUG_ON(cpu_to_node(this_cpu) == nid);
+	int src_cpu, src_nid;
+	int dst_cpu, dst_nid;
 
-	rcu_read_lock();
-	for_each_cpu(i, cpumask_of_node(nid)) {
-		load = weighted_cpuload(i);
+	struct numa_stats src_stats, dst_stats;
 
-		if (load < min_load) {
-			min_load = load;
-			idlest_cpu = i;
+	unsigned long best_load;
+	int best_cpu;
+};
+
+static int task_numa_migrate(struct task_struct *p)
+{
+	int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
+	struct task_numa_env env = {
+		.p = p,
+		.src_cpu = task_cpu(p),
+		.src_nid = cpu_to_node(task_cpu(p)),
+		.dst_cpu = node_cpu,
+		.dst_nid = p->numa_preferred_nid,
+		.best_load = ULONG_MAX,
+		.best_cpu = task_cpu(p),
+	};
+	struct sched_domain *sd;
+	int cpu;
+	struct task_group *tg = task_group(p);
+	unsigned long weight;
+	bool balanced;
+	int imbalance_pct, idx = -1;
+
+	/*
+	 * Find the lowest common scheduling domain covering the nodes of both
+	 * the CPU the task is currently running on and the target NUMA node.
+	 */
+	rcu_read_lock();
+	for_each_domain(env.src_cpu, sd) {
+		if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
+			/*
+			 * busy_idx is used for the load decision as it is the
+			 * same index used by the regular load balancer for an
+			 * active cpu.
+			 */
+			idx = sd->busy_idx;
+			imbalance_pct = sd->imbalance_pct;
+			break;
 		}
 	}
 	rcu_read_unlock();
 
-	return idlest_cpu;
+	if (WARN_ON_ONCE(idx == -1))
+		return 0;
+
+	/*
+	 * XXX the below is mostly nicked from wake_affine(); we should
+	 * see about sharing a bit if at all possible; also it might want
+	 * some per entity weight love.
+	 */
+	weight = p->se.load.weight;
+	env.src_stats.load = source_load(env.src_cpu, idx);
+	env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
+	env.src_stats.eff_load *= power_of(env.src_cpu);
+	env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
+
+	for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
+		env.dst_cpu = cpu;
+		env.dst_stats.load = target_load(cpu, idx);
+
+		/* If the CPU is idle, use it */
+		if (!env.dst_stats.load) {
+			env.best_cpu = cpu;
+			goto migrate;
+		}
+
+		/* Otherwise check the target CPU load */
+		env.dst_stats.eff_load = 100;
+		env.dst_stats.eff_load *= power_of(cpu);
+		env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+
+		/*
+		 * Destination is considered balanced if the destination CPU is
+		 * less loaded than the source CPU. Unfortunately there is a
+		 * risk that a task running on a lightly loaded CPU will not
+		 * migrate to its preferred node due to load imbalances.
+		 */
+		balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
+		if (!balanced)
+			continue;
+
+		if (env.dst_stats.eff_load < env.best_load) {
+			env.best_load = env.dst_stats.eff_load;
+			env.best_cpu = cpu;
+		}
+	}
+
+migrate:
+	return migrate_task_to(p, env.best_cpu);
 }
 
 static void task_numa_placement(struct task_struct *p)
@@ -966,22 +1052,10 @@ static void task_numa_placement(struct task_struct *p)
 	 * the working set placement.
 	 */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
-		int preferred_cpu;
-
-		/*
-		 * If the task is not on the preferred node then find the most
-		 * idle CPU to migrate to.
-		 */
-		preferred_cpu = task_cpu(p);
-		if (cpu_to_node(preferred_cpu) != max_nid) {
-			preferred_cpu = find_idlest_cpu_node(preferred_cpu,
-							     max_nid);
-		}
-
 		/* Update the preferred nid and migrate task if possible */
 		p->numa_preferred_nid = max_nid;
 		p->numa_migrate_seq = 1;
-		migrate_task_to(p, preferred_cpu);
+		task_numa_migrate(p);
 	}
 }
 
@@ -3292,7 +3366,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
 
-	if (!tg->parent)	/* the trivial, non-cgroup case */
+	if (!tg->parent || !wl)	/* the trivial, non-cgroup case */
 		return wl;
 
 	for_each_sched_entity(se) {
@@ -3345,8 +3419,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-		unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
 	return wl;
 }