path: root/kernel/sched
author     Mel Gorman <mgorman@suse.de>  2013-10-07 06:29:17 -0400
committer  Ingo Molnar <mingo@kernel.org>  2013-10-09 08:47:25 -0400
commit     fb13c7ee0ed387bd6bec4b4024a4d49b1bd504f1 (patch)
tree       b5892db95bf0b47375cc43005291006aeb115772 /kernel/sched
parent     ac66f5477239ebd3c4e2cbf2f591ef387aa09884 (diff)
sched/numa: Use a system-wide search to find swap/migration candidates
This patch implements a system-wide search for swap/migration candidates
based on total NUMA hinting faults. It has a balance limit, but it doesn't
properly consider total node balance.

In the old scheme a task selected a preferred node based on the highest
number of private faults recorded on the node. In this scheme, the
preferred node is based on the total number of faults. If the preferred
node for a task changes then task_numa_migrate() will search the whole
system looking for tasks to swap with that would improve both the overall
compute balance and minimise the expected number of remote NUMA hinting
faults.

Note that there is no guarantee that the node the source task is placed on
by task_numa_migrate() has any relationship to the newly selected
task->numa_preferred_nid due to compute overloading.

Signed-off-by: Mel Gorman <mgorman@suse.de>
[ Do not swap with tasks that cannot run on source cpu. ]
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
[ Fixed compiler warning on UP. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-40-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
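The core acceptance test that the patch adds in task_numa_compare() can be
illustrated outside the kernel. The sketch below is a standalone userspace
approximation, not kernel code: the helper name swap_keeps_balance() and all
load values are invented for illustration, and the real code works on
task_h_load() results and per-node statistics gathered by update_numa_stats().

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Made-up helper mirroring the balance check in task_numa_compare():
     * apply the hypothetical task exchange to the cached node loads, then
     * reject it if the heavier node ends up more than imbalance_pct percent
     * above the lighter one.
     */
    static bool swap_keeps_balance(long src_load, long dst_load,
                                   long src_task_load, long dst_task_load,
                                   int imbalance_pct)
    {
            long tmp;

            /* Move the source task to dst and the dst task to src. */
            dst_load += src_task_load;
            src_load -= src_task_load;
            dst_load -= dst_task_load;
            src_load += dst_task_load;

            /* Make src_load the smaller of the two, as the kernel does. */
            if (dst_load < src_load) {
                    tmp = dst_load;
                    dst_load = src_load;
                    src_load = tmp;
            }

            /* Equivalent of: if (src_load * imbalance_pct < dst_load * 100) reject. */
            return src_load * imbalance_pct >= dst_load * 100;
    }

    int main(void)
    {
            /* Hypothetical numbers: node loads 1024 and 900, task loads 200 and 150. */
            printf("swap allowed: %d\n",
                   swap_keeps_balance(1024, 900, 200, 150, 112));
            return 0;
    }

With an imbalance_pct of 112 (the initial value in the patch, later refined
from the NUMA scheduling domain's imbalance_pct), a swap is only considered
when the heavier node's load stays within roughly 12% of the lighter node's
load after the hypothetical exchange.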
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c     4
-rw-r--r--  kernel/sched/fair.c   253
-rw-r--r--  kernel/sched/sched.h   13
3 files changed, 199 insertions(+), 71 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 32a2b29c2610..1fe59da280e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5236,6 +5236,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5252,6 +5253,9 @@ static void update_top_cache_domain(int cpu)
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_size, cpu) = size;
         per_cpu(sd_llc_id, cpu) = id;
+
+        sd = lowest_flag_domain(cpu, SD_NUMA);
+        rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b1e5061287ab..1422765d4b86 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 
 /* Give new task start runnable values to heavy its load in infant time */
@@ -906,12 +908,40 @@ static unsigned long target_load(int cpu, int type);
 static unsigned long power_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
+/* Cached statistics for all CPUs within a node */
 struct numa_stats {
+        unsigned long nr_running;
         unsigned long load;
-        s64 eff_load;
-        unsigned long faults;
+
+        /* Total compute capacity of CPUs on a node */
+        unsigned long power;
+
+        /* Approximate capacity in terms of runnable tasks on a node */
+        unsigned long capacity;
+        int has_capacity;
 };
 
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+        int cpu;
+
+        memset(ns, 0, sizeof(*ns));
+        for_each_cpu(cpu, cpumask_of_node(nid)) {
+                struct rq *rq = cpu_rq(cpu);
+
+                ns->nr_running += rq->nr_running;
+                ns->load += weighted_cpuload(cpu);
+                ns->power += power_of(cpu);
+        }
+
+        ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+        ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+        ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
 struct task_numa_env {
         struct task_struct *p;
 
@@ -920,95 +950,178 @@ struct task_numa_env {
 
         struct numa_stats src_stats, dst_stats;
 
-        unsigned long best_load;
+        int imbalance_pct, idx;
+
+        struct task_struct *best_task;
+        long best_imp;
         int best_cpu;
 };
 
+static void task_numa_assign(struct task_numa_env *env,
+                             struct task_struct *p, long imp)
+{
+        if (env->best_task)
+                put_task_struct(env->best_task);
+        if (p)
+                get_task_struct(p);
+
+        env->best_task = p;
+        env->best_imp = imp;
+        env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env, long imp)
+{
+        struct rq *src_rq = cpu_rq(env->src_cpu);
+        struct rq *dst_rq = cpu_rq(env->dst_cpu);
+        struct task_struct *cur;
+        long dst_load, src_load;
+        long load;
+
+        rcu_read_lock();
+        cur = ACCESS_ONCE(dst_rq->curr);
+        if (cur->pid == 0) /* idle */
+                cur = NULL;
+
+        /*
+         * "imp" is the fault differential for the source task between the
+         * source and destination node. Calculate the total differential for
+         * the source task and potential destination task. The more negative
+         * the value is, the more remote accesses that would be expected to
+         * be incurred if the tasks were swapped.
+         */
+        if (cur) {
+                /* Skip this swap candidate if cannot move to the source cpu */
+                if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+                        goto unlock;
+
+                imp += task_faults(cur, env->src_nid) -
+                       task_faults(cur, env->dst_nid);
+        }
+
+        if (imp < env->best_imp)
+                goto unlock;
+
+        if (!cur) {
+                /* Is there capacity at our destination? */
+                if (env->src_stats.has_capacity &&
+                    !env->dst_stats.has_capacity)
+                        goto unlock;
+
+                goto balance;
+        }
+
+        /* Balance doesn't matter much if we're running a task per cpu */
+        if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+                goto assign;
+
+        /*
+         * In the overloaded case, try and keep the load balanced.
+         */
+balance:
+        dst_load = env->dst_stats.load;
+        src_load = env->src_stats.load;
+
+        /* XXX missing power terms */
+        load = task_h_load(env->p);
+        dst_load += load;
+        src_load -= load;
+
+        if (cur) {
+                load = task_h_load(cur);
+                dst_load -= load;
+                src_load += load;
+        }
+
+        /* make src_load the smaller */
+        if (dst_load < src_load)
+                swap(dst_load, src_load);
+
+        if (src_load * env->imbalance_pct < dst_load * 100)
+                goto unlock;
+
+assign:
+        task_numa_assign(env, cur, imp);
+unlock:
+        rcu_read_unlock();
+}
+
 static int task_numa_migrate(struct task_struct *p)
 {
-        int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
         struct task_numa_env env = {
                 .p = p,
+
                 .src_cpu = task_cpu(p),
                 .src_nid = cpu_to_node(task_cpu(p)),
-                .dst_cpu = node_cpu,
-                .dst_nid = p->numa_preferred_nid,
-                .best_load = ULONG_MAX,
-                .best_cpu = task_cpu(p),
+
+                .imbalance_pct = 112,
+
+                .best_task = NULL,
+                .best_imp = 0,
+                .best_cpu = -1
         };
         struct sched_domain *sd;
-        int cpu;
-        struct task_group *tg = task_group(p);
-        unsigned long weight;
-        bool balanced;
-        int imbalance_pct, idx = -1;
+        unsigned long faults;
+        int nid, cpu, ret;
 
         /*
-         * Find the lowest common scheduling domain covering the nodes of both
-         * the CPU the task is currently running on and the target NUMA node.
+         * Pick the lowest SD_NUMA domain, as that would have the smallest
+         * imbalance and would be the first to start moving tasks about.
+         *
+         * And we want to avoid any moving of tasks about, as that would create
+         * random movement of tasks -- counter the numa conditions we're trying
+         * to satisfy here.
         */
         rcu_read_lock();
-        for_each_domain(env.src_cpu, sd) {
-                if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
-                        /*
-                         * busy_idx is used for the load decision as it is the
-                         * same index used by the regular load balancer for an
-                         * active cpu.
-                         */
-                        idx = sd->busy_idx;
-                        imbalance_pct = sd->imbalance_pct;
-                        break;
-                }
-        }
+        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+        env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
 
-        if (WARN_ON_ONCE(idx == -1))
-                return 0;
+        faults = task_faults(p, env.src_nid);
+        update_numa_stats(&env.src_stats, env.src_nid);
 
-        /*
-         * XXX the below is mostly nicked from wake_affine(); we should
-         * see about sharing a bit if at all possible; also it might want
-         * some per entity weight love.
-         */
-        weight = p->se.load.weight;
-        env.src_stats.load = source_load(env.src_cpu, idx);
-        env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
-        env.src_stats.eff_load *= power_of(env.src_cpu);
-        env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
-
-        for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
-                env.dst_cpu = cpu;
-                env.dst_stats.load = target_load(cpu, idx);
-
-                /* If the CPU is idle, use it */
-                if (!env.dst_stats.load) {
-                        env.best_cpu = cpu;
-                        goto migrate;
-                }
+        /* Find an alternative node with relatively better statistics */
+        for_each_online_node(nid) {
+                long imp;
 
-                /* Otherwise check the target CPU load */
-                env.dst_stats.eff_load = 100;
-                env.dst_stats.eff_load *= power_of(cpu);
-                env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+                if (nid == env.src_nid)
+                        continue;
 
-                /*
-                 * Destination is considered balanced if the destination CPU is
-                 * less loaded than the source CPU. Unfortunately there is a
-                 * risk that a task running on a lightly loaded CPU will not
-                 * migrate to its preferred node due to load imbalances.
-                 */
-                balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
-                if (!balanced)
+                /* Only consider nodes that recorded more faults */
+                imp = task_faults(p, nid) - faults;
+                if (imp < 0)
                         continue;
 
-                if (env.dst_stats.eff_load < env.best_load) {
-                        env.best_load = env.dst_stats.eff_load;
-                        env.best_cpu = cpu;
+                env.dst_nid = nid;
+                update_numa_stats(&env.dst_stats, env.dst_nid);
+                for_each_cpu(cpu, cpumask_of_node(nid)) {
+                        /* Skip this CPU if the source task cannot migrate */
+                        if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                                continue;
+
+                        env.dst_cpu = cpu;
+                        task_numa_compare(&env, imp);
                 }
         }
 
-migrate:
-        return migrate_task_to(p, env.best_cpu);
+        /* No better CPU than the current one was found. */
+        if (env.best_cpu == -1)
+                return -EAGAIN;
+
+        if (env.best_task == NULL) {
+                int ret = migrate_task_to(p, env.best_cpu);
+                return ret;
+        }
+
+        ret = migrate_swap(p, env.best_task);
+        put_task_struct(env.best_task);
+        return ret;
 }
 
 /* Attempt to migrate a task to a CPU on the preferred node. */
@@ -1050,7 +1163,7 @@ static void task_numa_placement(struct task_struct *p)
 
         /* Find the node with the highest number of faults */
         for_each_online_node(nid) {
-                unsigned long faults;
+                unsigned long faults = 0;
                 int priv, i;
 
                 for (priv = 0; priv < 2; priv++) {
@@ -1060,10 +1173,10 @@ static void task_numa_placement(struct task_struct *p)
                         p->numa_faults[i] >>= 1;
                         p->numa_faults[i] += p->numa_faults_buffer[i];
                         p->numa_faults_buffer[i] = 0;
+
+                        faults += p->numa_faults[i];
                 }
 
-                /* Find maximum private faults */
-                faults = p->numa_faults[task_faults_idx(nid, 1)];
                 if (faults > max_faults) {
                         max_faults = faults;
                         max_nid = nid;
@@ -4455,8 +4568,6 @@ static int move_one_task(struct lb_env *env)
         return 0;
 }
 
-static unsigned long task_h_load(struct task_struct *p);
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4dc92d016aef..691e96964dcc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -610,9 +610,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
         return hsd;
 }
 
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd) {
+                if (sd->flags & flag)
+                        break;
+        }
+
+        return sd;
+}
+
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 
 struct sched_group_power {
         atomic_t ref;