path: root/kernel/sched
author     Mel Gorman <mgorman@suse.de>  2013-10-07 06:29:17 -0400
committer  Ingo Molnar <mingo@kernel.org>  2013-10-09 08:47:25 -0400
commit     fb13c7ee0ed387bd6bec4b4024a4d49b1bd504f1 (patch)
tree       b5892db95bf0b47375cc43005291006aeb115772 /kernel/sched
parent     ac66f5477239ebd3c4e2cbf2f591ef387aa09884 (diff)
sched/numa: Use a system-wide search to find swap/migration candidates
This patch implements a system-wide search for swap/migration candidates
based on total NUMA hinting faults. It has a balance limit, but it doesn't
properly consider total node balance.

In the old scheme a task selected a preferred node based on the highest
number of private faults recorded on the node. In this scheme, the
preferred node is based on the total number of faults. If the preferred
node for a task changes then task_numa_migrate() will search the whole
system looking for tasks to swap with that would improve both the overall
compute balance and minimise the expected number of remote NUMA hinting
faults.

Note that there is no guarantee that the node the source task is placed on
by task_numa_migrate() has any relationship to the newly selected
task->numa_preferred_nid due to compute overloading.

Signed-off-by: Mel Gorman <mgorman@suse.de>
[ Do not swap with tasks that cannot run on source cpu. ]
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
[ Fixed compiler warning on UP. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-40-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
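The core acceptance test that the patch adds in task_numa_compare() can be
illustrated outside the kernel. The sketch below is a standalone userspace
approximation, not kernel code: the helper name swap_keeps_balance() and all
load values are invented for illustration, and the real code works on
task_h_load() results and per-node statistics gathered by update_numa_stats().

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Made-up helper mirroring the balance check in task_numa_compare():
     * apply the hypothetical task exchange to the cached node loads, then
     * reject it if the heavier node ends up more than imbalance_pct percent
     * above the lighter one.
     */
    static bool swap_keeps_balance(long src_load, long dst_load,
                                   long src_task_load, long dst_task_load,
                                   int imbalance_pct)
    {
            long tmp;

            /* Move the source task to dst and the dst task to src. */
            dst_load += src_task_load;
            src_load -= src_task_load;
            dst_load -= dst_task_load;
            src_load += dst_task_load;

            /* Make src_load the smaller of the two, as the kernel does. */
            if (dst_load < src_load) {
                    tmp = dst_load;
                    dst_load = src_load;
                    src_load = tmp;
            }

            /* Equivalent of: if (src_load * imbalance_pct < dst_load * 100) reject. */
            return src_load * imbalance_pct >= dst_load * 100;
    }

    int main(void)
    {
            /* Hypothetical numbers: node loads 1024 and 900, task loads 200 and 150. */
            printf("swap allowed: %d\n",
                   swap_keeps_balance(1024, 900, 200, 150, 112));
            return 0;
    }

With an imbalance_pct of 112 (the initial value in the patch, later refined
from the NUMA scheduling domain's imbalance_pct), a swap is only considered
when the heavier node's load stays within roughly 12% of the lighter node's
load after the hypothetical exchange.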
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c     4
-rw-r--r--  kernel/sched/fair.c   253
-rw-r--r--  kernel/sched/sched.h   13
3 files changed, 199 insertions(+), 71 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 32a2b29c2610..1fe59da280e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5236,6 +5236,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5252,6 +5253,9 @@ static void update_top_cache_domain(int cpu)
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_size, cpu) = size;
         per_cpu(sd_llc_id, cpu) = id;
+
+        sd = lowest_flag_domain(cpu, SD_NUMA);
+        rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b1e5061287ab..1422765d4b86 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
 static inline void __update_task_entity_contrib(struct sched_entity *se);
 
 /* Give new task start runnable values to heavy its load in infant time */
@@ -906,12 +908,40 @@ static unsigned long target_load(int cpu, int type);
 static unsigned long power_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
+/* Cached statistics for all CPUs within a node */
 struct numa_stats {
+        unsigned long nr_running;
         unsigned long load;
-        s64 eff_load;
-        unsigned long faults;
+
+        /* Total compute capacity of CPUs on a node */
+        unsigned long power;
+
+        /* Approximate capacity in terms of runnable tasks on a node */
+        unsigned long capacity;
+        int has_capacity;
 };
 
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+        int cpu;
+
+        memset(ns, 0, sizeof(*ns));
+        for_each_cpu(cpu, cpumask_of_node(nid)) {
+                struct rq *rq = cpu_rq(cpu);
+
+                ns->nr_running += rq->nr_running;
+                ns->load += weighted_cpuload(cpu);
+                ns->power += power_of(cpu);
+        }
+
+        ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+        ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+        ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
 struct task_numa_env {
         struct task_struct *p;
 
@@ -920,95 +950,178 @@ struct task_numa_env {
 
         struct numa_stats src_stats, dst_stats;
 
-        unsigned long best_load;
+        int imbalance_pct, idx;
+
+        struct task_struct *best_task;
+        long best_imp;
         int best_cpu;
 };
 
+static void task_numa_assign(struct task_numa_env *env,
+                             struct task_struct *p, long imp)
+{
+        if (env->best_task)
+                put_task_struct(env->best_task);
+        if (p)
+                get_task_struct(p);
+
+        env->best_task = p;
+        env->best_imp = imp;
+        env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env, long imp)
+{
+        struct rq *src_rq = cpu_rq(env->src_cpu);
+        struct rq *dst_rq = cpu_rq(env->dst_cpu);
+        struct task_struct *cur;
+        long dst_load, src_load;
+        long load;
+
+        rcu_read_lock();
+        cur = ACCESS_ONCE(dst_rq->curr);
+        if (cur->pid == 0) /* idle */
+                cur = NULL;
+
+        /*
+         * "imp" is the fault differential for the source task between the
+         * source and destination node. Calculate the total differential for
+         * the source task and potential destination task. The more negative
+         * the value is, the more remote accesses that would be expected to
+         * be incurred if the tasks were swapped.
+         */
+        if (cur) {
+                /* Skip this swap candidate if cannot move to the source cpu */
+                if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+                        goto unlock;
+
+                imp += task_faults(cur, env->src_nid) -
+                       task_faults(cur, env->dst_nid);
+        }
+
+        if (imp < env->best_imp)
+                goto unlock;
+
+        if (!cur) {
+                /* Is there capacity at our destination? */
+                if (env->src_stats.has_capacity &&
+                    !env->dst_stats.has_capacity)
+                        goto unlock;
+
+                goto balance;
+        }
+
+        /* Balance doesn't matter much if we're running a task per cpu */
+        if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+                goto assign;
+
+        /*
+         * In the overloaded case, try and keep the load balanced.
+         */
+balance:
+        dst_load = env->dst_stats.load;
+        src_load = env->src_stats.load;
+
+        /* XXX missing power terms */
+        load = task_h_load(env->p);
+        dst_load += load;
+        src_load -= load;
+
+        if (cur) {
+                load = task_h_load(cur);
+                dst_load -= load;
+                src_load += load;
+        }
+
+        /* make src_load the smaller */
+        if (dst_load < src_load)
+                swap(dst_load, src_load);
+
+        if (src_load * env->imbalance_pct < dst_load * 100)
+                goto unlock;
+
+assign:
+        task_numa_assign(env, cur, imp);
+unlock:
+        rcu_read_unlock();
+}
+
 static int task_numa_migrate(struct task_struct *p)
 {
-        int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
         struct task_numa_env env = {
                 .p = p,
+
                 .src_cpu = task_cpu(p),
                 .src_nid = cpu_to_node(task_cpu(p)),
-                .dst_cpu = node_cpu,
-                .dst_nid = p->numa_preferred_nid,
-                .best_load = ULONG_MAX,
-                .best_cpu = task_cpu(p),
+
+                .imbalance_pct = 112,
+
+                .best_task = NULL,
+                .best_imp = 0,
+                .best_cpu = -1
         };
         struct sched_domain *sd;
-        int cpu;
-        struct task_group *tg = task_group(p);
-        unsigned long weight;
-        bool balanced;
-        int imbalance_pct, idx = -1;
+        unsigned long faults;
+        int nid, cpu, ret;
 
         /*
-         * Find the lowest common scheduling domain covering the nodes of both
-         * the CPU the task is currently running on and the target NUMA node.
+         * Pick the lowest SD_NUMA domain, as that would have the smallest
+         * imbalance and would be the first to start moving tasks about.
+         *
+         * And we want to avoid any moving of tasks about, as that would create
+         * random movement of tasks -- counter the numa conditions we're trying
+         * to satisfy here.
         */
         rcu_read_lock();
-        for_each_domain(env.src_cpu, sd) {
-                if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
-                        /*
-                         * busy_idx is used for the load decision as it is the
-                         * same index used by the regular load balancer for an
-                         * active cpu.
-                         */
-                        idx = sd->busy_idx;
-                        imbalance_pct = sd->imbalance_pct;
-                        break;
-                }
-        }
+        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+        env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
 
-        if (WARN_ON_ONCE(idx == -1))
-                return 0;
+        faults = task_faults(p, env.src_nid);
+        update_numa_stats(&env.src_stats, env.src_nid);
 
-        /*
-         * XXX the below is mostly nicked from wake_affine(); we should
-         * see about sharing a bit if at all possible; also it might want
-         * some per entity weight love.
-         */
-        weight = p->se.load.weight;
-        env.src_stats.load = source_load(env.src_cpu, idx);
-        env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
-        env.src_stats.eff_load *= power_of(env.src_cpu);
-        env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
-
-        for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
-                env.dst_cpu = cpu;
-                env.dst_stats.load = target_load(cpu, idx);
-
-                /* If the CPU is idle, use it */
-                if (!env.dst_stats.load) {
-                        env.best_cpu = cpu;
-                        goto migrate;
-                }
+        /* Find an alternative node with relatively better statistics */
+        for_each_online_node(nid) {
+                long imp;
 
-                /* Otherwise check the target CPU load */
-                env.dst_stats.eff_load = 100;
-                env.dst_stats.eff_load *= power_of(cpu);
-                env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+                if (nid == env.src_nid)
+                        continue;
 
-                /*
-                 * Destination is considered balanced if the destination CPU is
-                 * less loaded than the source CPU. Unfortunately there is a
-                 * risk that a task running on a lightly loaded CPU will not
-                 * migrate to its preferred node due to load imbalances.
-                 */
-                balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
-                if (!balanced)
+                /* Only consider nodes that recorded more faults */
+                imp = task_faults(p, nid) - faults;
+                if (imp < 0)
                         continue;
 
-                if (env.dst_stats.eff_load < env.best_load) {
-                        env.best_load = env.dst_stats.eff_load;
-                        env.best_cpu = cpu;
+                env.dst_nid = nid;
+                update_numa_stats(&env.dst_stats, env.dst_nid);
+                for_each_cpu(cpu, cpumask_of_node(nid)) {
+                        /* Skip this CPU if the source task cannot migrate */
+                        if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                                continue;
+
+                        env.dst_cpu = cpu;
+                        task_numa_compare(&env, imp);
                 }
         }
 
-migrate:
-        return migrate_task_to(p, env.best_cpu);
+        /* No better CPU than the current one was found. */
+        if (env.best_cpu == -1)
+                return -EAGAIN;
+
+        if (env.best_task == NULL) {
+                int ret = migrate_task_to(p, env.best_cpu);
+                return ret;
+        }
+
+        ret = migrate_swap(p, env.best_task);
+        put_task_struct(env.best_task);
+        return ret;
 }
 
 /* Attempt to migrate a task to a CPU on the preferred node. */
@@ -1050,7 +1163,7 @@ static void task_numa_placement(struct task_struct *p)
 
         /* Find the node with the highest number of faults */
         for_each_online_node(nid) {
-                unsigned long faults;
+                unsigned long faults = 0;
                 int priv, i;
 
                 for (priv = 0; priv < 2; priv++) {
@@ -1060,10 +1173,10 @@ static void task_numa_placement(struct task_struct *p)
                         p->numa_faults[i] >>= 1;
                         p->numa_faults[i] += p->numa_faults_buffer[i];
                         p->numa_faults_buffer[i] = 0;
+
+                        faults += p->numa_faults[i];
                 }
 
-                /* Find maximum private faults */
-                faults = p->numa_faults[task_faults_idx(nid, 1)];
                 if (faults > max_faults) {
                         max_faults = faults;
                         max_nid = nid;
@@ -4455,8 +4568,6 @@ static int move_one_task(struct lb_env *env)
         return 0;
 }
 
-static unsigned long task_h_load(struct task_struct *p);
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4dc92d016aef..691e96964dcc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -610,9 +610,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
         return hsd;
 }
 
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd) {
+                if (sd->flags & flag)
+                        break;
+        }
+
+        return sd;
+}
+
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 
 struct sched_group_power {
         atomic_t ref;