diff options
author | Peter Zijlstra <peterz@infradead.org> | 2013-10-07 06:29:33 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2013-10-09 08:48:10 -0400 |
commit | 0ec8aa00f2b4dc457836ef4e2662b02483e94fb7 (patch) | |
tree | ffd621a5f639a10b1a7213892649e512ea0ee1ba /kernel/sched/fair.c | |
parent | ca28aa53dd95868c9e38917b9881c09dacfacf1a (diff) |
sched/numa: Avoid migrating tasks that are placed on their preferred node
This patch classifies scheduler domains and runqueues into types depending
on the number of tasks that care about their NUMA placement and the number
that are currently running on their preferred node. The types are
regular: There are tasks running that do not care about their NUMA
placement.
remote: There are tasks running that care about their placement but are
currently running on a node remote to their ideal placement
all: No distinction
To implement this the patch tracks the number of tasks that are optimally
NUMA placed (rq->nr_preferred_running) and the number of tasks running
that care about their placement (nr_numa_running). The load balancer
uses this information to avoid migrating ideally placed NUMA tasks as long
as better options for load balancing exist. For example, it will not
consider balancing between a group whose tasks are all perfectly placed
and a group with remote tasks.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1381141781-10992-56-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 120 |
1 files changed, 108 insertions, 12 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 423316cdee07..5166b9b1af70 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p) | |||
888 | */ | 888 | */ |
889 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | 889 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; |
890 | 890 | ||
891 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
892 | { | ||
893 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | ||
894 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | ||
895 | } | ||
896 | |||
897 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
898 | { | ||
899 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | ||
900 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | ||
901 | } | ||
902 | |||
891 | struct numa_group { | 903 | struct numa_group { |
892 | atomic_t refcount; | 904 | atomic_t refcount; |
893 | 905 | ||
@@ -1227,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p) | |||
1227 | if (env.best_cpu == -1) | 1239 | if (env.best_cpu == -1) |
1228 | return -EAGAIN; | 1240 | return -EAGAIN; |
1229 | 1241 | ||
1242 | sched_setnuma(p, env.dst_nid); | ||
1243 | |||
1230 | if (env.best_task == NULL) { | 1244 | if (env.best_task == NULL) { |
1231 | int ret = migrate_task_to(p, env.best_cpu); | 1245 | int ret = migrate_task_to(p, env.best_cpu); |
1232 | return ret; | 1246 | return ret; |
@@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p) | |||
1342 | /* Preferred node as the node with the most faults */ | 1356 | /* Preferred node as the node with the most faults */ |
1343 | if (max_faults && max_nid != p->numa_preferred_nid) { | 1357 | if (max_faults && max_nid != p->numa_preferred_nid) { |
1344 | /* Update the preferred nid and migrate task if possible */ | 1358 | /* Update the preferred nid and migrate task if possible */ |
1345 | p->numa_preferred_nid = max_nid; | 1359 | sched_setnuma(p, max_nid); |
1346 | p->numa_migrate_seq = 1; | ||
1347 | numa_migrate_preferred(p); | 1360 | numa_migrate_preferred(p); |
1348 | } | 1361 | } |
1349 | } | 1362 | } |
@@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
1741 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 1754 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
1742 | { | 1755 | { |
1743 | } | 1756 | } |
1757 | |||
1758 | static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
1759 | { | ||
1760 | } | ||
1761 | |||
1762 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
1763 | { | ||
1764 | } | ||
1744 | #endif /* CONFIG_NUMA_BALANCING */ | 1765 | #endif /* CONFIG_NUMA_BALANCING */ |
1745 | 1766 | ||
1746 | static void | 1767 | static void |
@@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1750 | if (!parent_entity(se)) | 1771 | if (!parent_entity(se)) |
1751 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 1772 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
1752 | #ifdef CONFIG_SMP | 1773 | #ifdef CONFIG_SMP |
1753 | if (entity_is_task(se)) | 1774 | if (entity_is_task(se)) { |
1754 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 1775 | struct rq *rq = rq_of(cfs_rq); |
1776 | |||
1777 | account_numa_enqueue(rq, task_of(se)); | ||
1778 | list_add(&se->group_node, &rq->cfs_tasks); | ||
1779 | } | ||
1755 | #endif | 1780 | #endif |
1756 | cfs_rq->nr_running++; | 1781 | cfs_rq->nr_running++; |
1757 | } | 1782 | } |
@@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1762 | update_load_sub(&cfs_rq->load, se->load.weight); | 1787 | update_load_sub(&cfs_rq->load, se->load.weight); |
1763 | if (!parent_entity(se)) | 1788 | if (!parent_entity(se)) |
1764 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 1789 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
1765 | if (entity_is_task(se)) | 1790 | if (entity_is_task(se)) { |
1791 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | ||
1766 | list_del_init(&se->group_node); | 1792 | list_del_init(&se->group_node); |
1793 | } | ||
1767 | cfs_rq->nr_running--; | 1794 | cfs_rq->nr_running--; |
1768 | } | 1795 | } |
1769 | 1796 | ||
@@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
4605 | 4632 | ||
4606 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 4633 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
4607 | 4634 | ||
4635 | enum fbq_type { regular, remote, all }; | ||
4636 | |||
4608 | #define LBF_ALL_PINNED 0x01 | 4637 | #define LBF_ALL_PINNED 0x01 |
4609 | #define LBF_NEED_BREAK 0x02 | 4638 | #define LBF_NEED_BREAK 0x02 |
4610 | #define LBF_DST_PINNED 0x04 | 4639 | #define LBF_DST_PINNED 0x04 |
@@ -4631,6 +4660,8 @@ struct lb_env { | |||
4631 | unsigned int loop; | 4660 | unsigned int loop; |
4632 | unsigned int loop_break; | 4661 | unsigned int loop_break; |
4633 | unsigned int loop_max; | 4662 | unsigned int loop_max; |
4663 | |||
4664 | enum fbq_type fbq_type; | ||
4634 | }; | 4665 | }; |
4635 | 4666 | ||
4636 | /* | 4667 | /* |
@@ -5092,6 +5123,10 @@ struct sg_lb_stats { | |||
5092 | unsigned int group_weight; | 5123 | unsigned int group_weight; |
5093 | int group_imb; /* Is there an imbalance in the group ? */ | 5124 | int group_imb; /* Is there an imbalance in the group ? */ |
5094 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5125 | int group_has_capacity; /* Is there extra capacity in the group? */ |
5126 | #ifdef CONFIG_NUMA_BALANCING | ||
5127 | unsigned int nr_numa_running; | ||
5128 | unsigned int nr_preferred_running; | ||
5129 | #endif | ||
5095 | }; | 5130 | }; |
5096 | 5131 | ||
5097 | /* | 5132 | /* |
@@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5409 | 5444 | ||
5410 | sgs->group_load += load; | 5445 | sgs->group_load += load; |
5411 | sgs->sum_nr_running += nr_running; | 5446 | sgs->sum_nr_running += nr_running; |
5447 | #ifdef CONFIG_NUMA_BALANCING | ||
5448 | sgs->nr_numa_running += rq->nr_numa_running; | ||
5449 | sgs->nr_preferred_running += rq->nr_preferred_running; | ||
5450 | #endif | ||
5412 | sgs->sum_weighted_load += weighted_cpuload(i); | 5451 | sgs->sum_weighted_load += weighted_cpuload(i); |
5413 | if (idle_cpu(i)) | 5452 | if (idle_cpu(i)) |
5414 | sgs->idle_cpus++; | 5453 | sgs->idle_cpus++; |
@@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
5474 | return false; | 5513 | return false; |
5475 | } | 5514 | } |
5476 | 5515 | ||
5516 | #ifdef CONFIG_NUMA_BALANCING | ||
5517 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
5518 | { | ||
5519 | if (sgs->sum_nr_running > sgs->nr_numa_running) | ||
5520 | return regular; | ||
5521 | if (sgs->sum_nr_running > sgs->nr_preferred_running) | ||
5522 | return remote; | ||
5523 | return all; | ||
5524 | } | ||
5525 | |||
5526 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
5527 | { | ||
5528 | if (rq->nr_running > rq->nr_numa_running) | ||
5529 | return regular; | ||
5530 | if (rq->nr_running > rq->nr_preferred_running) | ||
5531 | return remote; | ||
5532 | return all; | ||
5533 | } | ||
5534 | #else | ||
5535 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
5536 | { | ||
5537 | return all; | ||
5538 | } | ||
5539 | |||
5540 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
5541 | { | ||
5542 | return regular; | ||
5543 | } | ||
5544 | #endif /* CONFIG_NUMA_BALANCING */ | ||
5545 | |||
5477 | /** | 5546 | /** |
5478 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 5547 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
5479 | * @env: The load balancing environment. | 5548 | * @env: The load balancing environment. |
5480 | * @balance: Should we balance. | 5549 | * @balance: Should we balance. |
5481 | * @sds: variable to hold the statistics for this sched_domain. | 5550 | * @sds: variable to hold the statistics for this sched_domain. |
5482 | */ | 5551 | */ |
5483 | static inline void update_sd_lb_stats(struct lb_env *env, | 5552 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
5484 | struct sd_lb_stats *sds) | ||
5485 | { | 5553 | { |
5486 | struct sched_domain *child = env->sd->child; | 5554 | struct sched_domain *child = env->sd->child; |
5487 | struct sched_group *sg = env->sd->groups; | 5555 | struct sched_group *sg = env->sd->groups; |
@@ -5538,6 +5606,9 @@ next_group: | |||
5538 | 5606 | ||
5539 | sg = sg->next; | 5607 | sg = sg->next; |
5540 | } while (sg != env->sd->groups); | 5608 | } while (sg != env->sd->groups); |
5609 | |||
5610 | if (env->sd->flags & SD_NUMA) | ||
5611 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | ||
5541 | } | 5612 | } |
5542 | 5613 | ||
5543 | /** | 5614 | /** |
@@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
5841 | int i; | 5912 | int i; |
5842 | 5913 | ||
5843 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5914 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
5844 | unsigned long power = power_of(i); | 5915 | unsigned long power, capacity, wl; |
5845 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5916 | enum fbq_type rt; |
5846 | SCHED_POWER_SCALE); | 5917 | |
5847 | unsigned long wl; | 5918 | rq = cpu_rq(i); |
5919 | rt = fbq_classify_rq(rq); | ||
5848 | 5920 | ||
5921 | /* | ||
5922 | * We classify groups/runqueues into three groups: | ||
5923 | * - regular: there are !numa tasks | ||
5924 | * - remote: there are numa tasks that run on the 'wrong' node | ||
5925 | * - all: there is no distinction | ||
5926 | * | ||
5927 | * In order to avoid migrating ideally placed numa tasks, | ||
5928 | * ignore those when there's better options. | ||
5929 | * | ||
5930 | * If we ignore the actual busiest queue to migrate another | ||
5931 | * task, the next balance pass can still reduce the busiest | ||
5932 | * queue by moving tasks around inside the node. | ||
5933 | * | ||
5934 | * If we cannot move enough load due to this classification | ||
5935 | * the next pass will adjust the group classification and | ||
5936 | * allow migration of more tasks. | ||
5937 | * | ||
5938 | * Both cases only affect the total convergence complexity. | ||
5939 | */ | ||
5940 | if (rt > env->fbq_type) | ||
5941 | continue; | ||
5942 | |||
5943 | power = power_of(i); | ||
5944 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
5849 | if (!capacity) | 5945 | if (!capacity) |
5850 | capacity = fix_small_capacity(env->sd, group); | 5946 | capacity = fix_small_capacity(env->sd, group); |
5851 | 5947 | ||
5852 | rq = cpu_rq(i); | ||
5853 | wl = weighted_cpuload(i); | 5948 | wl = weighted_cpuload(i); |
5854 | 5949 | ||
5855 | /* | 5950 | /* |
@@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5966 | .idle = idle, | 6061 | .idle = idle, |
5967 | .loop_break = sched_nr_migrate_break, | 6062 | .loop_break = sched_nr_migrate_break, |
5968 | .cpus = cpus, | 6063 | .cpus = cpus, |
6064 | .fbq_type = all, | ||
5969 | }; | 6065 | }; |
5970 | 6066 | ||
5971 | /* | 6067 | /* |