author     Peter Zijlstra <peterz@infradead.org>   2017-09-27 05:35:30 -0400
committer  Ingo Molnar <mingo@kernel.org>          2017-10-10 04:14:02 -0400
commit     d153b153446f7d8832bb2ebd92309c8a6003b3bb (patch)
tree       d5fd41d9fe691221dc007ac6778eec50a70d32dc
parent     529a86e063e9ff625c4ff247d8aa17d8072444fb (diff)
sched/core: Fix wake_affine() performance regression
Eric reported a sysbench regression against commit:

  3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed
against his v3.10 enterprise kernel.

PRE (current tip/master):

 ivb-ep sysbench:

   2: [30 secs]  transactions: 64110  (2136.94 per sec.)
   5: [30 secs]  transactions: 143644 (4787.99 per sec.)
  10: [30 secs]  transactions: 274298 (9142.93 per sec.)
  20: [30 secs]  transactions: 418683 (13955.45 per sec.)
  40: [30 secs]  transactions: 320731 (10690.15 per sec.)
  80: [30 secs]  transactions: 355096 (11834.28 per sec.)

 hsw-ex NAS:

 OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01
 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89
 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93
 lu.C.x_threads_144_run_1.log: Time in seconds = 434.68
 lu.C.x_threads_144_run_2.log: Time in seconds = 405.36
 lu.C.x_threads_144_run_3.log: Time in seconds = 433.83

POST (+patch):

 ivb-ep sysbench:

   2: [30 secs]  transactions: 64494  (2149.75 per sec.)
   5: [30 secs]  transactions: 145114 (4836.99 per sec.)
  10: [30 secs]  transactions: 278311 (9276.69 per sec.)
  20: [30 secs]  transactions: 437169 (14571.60 per sec.)
  40: [30 secs]  transactions: 669837 (22326.73 per sec.)
  80: [30 secs]  transactions: 631739 (21055.88 per sec.)

 hsw-ex NAS:

 lu.C.x_threads_144_run_1.log: Time in seconds = 23.36
 lu.C.x_threads_144_run_2.log: Time in seconds = 22.96
 lu.C.x_threads_144_run_3.log: Time in seconds = 22.52

This patch takes out all the shiny wake_affine() stuff and goes back to
utter basics. Between the two CPUs involved with the wakeup (the CPU
doing the wakeup and the CPU we ran on previously) pick the CPU we can
run on _now_.

This restores much of the regressions against the older kernels, but
leaves some ground in the overloaded case. The default-enabled WA_WEIGHT
(which will be introduced in the next patch) is an attempt to address
the overloaded situation.

Reported-by: Eric Farman <farman@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jinpuwang@gmail.com
Cc: vcaputo@pengaru.com
Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
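As a rough illustration of that rule, here is a minimal userspace sketch (mocked CPU state, not the kernel code in the diff below) of the decision the patch reduces wake_affine() to: pull the task to the waking CPU only if that CPU is idle now, or is about to go idle because this is a sync wakeup and the waker is its only runnable task; otherwise leave the task near its previous CPU.

  /* Standalone sketch; cpu_state is a mock, not a kernel type. */
  #include <stdbool.h>
  #include <stdio.h>

  struct cpu_state {
          bool idle;               /* nothing runnable on this CPU right now */
          unsigned int nr_running; /* runnable tasks on this CPU's runqueue  */
  };

  static bool prefer_waking_cpu(const struct cpu_state *waker_cpu, bool sync)
  {
          if (waker_cpu->idle)
                  return true;                    /* can run there _now_ */

          /* sync wakeup: the waker is about to sleep and is the only task
           * on its CPU, so that CPU is about to become idle */
          if (sync && waker_cpu->nr_running == 1)
                  return true;

          return false;                           /* stay near prev_cpu */
  }

  int main(void)
  {
          struct cpu_state waker = { .idle = false, .nr_running = 1 };

          printf("sync wakeup, lone waker  -> pull: %d\n", prefer_waking_cpu(&waker, true));
          printf("plain wakeup, busy waker -> pull: %d\n", prefer_waking_cpu(&waker, false));
          return 0;
  }

The kernel version of this check is wake_affine_idle() in the fair.c hunk below, gated by the new WA_IDLE scheduler feature.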
-rw-r--r--  include/linux/sched/topology.h  |   8
-rw-r--r--  kernel/sched/fair.c             | 126
-rw-r--r--  kernel/sched/features.h         |   1
3 files changed, 16 insertions(+), 119 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
         atomic_t        ref;
         atomic_t        nr_busy_cpus;
         int             has_idle_cores;
-
-        /*
-         * Some variables from the most recent sd_lb_stats for this domain,
-         * used by wake_affine().
-         */
-        unsigned long   nr_running;
-        unsigned long   load;
-        unsigned long   capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..28cabed85387 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
         return 1;
 }
 
-struct llc_stats {
-        unsigned long   nr_running;
-        unsigned long   load;
-        unsigned long   capacity;
-        int             has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-        struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-        if (!sds)
-                return false;
-
-        stats->nr_running       = READ_ONCE(sds->nr_running);
-        stats->load             = READ_ONCE(sds->load);
-        stats->capacity         = READ_ONCE(sds->capacity);
-        stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-        return true;
-}
-
 /*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
  *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
  */
+
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-                int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int prev_cpu, int sync)
 {
-        struct llc_stats prev_stats, this_stats;
-        s64 this_eff_load, prev_eff_load;
-        unsigned long task_load;
-
-        if (!get_llc_stats(&prev_stats, prev_cpu) ||
-            !get_llc_stats(&this_stats, this_cpu))
-                return false;
-
-        /*
-         * If sync wakeup then subtract the (maximum possible)
-         * effect of the currently running task from the load
-         * of the current LLC.
-         */
-        if (sync) {
-                unsigned long current_load = task_h_load(current);
-
-                /* in this case load hits 0 and this LLC is considered 'idle' */
-                if (current_load > this_stats.load)
-                        return true;
-
-                this_stats.load -= current_load;
-        }
-
-        /*
-         * The has_capacity stuff is not SMT aware, but by trying to balance
-         * the nr_running on both ends we try and fill the domain at equal
-         * rates, thereby first consuming cores before siblings.
-         */
-
-        /* if the old cache has capacity, stay there */
-        if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-                return false;
-
-        /* if this cache has capacity, come here */
-        if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+        if (idle_cpu(this_cpu))
                 return true;
 
-        /*
-         * Check to see if we can move the load without causing too much
-         * imbalance.
-         */
-        task_load = task_h_load(p);
-
-        this_eff_load = 100;
-        this_eff_load *= prev_stats.capacity;
-
-        prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-        prev_eff_load *= this_stats.capacity;
-
-        this_eff_load *= this_stats.load + task_load;
-        prev_eff_load *= prev_stats.load - task_load;
+        if (sync && cpu_rq(this_cpu)->nr_running == 1)
+                return true;
 
-        return this_eff_load <= prev_eff_load;
+        return false;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                        int prev_cpu, int sync)
 {
         int this_cpu = smp_processor_id();
-        bool affine;
-
-        /*
-         * Default to no affine wakeups; wake_affine() should not effect a task
-         * placement the load-balancer feels inclined to undo. The conservative
-         * option is therefore to not move tasks when they wake up.
-         */
-        affine = false;
+        bool affine = false;
 
-        /*
-         * If the wakeup is across cache domains, try to evaluate if movement
-         * makes sense, otherwise rely on select_idle_siblings() to do
-         * placement inside the cache domain.
-         */
-        if (!cpus_share_cache(prev_cpu, this_cpu))
-                affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+        if (sched_feat(WA_IDLE) && !affine)
+                affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
         if (affine) {
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-        struct sched_domain_shared *shared = env->sd->shared;
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ next_group:
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
         }
-
-        if (!shared)
-                return;
-
-        /*
-         * Since these are sums over groups they can contain some CPUs
-         * multiple times for the NUMA domains.
-         *
-         * Currently only wake_affine_llc() and find_busiest_group()
-         * uses these numbers, only the last is affected by this problem.
-         *
-         * XXX fix that.
-         */
-        WRITE_ONCE(shared->nr_running,  sds->total_running);
-        WRITE_ONCE(shared->load,        sds->total_load);
-        WRITE_ONCE(shared->capacity,    sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..0a519f8c224d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
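For contrast with that minimal check, the removed wake_affine_llc() path compared scaled "effective loads" of the two LLC domains, biasing the previous domain by half of the sched_domain imbalance_pct margin. Below is a standalone sketch of just that comparison (struct and function names here are illustrative, and the LLC statistics are passed in directly rather than read from sd_llc_shared as the removed code did):

  #include <stdbool.h>
  #include <stdio.h>

  struct llc_sketch {
          long load;      /* aggregate load of the LLC domain     */
          long capacity;  /* aggregate capacity of the LLC domain */
  };

  /* Mirrors the imbalance check from the removed wake_affine_llc(). */
  static bool move_to_waking_llc(const struct llc_sketch *prev_llc,
                                 const struct llc_sketch *this_llc,
                                 long task_load, int imbalance_pct)
  {
          /* e.g. imbalance_pct = 125 weights the previous side by 112/100 */
          long this_eff_load = 100 * prev_llc->capacity *
                               (this_llc->load + task_load);
          long prev_eff_load = (100 + (imbalance_pct - 100) / 2) *
                               this_llc->capacity *
                               (prev_llc->load - task_load);

          return this_eff_load <= prev_eff_load;
  }

  int main(void)
  {
          struct llc_sketch prev = { .load = 2048, .capacity = 1024 };
          struct llc_sketch dest = { .load = 1024, .capacity = 1024 };

          /* task load 256: the waking LLC is clearly less loaded -> migrate */
          printf("migrate: %d\n", move_to_waking_llc(&prev, &dest, 256, 125));
          return 0;
  }

With a typical imbalance_pct of 125 the previous side is weighted by 112/100, so the task only migrates when the waking LLC is noticeably less loaded; the patch drops this in favour of the simple idleness check above.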