author     Linus Torvalds <torvalds@linux-foundation.org>  2017-10-14 15:20:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-10-14 15:20:38 -0400
commit     a339b351304d5e6b02c7cf8eed895d181e64bce0 (patch)
tree       5335b3fd01a73ddf9f9edaadbc67fdae91b3f7e2 /kernel
parent     7b764cedcb1a04e795795dd0fa38570467583be3 (diff)
parent     024c9d2faebdad3fb43fe49ad68e91a36190f1e2 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Three fixes that address an SMP balancing performance regression"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Ensure load_balance() respects the active_mask
sched/core: Address more wake_affine() regressions
sched/core: Fix wake_affine() performance regression
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c      | 140
-rw-r--r--  kernel/sched/features.h  |   3
2 files changed, 49 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
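The core of the new wake_affine_weight() is a single cross-multiplied comparison: pull the task to the waking CPU only if its load plus the task's load, scaled by the previous CPU's capacity (and by 100 when WA_BIAS is set), does not exceed the previous CPU's load minus the task's load, scaled by the waking CPU's capacity and by the imbalance_pct bias. The following standalone sketch shows just that arithmetic; it is not kernel code, the numbers are invented, the sync-wakeup discount is omitted, and cpu_sample merely stands in for what target_load()/source_load(), capacity_of() and task_h_load() would return.

/*
 * Userspace sketch of the wake_affine_weight() comparison above.
 * All inputs are made-up example values, not kernel data.
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_sample {
	long long load;		/* stand-in for target_load()/source_load() */
	long long capacity;	/* stand-in for capacity_of() */
};

/* Returns true when the waking CPU looks like the cheaper place to run. */
static bool wake_affine_weight_sketch(struct cpu_sample waker,
				      struct cpu_sample prev,
				      long long task_load,
				      int imbalance_pct, bool wa_bias)
{
	long long this_eff_load = waker.load + task_load;
	long long prev_eff_load = prev.load - task_load;

	if (wa_bias)
		this_eff_load *= 100;
	this_eff_load *= prev.capacity;

	if (wa_bias)
		prev_eff_load *= 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= waker.capacity;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	struct cpu_sample waker = { .load = 400, .capacity = 1024 };
	struct cpu_sample prev  = { .load = 900, .capacity = 1024 };

	/* imbalance_pct of 117 is only an example value */
	printf("pull to waking CPU: %s\n",
	       wake_affine_weight_sketch(waker, prev, 200, 117, true) ?
	       "yes" : "no");
	return 0;
}

With these numbers the biased comparison is 600*100*1024 against 700*108*1024, so the task is pulled to the waking CPU; the bias term makes pulling slightly harder than staying put.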
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
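With both WA_IDLE and WA_WEIGHT enabled (the defaults added in features.h below), wake_affine_idle() acts as a cheap fast path: if the waking CPU is already idle, or this is a sync wakeup and the waker is its only runnable task, the wakee is pulled without looking at load at all. Only when that check does not decide does wake_affine_weight() run the load comparison sketched above.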
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running,	sds->total_running);
-	WRITE_ONCE(shared->load,	sds->total_load);
-	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
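On kernels built with CONFIG_SCHED_DEBUG (and with debugfs mounted at /sys/kernel/debug) these feature bits can be flipped at runtime, which helps when checking which of the three heuristics affects a given workload: 'echo NO_WA_BIAS > /sys/kernel/debug/sched_features' drops the imbalance_pct bias, and 'echo WA_BIAS > /sys/kernel/debug/sched_features' restores it; WA_IDLE and WA_WEIGHT can be toggled the same way.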