author     Peter Zijlstra <peterz@infradead.org>   2017-09-27 05:35:30 -0400
committer  Ingo Molnar <mingo@kernel.org>          2017-10-10 04:14:02 -0400
commit     d153b153446f7d8832bb2ebd92309c8a6003b3bb (patch)
tree       d5fd41d9fe691221dc007ac6778eec50a70d32dc
parent     529a86e063e9ff625c4ff247d8aa17d8072444fb (diff)
sched/core: Fix wake_affine() performance regression
Eric reported a sysbench regression against commit:

  3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed
against his v3.10 enterprise kernel.

PRE (current tip/master):

 ivb-ep sysbench:

   2: [30 secs]  transactions: 64110  (2136.94 per sec.)
   5: [30 secs]  transactions: 143644 (4787.99 per sec.)
  10: [30 secs]  transactions: 274298 (9142.93 per sec.)
  20: [30 secs]  transactions: 418683 (13955.45 per sec.)
  40: [30 secs]  transactions: 320731 (10690.15 per sec.)
  80: [30 secs]  transactions: 355096 (11834.28 per sec.)

 hsw-ex NAS:

 OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01
 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89
 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93
 lu.C.x_threads_144_run_1.log: Time in seconds = 434.68
 lu.C.x_threads_144_run_2.log: Time in seconds = 405.36
 lu.C.x_threads_144_run_3.log: Time in seconds = 433.83

POST (+patch):

 ivb-ep sysbench:

   2: [30 secs]  transactions: 64494  (2149.75 per sec.)
   5: [30 secs]  transactions: 145114 (4836.99 per sec.)
  10: [30 secs]  transactions: 278311 (9276.69 per sec.)
  20: [30 secs]  transactions: 437169 (14571.60 per sec.)
  40: [30 secs]  transactions: 669837 (22326.73 per sec.)
  80: [30 secs]  transactions: 631739 (21055.88 per sec.)

 hsw-ex NAS:

 lu.C.x_threads_144_run_1.log: Time in seconds = 23.36
 lu.C.x_threads_144_run_2.log: Time in seconds = 22.96
 lu.C.x_threads_144_run_3.log: Time in seconds = 22.52

This patch takes out all the shiny wake_affine() stuff and goes back to
utter basics. Between the two CPUs involved with the wakeup (the CPU
doing the wakeup and the CPU we ran on previously) pick the CPU we can
run on _now_.

This restores much of the regressions against the older kernels, but
leaves some ground in the overloaded case. The default-enabled WA_WEIGHT
(which will be introduced in the next patch) is an attempt to address
the overloaded situation.

Reported-by: Eric Farman <farman@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jinpuwang@gmail.com
Cc: vcaputo@pengaru.com
Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
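As a rough illustration of that rule, here is a minimal userspace sketch (mocked CPU state, not the kernel code in the diff below) of the decision the patch reduces wake_affine() to: pull the task to the waking CPU only if that CPU is idle now, or is about to go idle because this is a sync wakeup and the waker is its only runnable task; otherwise leave the task near its previous CPU.

  /* Standalone sketch; cpu_state is a mock, not a kernel type. */
  #include <stdbool.h>
  #include <stdio.h>

  struct cpu_state {
          bool idle;               /* nothing runnable on this CPU right now */
          unsigned int nr_running; /* runnable tasks on this CPU's runqueue  */
  };

  static bool prefer_waking_cpu(const struct cpu_state *waker_cpu, bool sync)
  {
          if (waker_cpu->idle)
                  return true;                    /* can run there _now_ */

          /* sync wakeup: the waker is about to sleep and is the only task
           * on its CPU, so that CPU is about to become idle */
          if (sync && waker_cpu->nr_running == 1)
                  return true;

          return false;                           /* stay near prev_cpu */
  }

  int main(void)
  {
          struct cpu_state waker = { .idle = false, .nr_running = 1 };

          printf("sync wakeup, lone waker  -> pull: %d\n", prefer_waking_cpu(&waker, true));
          printf("plain wakeup, busy waker -> pull: %d\n", prefer_waking_cpu(&waker, false));
          return 0;
  }

The kernel version of this check is wake_affine_idle() in the fair.c hunk below, gated by the new WA_IDLE scheduler feature.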
-rw-r--r--  include/linux/sched/topology.h  |   8
-rw-r--r--  kernel/sched/fair.c             | 126
-rw-r--r--  kernel/sched/features.h         |   1
3 files changed, 16 insertions(+), 119 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
         atomic_t        ref;
         atomic_t        nr_busy_cpus;
         int             has_idle_cores;
-
-        /*
-         * Some variables from the most recent sd_lb_stats for this domain,
-         * used by wake_affine().
-         */
-        unsigned long   nr_running;
-        unsigned long   load;
-        unsigned long   capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..28cabed85387 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
         return 1;
 }
 
-struct llc_stats {
-        unsigned long   nr_running;
-        unsigned long   load;
-        unsigned long   capacity;
-        int             has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-        struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-        if (!sds)
-                return false;
-
-        stats->nr_running       = READ_ONCE(sds->nr_running);
-        stats->load             = READ_ONCE(sds->load);
-        stats->capacity         = READ_ONCE(sds->capacity);
-        stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-        return true;
-}
-
 /*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
  *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ * will be) idle.
  */
+
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-                int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int prev_cpu, int sync)
 {
-        struct llc_stats prev_stats, this_stats;
-        s64 this_eff_load, prev_eff_load;
-        unsigned long task_load;
-
-        if (!get_llc_stats(&prev_stats, prev_cpu) ||
-            !get_llc_stats(&this_stats, this_cpu))
-                return false;
-
-        /*
-         * If sync wakeup then subtract the (maximum possible)
-         * effect of the currently running task from the load
-         * of the current LLC.
-         */
-        if (sync) {
-                unsigned long current_load = task_h_load(current);
-
-                /* in this case load hits 0 and this LLC is considered 'idle' */
-                if (current_load > this_stats.load)
-                        return true;
-
-                this_stats.load -= current_load;
-        }
-
-        /*
-         * The has_capacity stuff is not SMT aware, but by trying to balance
-         * the nr_running on both ends we try and fill the domain at equal
-         * rates, thereby first consuming cores before siblings.
-         */
-
-        /* if the old cache has capacity, stay there */
-        if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-                return false;
-
-        /* if this cache has capacity, come here */
-        if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+        if (idle_cpu(this_cpu))
                 return true;
 
-        /*
-         * Check to see if we can move the load without causing too much
-         * imbalance.
-         */
-        task_load = task_h_load(p);
-
-        this_eff_load = 100;
-        this_eff_load *= prev_stats.capacity;
-
-        prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-        prev_eff_load *= this_stats.capacity;
-
-        this_eff_load *= this_stats.load + task_load;
-        prev_eff_load *= prev_stats.load - task_load;
+        if (sync && cpu_rq(this_cpu)->nr_running == 1)
+                return true;
 
-        return this_eff_load <= prev_eff_load;
+        return false;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                        int prev_cpu, int sync)
 {
         int this_cpu = smp_processor_id();
-        bool affine;
-
-        /*
-         * Default to no affine wakeups; wake_affine() should not effect a task
-         * placement the load-balancer feels inclined to undo. The conservative
-         * option is therefore to not move tasks when they wake up.
-         */
-        affine = false;
+        bool affine = false;
 
-        /*
-         * If the wakeup is across cache domains, try to evaluate if movement
-         * makes sense, otherwise rely on select_idle_siblings() to do
-         * placement inside the cache domain.
-         */
-        if (!cpus_share_cache(prev_cpu, this_cpu))
-                affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+        if (sched_feat(WA_IDLE) && !affine)
+                affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
         if (affine) {
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-        struct sched_domain_shared *shared = env->sd->shared;
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ next_group:
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
         }
-
-        if (!shared)
-                return;
-
-        /*
-         * Since these are sums over groups they can contain some CPUs
-         * multiple times for the NUMA domains.
-         *
-         * Currently only wake_affine_llc() and find_busiest_group()
-         * uses these numbers, only the last is affected by this problem.
-         *
-         * XXX fix that.
-         */
-        WRITE_ONCE(shared->nr_running,  sds->total_running);
-        WRITE_ONCE(shared->load,        sds->total_load);
-        WRITE_ONCE(shared->capacity,    sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..0a519f8c224d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
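For contrast with that minimal check, the removed wake_affine_llc() path compared scaled "effective loads" of the two LLC domains, biasing the previous domain by half of the sched_domain imbalance_pct margin. Below is a standalone sketch of just that comparison (struct and function names here are illustrative, and the LLC statistics are passed in directly rather than read from sd_llc_shared as the removed code did):

  #include <stdbool.h>
  #include <stdio.h>

  struct llc_sketch {
          long load;      /* aggregate load of the LLC domain     */
          long capacity;  /* aggregate capacity of the LLC domain */
  };

  /* Mirrors the imbalance check from the removed wake_affine_llc(). */
  static bool move_to_waking_llc(const struct llc_sketch *prev_llc,
                                 const struct llc_sketch *this_llc,
                                 long task_load, int imbalance_pct)
  {
          /* e.g. imbalance_pct = 125 weights the previous side by 112/100 */
          long this_eff_load = 100 * prev_llc->capacity *
                               (this_llc->load + task_load);
          long prev_eff_load = (100 + (imbalance_pct - 100) / 2) *
                               this_llc->capacity *
                               (prev_llc->load - task_load);

          return this_eff_load <= prev_eff_load;
  }

  int main(void)
  {
          struct llc_sketch prev = { .load = 2048, .capacity = 1024 };
          struct llc_sketch dest = { .load = 1024, .capacity = 1024 };

          /* task load 256: the waking LLC is clearly less loaded -> migrate */
          printf("migrate: %d\n", move_to_waking_llc(&prev, &dest, 256, 125));
          return 0;
  }

With a typical imbalance_pct of 125 the previous side is weighted by 112/100, so the task only migrates when the waking LLC is noticeably less loaded; the patch drops this in favour of the simple idleness check above.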