author     Linus Torvalds <torvalds@linux-foundation.org>  2017-10-14 15:20:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-10-14 15:20:38 -0400
commit     a339b351304d5e6b02c7cf8eed895d181e64bce0 (patch)
tree       5335b3fd01a73ddf9f9edaadbc67fdae91b3f7e2 /kernel
parent     7b764cedcb1a04e795795dd0fa38570467583be3 (diff)
parent     024c9d2faebdad3fb43fe49ad68e91a36190f1e2 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Three fixes that address an SMP balancing performance regression"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Ensure load_balance() respects the active_mask
  sched/core: Address more wake_affine() regressions
  sched/core: Fix wake_affine() performance regression
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c      | 140
-rw-r--r--  kernel/sched/features.h  |   3
2 files changed, 49 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
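
A quick way to see what the new wake_affine_weight() arithmetic decides is to replay the comparison with plain integers outside the kernel. The sketch below is only an illustration: prefer_waking_cpu() is a made-up name, the load, capacity and imbalance_pct figures are invented stand-ins for what target_load(), source_load(), capacity_of() and the sched_domain would supply at runtime, WA_BIAS is assumed enabled, and the sync special case is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Toy replay of the wake_affine_weight() decision with invented numbers.
 * this_load/prev_load stand in for target_load()/source_load(), and the
 * capacities for capacity_of(); the WA_BIAS branches are taken as enabled.
 */
static bool prefer_waking_cpu(int64_t this_load, int64_t prev_load,
			      int64_t this_capacity, int64_t prev_capacity,
			      int64_t task_load, int imbalance_pct)
{
	int64_t this_eff_load = this_load;
	int64_t prev_eff_load = prev_load;

	/* The waking CPU is charged with the extra task, biased by 100. */
	this_eff_load += task_load;
	this_eff_load *= 100;
	this_eff_load *= prev_capacity;

	/* The previous CPU sheds the task and gets the imbalance_pct headroom. */
	prev_eff_load -= task_load;
	prev_eff_load *= 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= this_capacity;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	/* Hypothetical figures: both CPUs at full capacity (1024). */
	bool affine = prefer_waking_cpu(/* this_load */ 300, /* prev_load */ 700,
					/* this_capacity */ 1024, /* prev_capacity */ 1024,
					/* task_load */ 100, /* imbalance_pct */ 117);

	printf("pull the wakee to the waking CPU? %s\n", affine ? "yes" : "no");
	return 0;
}

With these numbers the waking CPU side evaluates to 400 * 100 * 1024 and the previous CPU side to 600 * 108 * 1024, so the wakeup is treated as affine; raise this_load past prev_load and the bias quickly flips the answer.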
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
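
The guard added above exists because the balance softirq can race with CPU hotplug: by the time should_we_balance() runs, env->dst_cpu may no longer be part of env->cpus, and the pass must simply be skipped. A minimal, purely illustrative sketch of that pattern, using a plain bitmask in place of the kernel's cpumask and a hypothetical should_we_balance_sim() helper:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Toy illustration of the consistency check: if the CPU that would run the
 * balance pass has dropped out of the allowed mask (e.g. it went offline
 * between the softirq being raised and running), bail out instead of
 * balancing on its behalf. The bitmask stands in for env->cpus.
 */
static bool should_we_balance_sim(int dst_cpu, uint64_t allowed_cpus)
{
	if (!(allowed_cpus & (1ULL << dst_cpu)))
		return false;	/* stale environment, skip this pass */

	/* ... the usual "am I the designated balance CPU?" logic follows ... */
	return true;
}

int main(void)
{
	uint64_t allowed = 0xfULL;	/* hypothetical: CPUs 0-3 online */

	printf("balance on CPU 2? %d\n", should_we_balance_sim(2, allowed));

	allowed &= ~(1ULL << 2);	/* CPU 2 goes offline mid-flight */
	printf("balance on CPU 2? %d\n", should_we_balance_sim(2, allowed));
	return 0;
}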
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
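
On kernels built with CONFIG_SCHED_DEBUG, SCHED_FEAT() bits such as these are normally exposed through /sys/kernel/debug/sched_features, where writing the NO_-prefixed name (for example NO_WA_BIAS) turns the corresponding knob off at runtime. That is a property of the existing feature-flag machinery rather than anything added by this merge, but it is the usual way to check which of WA_IDLE, WA_WEIGHT and WA_BIAS matters for a given workload.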