author     Linus Torvalds <torvalds@linux-foundation.org>  2017-10-14 15:20:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-10-14 15:20:38 -0400
commit     a339b351304d5e6b02c7cf8eed895d181e64bce0 (patch)
tree       5335b3fd01a73ddf9f9edaadbc67fdae91b3f7e2 /kernel
parent     7b764cedcb1a04e795795dd0fa38570467583be3 (diff)
parent     024c9d2faebdad3fb43fe49ad68e91a36190f1e2 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Three fixes that address an SMP balancing performance regression"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Ensure load_balance() respects the active_mask
sched/core: Address more wake_affine() regressions
sched/core: Fix wake_affine() performance regression
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/fair.c      | 140
-rw-r--r--  kernel/sched/features.h  |   3
2 files changed, 49 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..d3f3094856fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */
 
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;
 
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return true;
+	return false;
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);
 
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;
 
-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}
 
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);
 
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);
 
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);
 
 	return this_eff_load <= prev_eff_load;
 }
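The core of the new wake_affine_weight() is a single cross-multiplied comparison: pull the task to the waking CPU only if its load plus the task's load, scaled by the previous CPU's capacity (and by 100 when WA_BIAS is set), does not exceed the previous CPU's load minus the task's load, scaled by the waking CPU's capacity and by the imbalance_pct bias. The following standalone sketch shows just that arithmetic; it is not kernel code, the numbers are invented, the sync-wakeup discount is omitted, and cpu_sample merely stands in for what target_load()/source_load(), capacity_of() and task_h_load() would return.

/*
 * Userspace sketch of the wake_affine_weight() comparison above.
 * All inputs are made-up example values, not kernel data.
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_sample {
	long long load;		/* stand-in for target_load()/source_load() */
	long long capacity;	/* stand-in for capacity_of() */
};

/* Returns true when the waking CPU looks like the cheaper place to run. */
static bool wake_affine_weight_sketch(struct cpu_sample waker,
				      struct cpu_sample prev,
				      long long task_load,
				      int imbalance_pct, bool wa_bias)
{
	long long this_eff_load = waker.load + task_load;
	long long prev_eff_load = prev.load - task_load;

	if (wa_bias)
		this_eff_load *= 100;
	this_eff_load *= prev.capacity;

	if (wa_bias)
		prev_eff_load *= 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= waker.capacity;

	return this_eff_load <= prev_eff_load;
}

int main(void)
{
	struct cpu_sample waker = { .load = 400, .capacity = 1024 };
	struct cpu_sample prev  = { .load = 900, .capacity = 1024 };

	/* imbalance_pct of 117 is only an example value */
	printf("pull to waking CPU: %s\n",
	       wake_affine_weight_sketch(waker, prev, 200, 117, true) ?
	       "yes" : "no");
	return 0;
}

With these numbers the biased comparison is 600*100*1024 against 700*108*1024, so the task is pulled to the waking CPU; the bias term makes pulling slightly harder than staying put.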
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;
 
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
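With both WA_IDLE and WA_WEIGHT enabled (the defaults added in features.h below), wake_affine_idle() acts as a cheap fast path: if the waking CPU is already idle, or this is a sync wakeup and the waker is its only runnable task, the wakee is pulled without looking at load at all. Only when that check does not decide does wake_affine_weight() run the load comparison sketched above.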
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running,	sds->total_running);
-	WRITE_ONCE(shared->load,	sds->total_load);
-	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
@@ -8098,6 +8043,13 @@ static int should_we_balance(struct lb_env *env)
 	int cpu, balance_cpu = -1;
 
 	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
+	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.
 	 */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..319ed0e8a347 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,6 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
+SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_BIAS, true)
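On kernels built with CONFIG_SCHED_DEBUG (and with debugfs mounted at /sys/kernel/debug) these feature bits can be flipped at runtime, which helps when checking which of the three heuristics affects a given workload: 'echo NO_WA_BIAS > /sys/kernel/debug/sched_features' drops the imbalance_pct bias, and 'echo WA_BIAS > /sys/kernel/debug/sched_features' restores it; WA_IDLE and WA_WEIGHT can be toggled the same way.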