diff options
| -rw-r--r-- | include/linux/sched/topology.h | 8 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 126 | ||||
| -rw-r--r-- | kernel/sched/features.h | 1 |
3 files changed, 16 insertions, 119 deletions
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index d7b6dab956ec..7d065abc7a47 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h | |||
| @@ -71,14 +71,6 @@ struct sched_domain_shared { | |||
| 71 | atomic_t ref; | 71 | atomic_t ref; |
| 72 | atomic_t nr_busy_cpus; | 72 | atomic_t nr_busy_cpus; |
| 73 | int has_idle_cores; | 73 | int has_idle_cores; |
| 74 | |||
| 75 | /* | ||
| 76 | * Some variables from the most recent sd_lb_stats for this domain, | ||
| 77 | * used by wake_affine(). | ||
| 78 | */ | ||
| 79 | unsigned long nr_running; | ||
| 80 | unsigned long load; | ||
| 81 | unsigned long capacity; | ||
| 82 | }; | 74 | }; |
| 83 | 75 | ||
| 84 | struct sched_domain { | 76 | struct sched_domain { |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..28cabed85387 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p) | |||
| 5356 | return 1; | 5356 | return 1; |
| 5357 | } | 5357 | } |
| 5358 | 5358 | ||
| 5359 | struct llc_stats { | ||
| 5360 | unsigned long nr_running; | ||
| 5361 | unsigned long load; | ||
| 5362 | unsigned long capacity; | ||
| 5363 | int has_capacity; | ||
| 5364 | }; | ||
| 5365 | |||
| 5366 | static bool get_llc_stats(struct llc_stats *stats, int cpu) | ||
| 5367 | { | ||
| 5368 | struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | ||
| 5369 | |||
| 5370 | if (!sds) | ||
| 5371 | return false; | ||
| 5372 | |||
| 5373 | stats->nr_running = READ_ONCE(sds->nr_running); | ||
| 5374 | stats->load = READ_ONCE(sds->load); | ||
| 5375 | stats->capacity = READ_ONCE(sds->capacity); | ||
| 5376 | stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); | ||
| 5377 | |||
| 5378 | return true; | ||
| 5379 | } | ||
| 5380 | |||
| 5381 | /* | 5359 | /* |
| 5382 | * Can a task be moved from prev_cpu to this_cpu without causing a load | 5360 | * The purpose of wake_affine() is to quickly determine on which CPU we can run |
| 5383 | * imbalance that would trigger the load balancer? | 5361 | * soonest. For the purpose of speed we only consider the waking and previous |
| 5362 | * CPU. | ||
| 5384 | * | 5363 | * |
| 5385 | * Since we're running on 'stale' values, we might in fact create an imbalance | 5364 | * wake_affine_idle() - only considers 'now', it checks if the waking CPU is (or |
| 5386 | * but recomputing these values is expensive, as that'd mean iterating 2 cache | 5365 | * will be) idle. |
| 5387 | * domains worth of CPUs. | ||
| 5388 | */ | 5366 | */ |
| 5367 | |||
| 5389 | static bool | 5368 | static bool |
| 5390 | wake_affine_llc(struct sched_domain *sd, struct task_struct *p, | 5369 | wake_affine_idle(struct sched_domain *sd, struct task_struct *p, |
| 5391 | int this_cpu, int prev_cpu, int sync) | 5370 | int this_cpu, int prev_cpu, int sync) |
| 5392 | { | 5371 | { |
| 5393 | struct llc_stats prev_stats, this_stats; | 5372 | if (idle_cpu(this_cpu)) |
| 5394 | s64 this_eff_load, prev_eff_load; | ||
| 5395 | unsigned long task_load; | ||
| 5396 | |||
| 5397 | if (!get_llc_stats(&prev_stats, prev_cpu) || | ||
| 5398 | !get_llc_stats(&this_stats, this_cpu)) | ||
| 5399 | return false; | ||
| 5400 | |||
| 5401 | /* | ||
| 5402 | * If sync wakeup then subtract the (maximum possible) | ||
| 5403 | * effect of the currently running task from the load | ||
| 5404 | * of the current LLC. | ||
| 5405 | */ | ||
| 5406 | if (sync) { | ||
| 5407 | unsigned long current_load = task_h_load(current); | ||
| 5408 | |||
| 5409 | /* in this case load hits 0 and this LLC is considered 'idle' */ | ||
| 5410 | if (current_load > this_stats.load) | ||
| 5411 | return true; | ||
| 5412 | |||
| 5413 | this_stats.load -= current_load; | ||
| 5414 | } | ||
| 5415 | |||
| 5416 | /* | ||
| 5417 | * The has_capacity stuff is not SMT aware, but by trying to balance | ||
| 5418 | * the nr_running on both ends we try and fill the domain at equal | ||
| 5419 | * rates, thereby first consuming cores before siblings. | ||
| 5420 | */ | ||
| 5421 | |||
| 5422 | /* if the old cache has capacity, stay there */ | ||
| 5423 | if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) | ||
| 5424 | return false; | ||
| 5425 | |||
| 5426 | /* if this cache has capacity, come here */ | ||
| 5427 | if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) | ||
| 5428 | return true; | 5373 | return true; |
| 5429 | 5374 | ||
| 5430 | /* | 5375 | if (sync && cpu_rq(this_cpu)->nr_running == 1) |
| 5431 | * Check to see if we can move the load without causing too much | 5376 | return true; |
| 5432 | * imbalance. | ||
| 5433 | */ | ||
| 5434 | task_load = task_h_load(p); | ||
| 5435 | |||
| 5436 | this_eff_load = 100; | ||
| 5437 | this_eff_load *= prev_stats.capacity; | ||
| 5438 | |||
| 5439 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 5440 | prev_eff_load *= this_stats.capacity; | ||
| 5441 | |||
| 5442 | this_eff_load *= this_stats.load + task_load; | ||
| 5443 | prev_eff_load *= prev_stats.load - task_load; | ||
| 5444 | 5377 | ||
| 5445 | return this_eff_load <= prev_eff_load; | 5378 | return false; |
| 5446 | } | 5379 | } |
| 5447 | 5380 | ||
| 5448 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, | 5381 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
| 5449 | int prev_cpu, int sync) | 5382 | int prev_cpu, int sync) |
| 5450 | { | 5383 | { |
| 5451 | int this_cpu = smp_processor_id(); | 5384 | int this_cpu = smp_processor_id(); |
| 5452 | bool affine; | 5385 | bool affine = false; |
| 5453 | |||
| 5454 | /* | ||
| 5455 | * Default to no affine wakeups; wake_affine() should not effect a task | ||
| 5456 | * placement the load-balancer feels inclined to undo. The conservative | ||
| 5457 | * option is therefore to not move tasks when they wake up. | ||
| 5458 | */ | ||
| 5459 | affine = false; | ||
| 5460 | 5386 | ||
| 5461 | /* | 5387 | if (sched_feat(WA_IDLE) && !affine) |
| 5462 | * If the wakeup is across cache domains, try to evaluate if movement | 5388 | affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); |
| 5463 | * makes sense, otherwise rely on select_idle_siblings() to do | ||
| 5464 | * placement inside the cache domain. | ||
| 5465 | */ | ||
| 5466 | if (!cpus_share_cache(prev_cpu, this_cpu)) | ||
| 5467 | affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); | ||
| 5468 | 5389 | ||
| 5469 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); | 5390 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
| 5470 | if (affine) { | 5391 | if (affine) { |
| @@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) | |||
| 7600 | */ | 7521 | */ |
| 7601 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) | 7522 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
| 7602 | { | 7523 | { |
| 7603 | struct sched_domain_shared *shared = env->sd->shared; | ||
| 7604 | struct sched_domain *child = env->sd->child; | 7524 | struct sched_domain *child = env->sd->child; |
| 7605 | struct sched_group *sg = env->sd->groups; | 7525 | struct sched_group *sg = env->sd->groups; |
| 7606 | struct sg_lb_stats *local = &sds->local_stat; | 7526 | struct sg_lb_stats *local = &sds->local_stat; |
| @@ -7672,22 +7592,6 @@ next_group: | |||
| 7672 | if (env->dst_rq->rd->overload != overload) | 7592 | if (env->dst_rq->rd->overload != overload) |
| 7673 | env->dst_rq->rd->overload = overload; | 7593 | env->dst_rq->rd->overload = overload; |
| 7674 | } | 7594 | } |
| 7675 | |||
| 7676 | if (!shared) | ||
| 7677 | return; | ||
| 7678 | |||
| 7679 | /* | ||
| 7680 | * Since these are sums over groups they can contain some CPUs | ||
| 7681 | * multiple times for the NUMA domains. | ||
| 7682 | * | ||
| 7683 | * Currently only wake_affine_llc() and find_busiest_group() | ||
| 7684 | * uses these numbers, only the last is affected by this problem. | ||
| 7685 | * | ||
| 7686 | * XXX fix that. | ||
| 7687 | */ | ||
| 7688 | WRITE_ONCE(shared->nr_running, sds->total_running); | ||
| 7689 | WRITE_ONCE(shared->load, sds->total_load); | ||
| 7690 | WRITE_ONCE(shared->capacity, sds->total_capacity); | ||
| 7691 | } | 7595 | } |
| 7692 | 7596 | ||
| 7693 | /** | 7597 | /** |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d3fb15555291..0a519f8c224d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) | |||
| 81 | SCHED_FEAT(LB_MIN, false) | 81 | SCHED_FEAT(LB_MIN, false) |
| 82 | SCHED_FEAT(ATTACH_AGE_LOAD, true) | 82 | SCHED_FEAT(ATTACH_AGE_LOAD, true) |
| 83 | 83 | ||
| 84 | SCHED_FEAT(WA_IDLE, true) | ||
