-rw-r--r--   include/linux/sched/topology.h |   8
-rw-r--r--   kernel/sched/fair.c            | 126
-rw-r--r--   kernel/sched/features.h        |   1
3 files changed, 16 insertions(+), 119 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
-
-	/*
-	 * Some variables from the most recent sd_lb_stats for this domain,
-	 * used by wake_affine().
-	 */
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..28cabed85387 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
-
-	stats->nr_running = READ_ONCE(sds->nr_running);
-	stats->load = READ_ONCE(sds->load);
-	stats->capacity = READ_ONCE(sds->capacity);
-	stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-	return true;
-}
-
 /*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
  *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it checks if the waking CPU is
+ * (or will be) idle.
  */
+
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
-	s64 this_eff_load, prev_eff_load;
-	unsigned long task_load;
-
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
-	if (sync) {
-		unsigned long current_load = task_h_load(current);
-
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
-			return true;
-
-		this_stats.load -= current_load;
-	}
-
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+	if (idle_cpu(this_cpu))
 		return true;
 
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
-	task_load = task_h_load(p);
-
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
-
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return this_eff_load <= prev_eff_load;
+	return false;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
-
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	bool affine = false;
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
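
Note: to make the behavioural change in this hunk easier to compare, the following stand-alone C sketch models both decisions in userspace. It is illustrative only: llc_stats, the example load/capacity numbers, and the imbalance_pct value are made up for the demo, and the helpers merely mirror the arithmetic visible above rather than reuse kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the per-LLC numbers the old heuristic read. */
struct llc_stats {
	unsigned long nr_running;
	unsigned long load;
	unsigned long capacity;
};

/*
 * Model of the removed comparison: pull the task to the waking LLC only if
 * the scaled load after the move stays within the (halved) imbalance_pct
 * margin of the previous LLC's scaled load.
 */
static bool old_wake_affine_llc(const struct llc_stats *prev,
				const struct llc_stats *cur,
				unsigned long task_load, int imbalance_pct)
{
	long long this_eff_load = 100;
	long long prev_eff_load = 100 + (imbalance_pct - 100) / 2;

	this_eff_load *= prev->capacity;
	prev_eff_load *= cur->capacity;

	this_eff_load *= cur->load + task_load;
	prev_eff_load *= prev->load - task_load;

	return this_eff_load <= prev_eff_load;
}

/*
 * Model of the new check: pull the task only if the waking CPU is idle, or
 * it is a sync wakeup and the waker is the only task on the waking CPU.
 */
static bool new_wake_affine_idle(bool this_cpu_idle, bool sync,
				 unsigned int this_rq_nr_running)
{
	if (this_cpu_idle)
		return true;

	if (sync && this_rq_nr_running == 1)
		return true;

	return false;
}

int main(void)
{
	struct llc_stats prev = { .nr_running = 3, .load = 2048, .capacity = 4096 };
	struct llc_stats cur  = { .nr_running = 1, .load =  512, .capacity = 4096 };

	printf("old heuristic pulls the task: %d\n",
	       old_wake_affine_llc(&prev, &cur, 300, 117));
	printf("new heuristic pulls the task: %d\n",
	       new_wake_affine_idle(false, true, 1));
	return 0;
}

With the numbers above both models agree; they diverge once the waking CPU is busy on a non-sync wakeup, where the new check simply refuses to pull while the old one could still move the task if the scaled loads favoured it.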
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
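
Note: the removed comment above points at why these aggregates were dropped rather than repaired: on NUMA domains the scheduling groups can overlap, so a sum over per-group totals counts some CPUs more than once. The short program below is a purely hypothetical illustration of that overcounting; the group layout and loads are invented, not read from a real topology.

#include <stdio.h>

#define NR_CPUS		4
#define NR_GROUPS	3
#define GROUP_SIZE	2

/*
 * Invented, NUMA-style overlapping groups: CPUs 1 and 2 each sit in two
 * groups, so a sum of per-group totals counts their load twice.
 */
static const int group_cpu[NR_GROUPS][GROUP_SIZE] = { {0, 1}, {1, 2}, {2, 3} };
static const unsigned long cpu_load[NR_CPUS] = { 100, 100, 100, 100 };

int main(void)
{
	unsigned long per_cpu_sum = 0, per_group_sum = 0;
	int g, i;

	for (i = 0; i < NR_CPUS; i++)
		per_cpu_sum += cpu_load[i];

	for (g = 0; g < NR_GROUPS; g++)
		for (i = 0; i < GROUP_SIZE; i++)
			per_group_sum += cpu_load[group_cpu[g][i]];

	printf("true load of the domain:   %lu\n", per_cpu_sum);   /* 400 */
	printf("sum over the group totals: %lu\n", per_group_sum); /* 600 */
	return 0;
}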
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..0a519f8c224d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
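
Note: the new WA_IDLE line is consulted via sched_feat() in the fair.c hunk above. As a rough userspace model (assumption: the real kernel generates the feature enum and mask from this table and, with SCHED_DEBUG, may use static keys and a debugfs control file rather than a plain bitmask), the gate amounts to one default-on bit that can be cleared at runtime:

#include <stdio.h>

/* Feature bits, in the order of the SCHED_FEAT() table above. */
enum {
	__FEAT_LB_MIN,
	__FEAT_ATTACH_AGE_LOAD,
	__FEAT_WA_IDLE,
};

/* Defaults mirroring the table: LB_MIN=false, ATTACH_AGE_LOAD=true, WA_IDLE=true. */
static unsigned long sched_features =
	(0UL << __FEAT_LB_MIN) |
	(1UL << __FEAT_ATTACH_AGE_LOAD) |
	(1UL << __FEAT_WA_IDLE);

/* Userspace stand-in for the kernel's sched_feat() test. */
#define sched_feat(name) (!!(sched_features & (1UL << __FEAT_##name)))

int main(void)
{
	printf("WA_IDLE enabled: %d\n", sched_feat(WA_IDLE));

	/* model of disabling the feature at runtime (NO_WA_IDLE) */
	sched_features &= ~(1UL << __FEAT_WA_IDLE);
	printf("WA_IDLE enabled: %d\n", sched_feat(WA_IDLE));
	return 0;
}

On kernels built with SCHED_DEBUG, such features can typically be flipped at runtime by writing the feature name, or NO_<name>, to the sched_features file under debugfs.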