author     Linus Torvalds <torvalds@linux-foundation.org>   2015-04-13 13:47:34 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-04-13 13:47:34 -0400
commit     49d2953c72c64182ef2dcac64f6979c0b4e25db7
tree       d339e498799617c8f79c760020f8442507cc381b   /kernel/sched
parent     cc76ee75a9d3201eeacc576d17fbc1511f673010
parent     62a935b256f68a71697716595347209fb5275426
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"Major changes:
- Reworked CPU capacity code, for better SMP load balancing on
systems with asymmetric CPUs. (Vincent Guittot, Morten Rasmussen)
- Reworked RT task SMP balancing to be push based instead of pull
based, to reduce latencies on large CPU count systems. (Steven
Rostedt)
- SCHED_DEADLINE support updates and fixes. (Juri Lelli)
- SCHED_DEADLINE task migration support during CPU hotplug. (Wanpeng Li)
- x86 mwait-idle optimizations and fixes. (Mike Galbraith, Len Brown)
- sched/numa improvements. (Rik van Riel)
- various cleanups"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
sched/core: Drop debugging leftover trace_printk call
sched/deadline: Support DL task migration during CPU hotplug
sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()
sched/deadline: Always enqueue on previous rq when dl_task_timer() fires
sched/core: Remove unused argument from init_[rt|dl]_rq()
sched/deadline: Fix rt runtime corruption when dl fails its global constraints
sched/deadline: Avoid a superfluous check
sched: Improve load balancing in the presence of idle CPUs
sched: Optimize freq invariant accounting
sched: Move CFS tasks to CPUs with higher capacity
sched: Add SD_PREFER_SIBLING for SMT level
sched: Remove unused struct sched_group_capacity::capacity_orig
sched: Replace capacity_factor by usage
sched: Calculate CPU's usage statistic and put it into struct sg_lb_stats::group_usage
sched: Add struct rq::cpu_capacity_orig
sched: Make scale_rt invariant with frequency
sched: Make sched entity usage tracking scale-invariant
sched: Remove frequency scaling from cpu_capacity
sched: Track group sched_entity usage contributions
sched: Add sched_avg::utilization_avg_contrib
...
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c      |  96
-rw-r--r--   kernel/sched/deadline.c  |  77
-rw-r--r--   kernel/sched/debug.c     |  12
-rw-r--r--   kernel/sched/fair.c      | 425
-rw-r--r--   kernel/sched/features.h  |  13
-rw-r--r--   kernel/sched/rt.c        | 181
-rw-r--r--   kernel/sched/sched.h     |  38

7 files changed, 612 insertions(+), 230 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d5f6f6d14c2..261af7bfcb67 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
| @@ -690,6 +690,23 @@ static inline bool got_nohz_idle_kick(void) | |||
| 690 | bool sched_can_stop_tick(void) | 690 | bool sched_can_stop_tick(void) |
| 691 | { | 691 | { |
| 692 | /* | 692 | /* |
| 693 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
| 694 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
| 695 | */ | ||
| 696 | if (current->policy == SCHED_FIFO) | ||
| 697 | return true; | ||
| 698 | |||
| 699 | /* | ||
| 700 | * Round-robin realtime tasks time slice with other tasks at the same | ||
| 701 | * realtime priority. Is this task the only one at this priority? | ||
| 702 | */ | ||
| 703 | if (current->policy == SCHED_RR) { | ||
| 704 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
| 705 | |||
| 706 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
| 707 | } | ||
| 708 | |||
| 709 | /* | ||
| 693 | * More than one running task need preemption. | 710 | * More than one running task need preemption. |
| 694 | * nr_running update is assumed to be visible | 711 | * nr_running update is assumed to be visible |
| 695 | * after IPI is sent from wakers. | 712 | * after IPI is sent from wakers. |
| @@ -5335,36 +5352,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
| 5335 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5352 | static int sched_cpu_inactive(struct notifier_block *nfb, |
| 5336 | unsigned long action, void *hcpu) | 5353 | unsigned long action, void *hcpu) |
| 5337 | { | 5354 | { |
| 5338 | unsigned long flags; | ||
| 5339 | long cpu = (long)hcpu; | ||
| 5340 | struct dl_bw *dl_b; | ||
| 5341 | |||
| 5342 | switch (action & ~CPU_TASKS_FROZEN) { | 5355 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5343 | case CPU_DOWN_PREPARE: | 5356 | case CPU_DOWN_PREPARE: |
| 5344 | set_cpu_active(cpu, false); | 5357 | set_cpu_active((long)hcpu, false); |
| 5345 | |||
| 5346 | /* explicitly allow suspend */ | ||
| 5347 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 5348 | bool overflow; | ||
| 5349 | int cpus; | ||
| 5350 | |||
| 5351 | rcu_read_lock_sched(); | ||
| 5352 | dl_b = dl_bw_of(cpu); | ||
| 5353 | |||
| 5354 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5355 | cpus = dl_bw_cpus(cpu); | ||
| 5356 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 5357 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5358 | |||
| 5359 | rcu_read_unlock_sched(); | ||
| 5360 | |||
| 5361 | if (overflow) | ||
| 5362 | return notifier_from_errno(-EBUSY); | ||
| 5363 | } | ||
| 5364 | return NOTIFY_OK; | 5358 | return NOTIFY_OK; |
| 5359 | default: | ||
| 5360 | return NOTIFY_DONE; | ||
| 5365 | } | 5361 | } |
| 5366 | |||
| 5367 | return NOTIFY_DONE; | ||
| 5368 | } | 5362 | } |
| 5369 | 5363 | ||
| 5370 | static int __init migration_init(void) | 5364 | static int __init migration_init(void) |
| @@ -5445,17 +5439,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5445 | break; | 5439 | break; |
| 5446 | } | 5440 | } |
| 5447 | 5441 | ||
| 5448 | /* | ||
| 5449 | * Even though we initialize ->capacity to something semi-sane, | ||
| 5450 | * we leave capacity_orig unset. This allows us to detect if | ||
| 5451 | * domain iteration is still funny without causing /0 traps. | ||
| 5452 | */ | ||
| 5453 | if (!group->sgc->capacity_orig) { | ||
| 5454 | printk(KERN_CONT "\n"); | ||
| 5455 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
| 5456 | break; | ||
| 5457 | } | ||
| 5458 | |||
| 5459 | if (!cpumask_weight(sched_group_cpus(group))) { | 5442 | if (!cpumask_weight(sched_group_cpus(group))) { |
| 5460 | printk(KERN_CONT "\n"); | 5443 | printk(KERN_CONT "\n"); |
| 5461 | printk(KERN_ERR "ERROR: empty group\n"); | 5444 | printk(KERN_ERR "ERROR: empty group\n"); |
| @@ -5939,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5939 | * die on a /0 trap. | 5922 | * die on a /0 trap. |
| 5940 | */ | 5923 | */ |
| 5941 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
| 5942 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
| 5943 | 5925 | ||
| 5944 | /* | 5926 | /* |
| 5945 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the |
| @@ -6250,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6250 | */ | 6232 | */ |
| 6251 | 6233 | ||
| 6252 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
| 6235 | sd->flags |= SD_PREFER_SIBLING; | ||
| 6253 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; |
| 6254 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ |
| 6255 | 6238 | ||
| @@ -7015,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 7015 | */ | 6998 | */ |
| 7016 | 6999 | ||
| 7017 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: |
| 7018 | case CPU_DOWN_FAILED: | ||
| 7019 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); |
| 7020 | break; | 7002 | break; |
| 7021 | default: | 7003 | default: |
| @@ -7027,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 7027 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
| 7028 | void *hcpu) | 7010 | void *hcpu) |
| 7029 | { | 7011 | { |
| 7030 | switch (action) { | 7012 | unsigned long flags; |
| 7013 | long cpu = (long)hcpu; | ||
| 7014 | struct dl_bw *dl_b; | ||
| 7015 | |||
| 7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 7031 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: |
| 7018 | /* explicitly allow suspend */ | ||
| 7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 7020 | bool overflow; | ||
| 7021 | int cpus; | ||
| 7022 | |||
| 7023 | rcu_read_lock_sched(); | ||
| 7024 | dl_b = dl_bw_of(cpu); | ||
| 7025 | |||
| 7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 7027 | cpus = dl_bw_cpus(cpu); | ||
| 7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 7030 | |||
| 7031 | rcu_read_unlock_sched(); | ||
| 7032 | |||
| 7033 | if (overflow) | ||
| 7034 | return notifier_from_errno(-EBUSY); | ||
| 7035 | } | ||
| 7032 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); |
| 7033 | break; | 7037 | break; |
| 7034 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: |
| @@ -7173,8 +7177,8 @@ void __init sched_init(void) | |||
| 7173 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; |
| 7174 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; |
| 7175 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); |
| 7176 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); |
| 7177 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); |
| 7178 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7179 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
| 7180 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| @@ -7214,7 +7218,7 @@ void __init sched_init(void) | |||
| 7214 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP |
| 7215 | rq->sd = NULL; | 7219 | rq->sd = NULL; |
| 7216 | rq->rd = NULL; | 7220 | rq->rd = NULL; |
| 7217 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; |
| 7218 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; |
| 7219 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; |
| 7220 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; |
| @@ -7813,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
| 7813 | } | 7817 | } |
| 7814 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7815 | 7819 | ||
| 7816 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) |
| 7817 | { | 7821 | { |
| 7818 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); |
| 7819 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); |
| @@ -7914,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 7914 | if (ret) | 7918 | if (ret) |
| 7915 | goto undo; | 7919 | goto undo; |
| 7916 | 7920 | ||
| 7917 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); |
| 7918 | if (ret) | 7922 | if (ret) |
| 7919 | goto undo; | 7923 | goto undo; |
| 7920 | 7924 | ||
| 7921 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); |
| 7922 | if (ret) | 7926 | if (ret) |
| 7923 | goto undo; | 7927 | goto undo; |
| 7924 | 7928 | ||
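The sched_can_stop_tick() hunk at the top of this core.c diff encodes a simple policy: a SCHED_FIFO task never needs the tick for time slicing, and a SCHED_RR task only needs it if another task shares its priority level. A hedged, stand-alone sketch of that decision in plain C with stand-in types (not the kernel API):

```c
#include <stdbool.h>

enum policy { POLICY_NORMAL, POLICY_FIFO, POLICY_RR };

struct list_node {                     /* doubly linked, circular */
	struct list_node *prev, *next;
};

struct task_sketch {
	enum policy policy;
	struct list_node run_list;     /* node in the per-priority RR queue    */
	unsigned int nr_running;       /* runnable tasks on this CPU (stand-in) */
};

/* Does the current task still need the periodic tick? */
static bool rt_tick_needed(const struct task_sketch *curr)
{
	if (curr->policy == POLICY_FIFO)
		return false;          /* highest prio runs, nothing to slice  */

	if (curr->policy == POLICY_RR)
		/* alone at its priority <=> prev and next point to the same node */
		return curr->run_list.prev != curr->run_list.next;

	/* otherwise fall back to the "more than one runnable task" rule */
	return curr->nr_running > 1;
}

int main(void)
{
	struct task_sketch t = { .policy = POLICY_RR, .nr_running = 2 };
	t.run_list.prev = t.run_list.next = &t.run_list;  /* alone at its prio */
	return rt_tick_needed(&t) ? 1 : 0;                /* 0: tick can stop  */
}
```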
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6d9403..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
| @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
| 69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) |
| 73 | { | 73 | { |
| 74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; |
| 75 | 75 | ||
| @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
| 218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
| 222 | |||
| 223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
| 224 | { | ||
| 225 | struct rq *later_rq = NULL; | ||
| 226 | bool fallback = false; | ||
| 227 | |||
| 228 | later_rq = find_lock_later_rq(p, rq); | ||
| 229 | |||
| 230 | if (!later_rq) { | ||
| 231 | int cpu; | ||
| 232 | |||
| 233 | /* | ||
| 234 | * If we cannot preempt any rq, fall back to pick any | ||
| 235 | * online cpu. | ||
| 236 | */ | ||
| 237 | fallback = true; | ||
| 238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
| 239 | if (cpu >= nr_cpu_ids) { | ||
| 240 | /* | ||
| 241 | * Fail to find any suitable cpu. | ||
| 242 | * The task will never come back! | ||
| 243 | */ | ||
| 244 | BUG_ON(dl_bandwidth_enabled()); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * If admission control is disabled we | ||
| 248 | * try a little harder to let the task | ||
| 249 | * run. | ||
| 250 | */ | ||
| 251 | cpu = cpumask_any(cpu_active_mask); | ||
| 252 | } | ||
| 253 | later_rq = cpu_rq(cpu); | ||
| 254 | double_lock_balance(rq, later_rq); | ||
| 255 | } | ||
| 256 | |||
| 257 | deactivate_task(rq, p, 0); | ||
| 258 | set_task_cpu(p, later_rq->cpu); | ||
| 259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
| 260 | |||
| 261 | if (!fallback) | ||
| 262 | resched_curr(later_rq); | ||
| 263 | |||
| 264 | double_unlock_balance(rq, later_rq); | ||
| 265 | } | ||
| 266 | |||
| 221 | #else | 267 | #else |
| 222 | 268 | ||
| 223 | static inline | 269 | static inline |
| @@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 514 | unsigned long flags; | 560 | unsigned long flags; |
| 515 | struct rq *rq; | 561 | struct rq *rq; |
| 516 | 562 | ||
| 517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); |
| 518 | 564 | ||
| 519 | /* | 565 | /* |
| 520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: |
| @@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 536 | sched_clock_tick(); | 582 | sched_clock_tick(); |
| 537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); |
| 538 | 584 | ||
| 585 | #ifdef CONFIG_SMP | ||
| 586 | /* | ||
| 587 | * If we find that the rq the task was on is no longer | ||
| 588 | * available, we need to select a new rq. | ||
| 589 | */ | ||
| 590 | if (unlikely(!rq->online)) { | ||
| 591 | dl_task_offline_migration(rq, p); | ||
| 592 | goto unlock; | ||
| 593 | } | ||
| 594 | #endif | ||
| 595 | |||
| 539 | /* | 596 | /* |
| 540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: |
| 541 | * | 598 | * |
| @@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 569 | push_dl_task(rq); | 626 | push_dl_task(rq); |
| 570 | #endif | 627 | #endif |
| 571 | unlock: | 628 | unlock: |
| 572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); |
| 573 | 630 | ||
| 574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; |
| 575 | } | 632 | } |
| @@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
| 914 | } | 971 | } |
| 915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); |
| 916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); |
| 974 | /* | ||
| 975 | * Tell update_rq_clock() that we've just updated, | ||
| 976 | * so we don't do microscopic update in schedule() | ||
| 977 | * and double the fastpath cost. | ||
| 978 | */ | ||
| 979 | rq_clock_skip_update(rq, true); | ||
| 917 | } | 980 | } |
| 918 | 981 | ||
| 919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP |
| @@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1659 | { | 1722 | { |
| 1660 | int check_resched = 1; | 1723 | int check_resched = 1; |
| 1661 | 1724 | ||
| 1662 | /* | ||
| 1663 | * If p is throttled, don't consider the possibility | ||
| 1664 | * of preempting rq->curr, the check will be done right | ||
| 1665 | * after its runtime will get replenished. | ||
| 1666 | */ | ||
| 1667 | if (unlikely(p->dl.dl_throttled)) | ||
| 1668 | return; | ||
| 1669 | |||
| 1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP |
| 1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
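The dl_task_offline_migration() helper added above falls back through three choices when the CPU a throttled deadline task was on has gone offline: a runqueue it can preempt, then any active CPU in the task's affinity mask, and, only when admission control is disabled, any active CPU at all. A minimal sketch of that ordering with plain arrays standing in for the kernel's cpumask/runqueue machinery (names are illustrative):

```c
#include <assert.h>
#include <stdbool.h>

#define NR_CPUS 4

static int pick_fallback_cpu(const bool active[NR_CPUS],
			     const bool allowed[NR_CPUS],
			     int preemptible_cpu,      /* -1 if none found */
			     bool admission_control)
{
	if (preemptible_cpu >= 0)
		return preemptible_cpu;                /* normal push path  */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (active[cpu] && allowed[cpu])
			return cpu;                    /* any allowed, active CPU */

	/* with admission control enabled this situation must not happen */
	assert(!admission_control);

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (active[cpu])
			return cpu;                    /* last resort       */
	return -1;
}

int main(void)
{
	bool active[NR_CPUS]  = { false, true,  true, true  };
	bool allowed[NR_CPUS] = { true,  false, true, false };
	return pick_fallback_cpu(active, allowed, -1, true) == 2 ? 0 : 1;
}
```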
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf858d25c..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
| @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 71 | if (!se) { | 71 | if (!se) { |
| 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; |
| 73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); |
| 74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); |
| 75 | return; | 75 | return; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 94 | P(se->load.weight); | 94 | P(se->load.weight); |
| 95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
| 96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); |
| 97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); |
| 98 | P(se->avg.avg_period); | ||
| 98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); |
| 100 | P(se->avg.utilization_avg_contrib); | ||
| 99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); |
| 100 | #endif | 102 | #endif |
| 101 | #undef PN | 103 | #undef PN |
| @@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); |
| 215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
| 216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); |
| 219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
| 220 | cfs_rq->utilization_load_avg); | ||
| 217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
| 219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); |
| @@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 636 | P(se.load.weight); | 640 | P(se.load.weight); |
| 637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP |
| 638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); |
| 639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); |
| 644 | P(se.avg.avg_period); | ||
| 640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); |
| 646 | P(se.avg.utilization_avg_contrib); | ||
| 641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); |
| 642 | #endif | 648 | #endif |
| 643 | P(policy); | 649 | P(policy); |
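The utilization_avg_contrib and utilization_load_avg values printed above feed the new get_cpu_usage() helper added in the fair.c diff below, which caps the (possibly transiently over-estimated) utilization at the CPU's original capacity. A small illustration of that capping, assuming the common SCHED_LOAD_SCALE == SCHED_CAPACITY_SCALE == 1024 configuration; this is not the kernel function itself:

```c
#include <stdio.h>

#define LOAD_SCALE 1024UL      /* stand-in for SCHED_LOAD_SCALE */

/* scale raw cfs utilization to the CPU's original capacity, capped */
static unsigned long cpu_usage(unsigned long utilization_load_avg,
			       unsigned long capacity_orig)
{
	if (utilization_load_avg >= LOAD_SCALE)
		return capacity_orig;                          /* cap */

	return utilization_load_avg * capacity_orig / LOAD_SCALE;
}

int main(void)
{
	/* transiently over-estimated usage is capped at the CPU's capacity */
	printf("%lu\n", cpu_usage(1240, 800));   /* -> 800           */
	printf("%lu\n", cpu_usage(512, 800));    /* -> 400 (50%)     */
	return 0;
}
```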
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 241213be507c..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
| @@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
| 670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 671 | 671 | ||
| 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| 673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
| 673 | 674 | ||
| 674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ |
| 675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) |
| @@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
| 677 | u32 slice; | 678 | u32 slice; |
| 678 | 679 | ||
| 679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
| 680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; |
| 681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; |
| 682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); |
| 684 | __update_task_entity_utilization(&p->se); | ||
| 683 | } | 685 | } |
| 684 | #else | 686 | #else |
| 685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) |
| @@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
| 1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, |
| 1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) |
| 1198 | { | 1200 | { |
| 1199 | long imb, old_imb; | ||
| 1200 | long orig_src_load, orig_dst_load; | ||
| 1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; |
| 1202 | long orig_src_load; | ||
| 1203 | long load_a, load_b; | ||
| 1204 | long moved_load; | ||
| 1205 | long imb; | ||
| 1202 | 1206 | ||
| 1203 | /* | 1207 | /* |
| 1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. |
| @@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
| 1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; |
| 1212 | 1216 | ||
| 1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ |
| 1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; |
| 1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; |
| 1220 | if (load_a < load_b) | ||
| 1221 | swap(load_a, load_b); | ||
| 1216 | 1222 | ||
| 1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ |
| 1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - |
| 1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; |
| 1220 | if (imb <= 0) | 1226 | if (imb <= 0) |
| 1221 | return false; | 1227 | return false; |
| 1222 | 1228 | ||
| 1223 | /* | 1229 | /* |
| 1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. |
| 1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, |
| 1232 | * without moving things past the point of balance. | ||
| 1226 | */ | 1233 | */ |
| 1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; |
| 1228 | orig_dst_load = env->dst_stats.load; | ||
| 1229 | 1235 | ||
| 1230 | if (orig_dst_load < orig_src_load) | 1236 | /* |
| 1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, |
| 1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
| 1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. |
| 1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced |
| 1241 | * situation, without crossing over the balance point. | ||
| 1242 | */ | ||
| 1243 | moved_load = orig_src_load - src_load; | ||
| 1235 | 1244 | ||
| 1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) |
| 1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ |
| 1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
| 1248 | else | ||
| 1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
| 1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
| 1238 | } | 1251 | } |
| 1239 | 1252 | ||
| 1240 | /* | 1253 | /* |
| @@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
| 1675 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; |
| 1676 | } else { | 1689 | } else { |
| 1677 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; |
| 1678 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; |
| 1679 | } | 1692 | } |
| 1680 | 1693 | ||
| 1681 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; |
| @@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
| 1765 | } | 1778 | } |
| 1766 | } | 1779 | } |
| 1767 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ |
| 1781 | if (!max_faults) | ||
| 1782 | break; | ||
| 1768 | nodes = max_group; | 1783 | nodes = max_group; |
| 1769 | } | 1784 | } |
| 1770 | return nid; | 1785 | return nid; |
| @@ -2503,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
| 2503 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
| 2504 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
| 2505 | */ | 2520 | */ |
| 2506 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, |
| 2507 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, |
| 2508 | int runnable) | 2523 | int runnable, |
| 2524 | int running) | ||
| 2509 | { | 2525 | { |
| 2510 | u64 delta, periods; | 2526 | u64 delta, periods; |
| 2511 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; |
| 2512 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; |
| 2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
| 2513 | 2530 | ||
| 2514 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; |
| 2515 | /* | 2532 | /* |
| @@ -2531,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2531 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; |
| 2532 | 2549 | ||
| 2533 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ |
| 2534 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; |
| 2535 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { |
| 2536 | /* period roll-over */ | 2553 | /* period roll-over */ |
| 2537 | decayed = 1; | 2554 | decayed = 1; |
| @@ -2544,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2544 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; |
| 2545 | if (runnable) | 2562 | if (runnable) |
| 2546 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; |
| 2547 | sa->runnable_avg_period += delta_w; | 2564 | if (running) |
| 2565 | sa->running_avg_sum += delta_w * scale_freq | ||
| 2566 | >> SCHED_CAPACITY_SHIFT; | ||
| 2567 | sa->avg_period += delta_w; | ||
| 2548 | 2568 | ||
| 2549 | delta -= delta_w; | 2569 | delta -= delta_w; |
| 2550 | 2570 | ||
| @@ -2554,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2554 | 2574 | ||
| 2555 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, |
| 2556 | periods + 1); | 2576 | periods + 1); |
| 2557 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, |
| 2578 | periods + 1); | ||
| 2579 | sa->avg_period = decay_load(sa->avg_period, | ||
| 2558 | periods + 1); | 2580 | periods + 1); |
| 2559 | 2581 | ||
| 2560 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
| 2561 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); |
| 2562 | if (runnable) | 2584 | if (runnable) |
| 2563 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; |
| 2564 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) |
| 2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
| 2588 | >> SCHED_CAPACITY_SHIFT; | ||
| 2589 | sa->avg_period += runnable_contrib; | ||
| 2565 | } | 2590 | } |
| 2566 | 2591 | ||
| 2567 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ |
| 2568 | if (runnable) | 2593 | if (runnable) |
| 2569 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; |
| 2570 | sa->runnable_avg_period += delta; | 2595 | if (running) |
| 2596 | sa->running_avg_sum += delta * scale_freq | ||
| 2597 | >> SCHED_CAPACITY_SHIFT; | ||
| 2598 | sa->avg_period += delta; | ||
| 2571 | 2599 | ||
| 2572 | return decayed; | 2600 | return decayed; |
| 2573 | } | 2601 | } |
| @@ -2584,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
| 2584 | return 0; | 2612 | return 0; |
| 2585 | 2613 | ||
| 2586 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
| 2615 | se->avg.utilization_avg_contrib = | ||
| 2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
| 2587 | 2617 | ||
| 2588 | return decays; | 2618 | return decays; |
| 2589 | } | 2619 | } |
| @@ -2619,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
| 2619 | 2649 | ||
| 2620 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ |
| 2621 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
| 2622 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); |
| 2623 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; |
| 2624 | 2654 | ||
| 2625 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { |
| @@ -2672,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
| 2672 | 2702 | ||
| 2673 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
| 2674 | { | 2704 | { |
| 2675 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, |
| 2706 | runnable, runnable); | ||
| 2676 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
| 2677 | } | 2708 | } |
| 2678 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| @@ -2690,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
| 2690 | 2721 | ||
| 2691 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ |
| 2692 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); |
| 2693 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); |
| 2694 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); |
| 2695 | } | 2726 | } |
| 2696 | 2727 | ||
| @@ -2709,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
| 2709 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; |
| 2710 | } | 2741 | } |
| 2711 | 2742 | ||
| 2743 | |||
| 2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
| 2745 | { | ||
| 2746 | u32 contrib; | ||
| 2747 | |||
| 2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
| 2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
| 2750 | contrib /= (se->avg.avg_period + 1); | ||
| 2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
| 2752 | } | ||
| 2753 | |||
| 2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
| 2755 | { | ||
| 2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
| 2757 | |||
| 2758 | if (entity_is_task(se)) | ||
| 2759 | __update_task_entity_utilization(se); | ||
| 2760 | else | ||
| 2761 | se->avg.utilization_avg_contrib = | ||
| 2762 | group_cfs_rq(se)->utilization_load_avg; | ||
| 2763 | |||
| 2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
| 2765 | } | ||
| 2766 | |||
| 2712 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, |
| 2713 | long load_contrib) | 2768 | long load_contrib) |
| 2714 | { | 2769 | { |
| @@ -2725,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2725 | int update_cfs_rq) | 2780 | int update_cfs_rq) |
| 2726 | { | 2781 | { |
| 2727 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 2728 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; |
| 2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
| 2729 | u64 now; | 2785 | u64 now; |
| 2730 | 2786 | ||
| 2731 | /* | 2787 | /* |
| @@ -2737,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2737 | else | 2793 | else |
| 2738 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); |
| 2739 | 2795 | ||
| 2740 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, |
| 2797 | cfs_rq->curr == se)) | ||
| 2741 | return; | 2798 | return; |
| 2742 | 2799 | ||
| 2743 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); |
| 2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
| 2744 | 2802 | ||
| 2745 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) |
| 2746 | return; | 2804 | return; |
| 2747 | 2805 | ||
| 2748 | if (se->on_rq) | 2806 | if (se->on_rq) { |
| 2749 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; |
| 2750 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; |
| 2809 | } else { | ||
| 2751 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); |
| 2811 | } | ||
| 2752 | } | 2812 | } |
| 2753 | 2813 | ||
| 2754 | /* | 2814 | /* |
| @@ -2823,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2823 | } | 2883 | } |
| 2824 | 2884 | ||
| 2825 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
| 2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
| 2826 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ |
| 2827 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); |
| 2828 | } | 2889 | } |
| @@ -2841,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2841 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); |
| 2842 | 2903 | ||
| 2843 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
| 2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
| 2844 | if (sleep) { | 2906 | if (sleep) { |
| 2845 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
| 2846 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
| @@ -3178,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3178 | */ | 3240 | */ |
| 3179 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); |
| 3180 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); |
| 3243 | update_entity_load_avg(se, 1); | ||
| 3181 | } | 3244 | } |
| 3182 | 3245 | ||
| 3183 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); |
| @@ -4304,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
| 4304 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; |
| 4305 | } | 4368 | } |
| 4306 | 4369 | ||
| 4370 | static unsigned long capacity_orig_of(int cpu) | ||
| 4371 | { | ||
| 4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
| 4373 | } | ||
| 4374 | |||
| 4307 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 4308 | { | 4376 | { |
| 4309 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); |
| @@ -4717,6 +4785,33 @@ next: | |||
| 4717 | done: | 4785 | done: |
| 4718 | return target; | 4786 | return target; |
| 4719 | } | 4787 | } |
| 4788 | /* | ||
| 4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
| 4790 | * tasks. The unit of the return value must be the one of capacity so we can | ||
| 4791 | * compare the usage with the capacity of the CPU that is available for CFS | ||
| 4792 | * task (ie cpu_capacity). | ||
| 4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
| 4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
| 4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
| 4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
| 4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
| 4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
| 4799 | * after migrating tasks until the average stabilizes with the new running | ||
| 4800 | * time. So we need to check that the usage stays into the range | ||
| 4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
| 4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
| 4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
| 4804 | */ | ||
| 4805 | static int get_cpu_usage(int cpu) | ||
| 4806 | { | ||
| 4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
| 4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
| 4809 | |||
| 4810 | if (usage >= SCHED_LOAD_SCALE) | ||
| 4811 | return capacity; | ||
| 4812 | |||
| 4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
| 4814 | } | ||
| 4720 | 4815 | ||
| 4721 | /* | 4816 | /* |
| 4722 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
| @@ -5843,12 +5938,12 @@ struct sg_lb_stats { | |||
| 5843 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 5844 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; |
| 5845 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; |
| 5941 | unsigned long group_usage; /* Total usage of the group */ | ||
| 5846 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
| 5847 | unsigned int group_capacity_factor; | ||
| 5848 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; |
| 5849 | unsigned int group_weight; | 5944 | unsigned int group_weight; |
| 5850 | enum group_type group_type; | 5945 | enum group_type group_type; |
| 5851 | int group_has_free_capacity; | 5946 | int group_no_capacity; |
| 5852 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING |
| 5853 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; |
| 5854 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; |
| @@ -5919,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
| 5919 | return load_idx; | 6014 | return load_idx; |
| 5920 | } | 6015 | } |
| 5921 | 6016 | ||
| 5922 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
| 5923 | { | ||
| 5924 | return SCHED_CAPACITY_SCALE; | ||
| 5925 | } | ||
| 5926 | |||
| 5927 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 5928 | { | ||
| 5929 | return default_scale_capacity(sd, cpu); | ||
| 5930 | } | ||
| 5931 | |||
| 5932 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5933 | { | 6018 | { |
| 5934 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
| @@ -5945,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5945 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) |
| 5946 | { | 6031 | { |
| 5947 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); |
| 5948 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; |
| 5949 | s64 delta; | 6034 | s64 delta; |
| 5950 | 6035 | ||
| 5951 | /* | 6036 | /* |
| @@ -5961,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5961 | 6046 | ||
| 5962 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; |
| 5963 | 6048 | ||
| 5964 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); |
| 5965 | /* Ensures that capacity won't end up being negative */ | ||
| 5966 | available = 0; | ||
| 5967 | } else { | ||
| 5968 | available = total - avg; | ||
| 5969 | } | ||
| 5970 | 6050 | ||
| 5971 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) |
| 5972 | total = SCHED_CAPACITY_SCALE; | 6052 | return SCHED_CAPACITY_SCALE - used; |
| 5973 | 6053 | ||
| 5974 | total >>= SCHED_CAPACITY_SHIFT; | 6054 | return 1; |
| 5975 | |||
| 5976 | return div_u64(available, total); | ||
| 5977 | } | 6055 | } |
| 5978 | 6056 | ||
| 5979 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
| @@ -5988,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5988 | 6066 | ||
| 5989 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; |
| 5990 | 6068 | ||
| 5991 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
| 5992 | |||
| 5993 | if (sched_feat(ARCH_CAPACITY)) | ||
| 5994 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
| 5995 | else | ||
| 5996 | capacity *= default_scale_capacity(sd, cpu); | ||
| 5997 | |||
| 5998 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
| 5999 | 6070 | ||
| 6000 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); |
| 6001 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; |
| @@ -6011,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6011 | { | 6082 | { |
| 6012 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; |
| 6013 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; |
| 6014 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; |
| 6015 | unsigned long interval; | 6086 | unsigned long interval; |
| 6016 | 6087 | ||
| 6017 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -6023,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6023 | return; | 6094 | return; |
| 6024 | } | 6095 | } |
| 6025 | 6096 | ||
| 6026 | capacity_orig = capacity = 0; | 6097 | capacity = 0; |
| 6027 | 6098 | ||
| 6028 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { |
| 6029 | /* | 6100 | /* |
| @@ -6043,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6043 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains |
| 6044 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). |
| 6045 | * | 6116 | * |
| 6046 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and |
| 6047 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. |
| 6048 | * | ||
| 6049 | * Runtime updates will correct capacity_orig. | ||
| 6050 | */ | 6119 | */ |
| 6051 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { |
| 6052 | capacity_orig += capacity_of(cpu); | ||
| 6053 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); |
| 6054 | continue; | 6122 | continue; |
| 6055 | } | 6123 | } |
| 6056 | 6124 | ||
| 6057 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; |
| 6058 | capacity_orig += sgc->capacity_orig; | ||
| 6059 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; |
| 6060 | } | 6127 | } |
| 6061 | } else { | 6128 | } else { |
| @@ -6066,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6066 | 6133 | ||
| 6067 | group = child->groups; | 6134 | group = child->groups; |
| 6068 | do { | 6135 | do { |
| 6069 | capacity_orig += group->sgc->capacity_orig; | ||
| 6070 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; |
| 6071 | group = group->next; | 6137 | group = group->next; |
| 6072 | } while (group != child->groups); | 6138 | } while (group != child->groups); |
| 6073 | } | 6139 | } |
| 6074 | 6140 | ||
| 6075 | sdg->sgc->capacity_orig = capacity_orig; | ||
| 6076 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; |
| 6077 | } | 6142 | } |
| 6078 | 6143 | ||
| 6079 | /* | 6144 | /* |
| 6080 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side |
| 6081 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. |
| 6082 | * which on its own isn't powerful enough. | 6147 | * Return true is the capacity is reduced |
| 6083 | * | ||
| 6084 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
| 6085 | */ | 6148 | */ |
| 6086 | static inline int | 6149 | static inline int |
| 6087 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
| 6088 | { | 6151 | { |
| 6089 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < |
| 6090 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); |
| 6091 | */ | ||
| 6092 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
| 6093 | return 0; | ||
| 6094 | |||
| 6095 | /* | ||
| 6096 | * If ~90% of the cpu_capacity is still there, we're good. | ||
| 6097 | */ | ||
| 6098 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
| 6099 | return 1; | ||
| 6100 | |||
| 6101 | return 0; | ||
| 6102 | } | 6154 | } |
| 6103 | 6155 | ||
| 6104 | /* | 6156 | /* |
| @@ -6136,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
| 6136 | } | 6188 | } |
| 6137 | 6189 | ||
| 6138 | /* | 6190 | /* |
| 6139 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could |
| 6140 | * | 6192 | * be used by some tasks. |
| 6141 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the * number of task is |
| 6142 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available |
| 6143 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. |
| 6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
| 6197 | * account the variance of the tasks' load and to return true if the available | ||
| 6198 | * capacity in meaningful for the load balancer. | ||
| 6199 | * As an example, an available capacity of 1% can appear but it doesn't make | ||
| 6200 | * any benefit for the load balance. | ||
| 6144 | */ | 6201 | */ |
| 6145 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool |
| 6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6146 | { | 6204 | { |
| 6147 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) |
| 6148 | unsigned int capacity, capacity_orig; | 6206 | return true; |
| 6149 | 6207 | ||
| 6150 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > |
| 6151 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) |
| 6152 | cpus = group->group_weight; | 6210 | return true; |
| 6153 | 6211 | ||
| 6154 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6212 | return false; |
| 6155 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6213 | } |
| 6156 | capacity_factor = cpus / smt; /* cores */ | 6214 | |
| 6215 | /* | ||
| 6216 | * group_is_overloaded returns true if the group has more tasks than it can | ||
| 6217 | * handle. | ||
| 6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
| 6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
| 6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
| 6221 | * false. | ||
| 6222 | */ | ||
| 6223 | static inline bool | ||
| 6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6225 | { | ||
| 6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
| 6227 | return false; | ||
| 6157 | 6228 | ||
| 6158 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < |
| 6159 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) |
| 6160 | if (!capacity_factor) | 6231 | return true; |
| 6161 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6162 | 6232 | ||
| 6163 | return capacity_factor; | 6233 | return false; |
| 6164 | } | 6234 | } |
| 6165 | 6235 | ||
| 6166 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, |
| 6167 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, |
| 6238 | struct sg_lb_stats *sgs) | ||
| 6168 | { | 6239 | { |
| 6169 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) |
| 6170 | return group_overloaded; | 6241 | return group_overloaded; |
| 6171 | 6242 | ||
| 6172 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) |
| @@ -6204,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6204 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); |
| 6205 | 6276 | ||
| 6206 | sgs->group_load += load; | 6277 | sgs->group_load += load; |
| 6278 | sgs->group_usage += get_cpu_usage(i); | ||
| 6207 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 6208 | 6280 | ||
| 6209 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) |
| @@ -6226,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6226 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 6227 | 6299 | ||
| 6228 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; |
| 6229 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
| 6230 | sgs->group_type = group_classify(group, sgs); | ||
| 6231 | 6301 | ||
| 6232 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
| 6233 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); |
| 6234 | } | 6304 | } |
| 6235 | 6305 | ||
| 6236 | /** | 6306 | /** |
| @@ -6352,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6352 | 6422 | ||
| 6353 | /* | 6423 | /* |
| 6354 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings |
| 6355 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try |
| 6356 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity |
| 6357 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit |
| 6358 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where |
| 6359 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already |
| 6360 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible with a large weight task outweighs |
| 6361 | * with a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). |
| 6362 | */ | 6432 | */ |
| 6363 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && |
| 6364 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && |
| 6365 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { |
| 6366 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; |
| 6437 | sgs->group_type = group_overloaded; | ||
| 6367 | } | 6438 | } |
| 6368 | 6439 | ||
| 6369 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| @@ -6543,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6543 | */ | 6614 | */ |
| 6544 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && |
| 6545 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { |
| 6546 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * |
| 6547 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; |
| 6548 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
| 6549 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; |
| 6550 | load_above_capacity /= busiest->group_capacity; | 6621 | else |
| 6622 | load_above_capacity = ~0UL; | ||
| 6551 | } | 6623 | } |
| 6552 | 6624 | ||
| 6553 | /* | 6625 | /* |
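The rewritten `calculate_imbalance()` branch works directly in load units: each runnable task on the busiest group counts as `SCHED_LOAD_SCALE`, the group's capacity is subtracted, and when the capacity already covers that nominal load the value saturates to `~0UL`, effectively taking this term out of the later minimum (which is outside this hunk). A worked example, assuming the usual `SCHED_LOAD_SCALE` of 1024, is sketched below.

```c
/* Worked example of the new load_above_capacity computation.
 * SCHED_LOAD_SCALE is assumed to be 1024, as in the stock kernel config. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long load_above_capacity(unsigned int sum_nr_running,
					 unsigned long group_capacity)
{
	unsigned long load = sum_nr_running * SCHED_LOAD_SCALE;

	if (load > group_capacity)
		return load - group_capacity;	/* excess nominal load */
	return ~0UL;				/* sentinel: nothing above capacity */
}

int main(void)
{
	/* 5 runnable tasks on a group whose capacity is ~3 full CPUs
	 * (3 * 1024 = 3072): 5 * 1024 - 3072 = 2048 units above capacity. */
	printf("%lu\n", load_above_capacity(5, 3072));
	return 0;
}
```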
| @@ -6610,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6610 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; |
| 6611 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; |
| 6612 | 6684 | ||
| 6685 | /* ASYM feature bypasses nice load balance check */ | ||
| 6613 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
| 6614 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) |
| 6615 | return sds.busiest; | 6688 | return sds.busiest; |
| @@ -6630,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6630 | goto force_balance; | 6703 | goto force_balance; |
| 6631 | 6704 | ||
| 6632 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| 6633 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && |
| 6634 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) |
| 6635 | goto force_balance; | 6708 | goto force_balance; |
| 6636 | 6709 | ||
| 6637 | /* | 6710 | /* |
| @@ -6690,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6690 | int i; | 6763 | int i; |
| 6691 | 6764 | ||
| 6692 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 6693 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; |
| 6694 | enum fbq_type rt; | 6767 | enum fbq_type rt; |
| 6695 | 6768 | ||
| 6696 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); |
| @@ -6719,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6719 | continue; | 6792 | continue; |
| 6720 | 6793 | ||
| 6721 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); |
| 6722 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
| 6723 | if (!capacity_factor) | ||
| 6724 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6725 | 6795 | ||
| 6726 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); |
| 6727 | 6797 | ||
| @@ -6729,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6729 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() |
| 6730 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. |
| 6731 | */ | 6801 | */ |
| 6732 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
| 6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
| 6804 | !check_cpu_capacity(rq, env->sd)) | ||
| 6733 | continue; | 6805 | continue; |
| 6734 | 6806 | ||
| 6735 | /* | 6807 | /* |
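The old `capacity_factor` screen on single-task CPUs is gone; a lone task whose load exceeds the imbalance is now still considered if `check_cpu_capacity()` reports the CPU's usable capacity as significantly reduced by RT tasks or IRQ time. That helper is not shown in this hunk; the sketch below is an assumed model of such a check, comparing remaining CFS capacity against the CPU's original capacity with the domain's `imbalance_pct` margin.

```c
/* Illustrative model of a "capacity significantly reduced?" check.
 * Field names and the exact formula are assumptions for illustration. */
#include <stdbool.h>
#include <stdio.h>

struct cpu_caps {
	unsigned long capacity;	     /* capacity left for CFS tasks */
	unsigned long capacity_orig; /* full capacity of the CPU    */
};

static bool cpu_capacity_reduced(const struct cpu_caps *c, unsigned int imbalance_pct)
{
	/* True when the original capacity exceeds the remaining capacity by
	 * more than the imbalance_pct margin, e.g. RT tasks or IRQs ate into it. */
	return c->capacity_orig * 100 > c->capacity * imbalance_pct;
}

int main(void)
{
	struct cpu_caps c = { .capacity = 700, .capacity_orig = 1024 };

	/* 1024 * 100 = 102400 > 700 * 125 = 87500 -> capacity is reduced. */
	printf("reduced=%d\n", cpu_capacity_reduced(&c, 125));
	return 0;
}
```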
| @@ -6777,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
| 6777 | return 1; | 6849 | return 1; |
| 6778 | } | 6850 | } |
| 6779 | 6851 | ||
| 6852 | /* | ||
| 6853 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | ||
| 6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
| 6855 | * because of other sched_class or IRQs, provided more capacity stays | ||
| 6856 | * available on dst_cpu. | ||
| 6857 | */ | ||
| 6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
| 6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
| 6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
| 6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
| 6862 | return 1; | ||
| 6863 | } | ||
| 6864 | |||
| 6780 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
| 6781 | } | 6866 | } |
| 6782 | 6867 | ||
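The added `need_active_balance()` case triggers an active migration of a lone CFS task when the destination CPU is idle, the source CPU's capacity is reduced, and the destination offers noticeably more capacity; the comparison stays in integer arithmetic as `capacity_of(src) * imbalance_pct < capacity_of(dst) * 100`. A small numeric illustration follows, with `imbalance_pct` assumed to be 125, i.e. the destination must offer roughly 25% more capacity.

```c
/* Numeric illustration of the src/dst capacity comparison.
 * imbalance_pct = 125 is an assumption; the real value comes from the
 * sched_domain being balanced. */
#include <stdbool.h>
#include <stdio.h>

static bool worth_active_migration(unsigned long src_capacity,
				   unsigned long dst_capacity,
				   unsigned int imbalance_pct)
{
	return src_capacity * imbalance_pct < dst_capacity * 100;
}

int main(void)
{
	/* src squeezed to 600 by RT/IRQ time, dst has its full 1024:
	 * 600 * 125 = 75000 < 1024 * 100 = 102400 -> migrate. */
	printf("%d\n", worth_active_migration(600, 1024, 125));

	/* Nearly equal capacities do not justify an active migration:
	 * 1000 * 125 = 125000 >= 1024 * 100 = 102400 -> stay. */
	printf("%d\n", worth_active_migration(1000, 1024, 125));
	return 0;
}
```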
| @@ -6876,6 +6961,9 @@ redo: | |||
| 6876 | 6961 | ||
| 6877 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
| 6878 | 6963 | ||
| 6964 | env.src_cpu = busiest->cpu; | ||
| 6965 | env.src_rq = busiest; | ||
| 6966 | |||
| 6879 | ld_moved = 0; | 6967 | ld_moved = 0; |
| 6880 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { |
| 6881 | /* | 6969 | /* |
| @@ -6885,8 +6973,6 @@ redo: | |||
| 6885 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. |
| 6886 | */ | 6974 | */ |
| 6887 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; |
| 6888 | env.src_cpu = busiest->cpu; | ||
| 6889 | env.src_rq = busiest; | ||
| 6890 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
| 6891 | 6977 | ||
| 6892 | more_balance: | 6978 | more_balance: |
| @@ -7586,22 +7672,25 @@ end: | |||
| 7586 | 7672 | ||
| 7587 | /* | 7673 | /* |
| 7588 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence |
| 7589 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. |
| 7590 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. |
| 7591 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is |
| 7592 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. |
| 7679 | * - At parent of LLC scheduler domain level, this cpu's scheduler group has | ||
| 7680 | * multiple busy cpus. | ||
| 7593 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
| 7594 | * domain span are idle. | 7682 | * domain span are idle. |
| 7595 | */ | 7683 | */ |
| 7596 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) |
| 7597 | { | 7685 | { |
| 7598 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; |
| 7599 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; |
| 7600 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; |
| 7601 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; |
| 7690 | bool kick = false; | ||
| 7602 | 7691 | ||
| 7603 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) |
| 7604 | return 0; | 7693 | return false; |
| 7605 | 7694 | ||
| 7606 | /* | 7695 | /* |
| 7607 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first |
| @@ -7615,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
| 7615 | * balancing. | 7704 | * balancing. |
| 7616 | */ | 7705 | */ |
| 7617 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) |
| 7618 | return 0; | 7707 | return false; |
| 7619 | 7708 | ||
| 7620 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) |
| 7621 | return 0; | 7710 | return false; |
| 7622 | 7711 | ||
| 7623 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) |
| 7624 | goto need_kick; | 7713 | return true; |
| 7625 | 7714 | ||
| 7626 | rcu_read_lock(); | 7715 | rcu_read_lock(); |
| 7627 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 7628 | |||
| 7629 | if (sd) { | 7717 | if (sd) { |
| 7630 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; |
| 7631 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
| 7632 | 7720 | ||
| 7633 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { |
| 7634 | goto need_kick_unlock; | 7722 | kick = true; |
| 7723 | goto unlock; | ||
| 7724 | } | ||
| 7725 | |||
| 7635 | } | 7726 | } |
| 7636 | 7727 | ||
| 7637 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); |
| 7729 | if (sd) { | ||
| 7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
| 7731 | check_cpu_capacity(rq, sd)) { | ||
| 7732 | kick = true; | ||
| 7733 | goto unlock; | ||
| 7734 | } | ||
| 7735 | } | ||
| 7638 | 7736 | ||
| 7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 7639 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, |
| 7640 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { |
| 7641 | goto need_kick_unlock; | 7740 | kick = true; |
| 7642 | 7741 | goto unlock; | |
| 7643 | rcu_read_unlock(); | 7742 | } |
| 7644 | return 0; | ||
| 7645 | 7743 | ||
| 7646 | need_kick_unlock: | 7744 | unlock: |
| 7647 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); |
| 7648 | need_kick: | 7746 | return kick; |
| 7649 | return 1; | ||
| 7650 | } | 7747 | } |
| 7651 | #else | 7748 | #else |
| 7652 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
| @@ -7662,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 7662 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? |
| 7663 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; |
| 7664 | 7761 | ||
| 7665 | rebalance_domains(this_rq, idle); | ||
| 7666 | |||
| 7667 | /* | 7762 | /* |
| 7668 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the |
| 7669 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are |
| 7670 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
| 7766 | * give the idle cpus a chance to load balance. Else we may | ||
| 7767 | * load balance only within the local sched_domain hierarchy | ||
| 7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
| 7671 | */ | 7769 | */ |
| 7672 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); |
| 7771 | rebalance_domains(this_rq, idle); | ||
| 7673 | } | 7772 | } |
| 7674 | 7773 | ||
| 7675 | /* | 7774 | /* |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
| 56 | */ | 56 | */ |
| 57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) |
| 58 | 58 | ||
| 59 | #ifdef HAVE_RT_PUSH_IPI | ||
| 60 | /* | ||
| 61 | * When many CPUs lower their priorities at the same time while a | ||
| 62 | * single CPU has an RT task that can migrate and is waiting to | ||
| 63 | * run, they will all try to take that CPU's rq lock and may | ||
| 64 | * create heavy contention (a thundering herd). Sending an IPI | ||
| 65 | * to that CPU and letting it push the RT task to where it | ||
| 66 | * should go may be the better approach. | ||
| 67 | * | ||
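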
| 68 | */ | ||
| 69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
| 70 | #endif | ||
| 71 | |||
| 59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
| 60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
| 61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) |
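Because `RT_PUSH_IPI` is a scheduler feature bit, kernels built with `CONFIG_SCHED_DEBUG` can flip it at runtime through the usual `sched_features` debugfs file, where the `NO_` prefix clears a feature. A minimal sketch of such a toggle is below; the debugfs mount point is assumed to be `/sys/kernel/debug`.

```c
/* Minimal sketch: disable the RT_PUSH_IPI scheduler feature at runtime.
 * Assumes CONFIG_SCHED_DEBUG and debugfs mounted at /sys/kernel/debug. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/sched_features", "w");

	if (!f) {
		perror("sched_features");
		return 1;
	}
	/* The NO_ prefix clears the bit; writing "RT_PUSH_IPI" would set it again. */
	fputs("NO_RT_PUSH_IPI", f);
	fclose(f);
	return 0;
}
```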
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include "sched.h" | 6 | #include "sched.h" |
| 7 | 7 | ||
| 8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
| 9 | #include <linux/irq_work.h> | ||
| 9 | 10 | ||
| 10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
| 11 | 12 | ||
| @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
| 60 | } | 61 | } |
| 61 | 62 | ||
| 62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP |
| 64 | static void push_irq_work_func(struct irq_work *work); | ||
| 65 | #endif | ||
| 66 | |||
| 67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
| 63 | { | 68 | { |
| 64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; |
| 65 | int i; | 70 | int i; |
| @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; |
| 79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; |
| 80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); |
| 86 | |||
| 87 | #ifdef HAVE_RT_PUSH_IPI | ||
| 88 | rt_rq->push_flags = 0; | ||
| 89 | rt_rq->push_cpu = nr_cpu_ids; | ||
| 90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
| 91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
| 81 | #endif | 92 | #endif |
| 93 | #endif /* CONFIG_SMP */ | ||
| 82 | /* We start in dequeued state, because no RT tasks are queued */ | 94 |
| 83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; |
| 84 | 96 | ||
| @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 193 | if (!rt_se) | 205 | if (!rt_se) |
| 194 | goto err_free_rq; | 206 | goto err_free_rq; |
| 195 | 207 | ||
| 196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); |
| 197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
| 198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
| 199 | } | 211 | } |
| @@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
| 1778 | ; | 1790 | ; |
| 1779 | } | 1791 | } |
| 1780 | 1792 | ||
| 1793 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1794 | /* | ||
| 1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
| 1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
| 1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
| 1798 | * is complete. | ||
| 1799 | * | ||
| 1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
| 1801 | * or if this is the first instance, it must hold rq->cpu. | ||
| 1802 | */ | ||
| 1803 | static int rto_next_cpu(struct rq *rq) | ||
| 1804 | { | ||
| 1805 | int prev_cpu = rq->rt.push_cpu; | ||
| 1806 | int cpu; | ||
| 1807 | |||
| 1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
| 1812 | * passed the end of the mask, and has started from the beginning. | ||
| 1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
| 1814 | */ | ||
| 1815 | if (prev_cpu < rq->cpu) { | ||
| 1816 | if (cpu >= rq->cpu) | ||
| 1817 | return nr_cpu_ids; | ||
| 1818 | |||
| 1819 | } else if (cpu >= nr_cpu_ids) { | ||
| 1820 | /* | ||
| 1821 | * We passed the end of the mask, start at the beginning. | ||
| 1822 | * If the result is greater or equal to the rq's CPU, then | ||
| 1823 | * the loop is finished. | ||
| 1824 | */ | ||
| 1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
| 1826 | if (cpu >= rq->cpu) | ||
| 1827 | return nr_cpu_ids; | ||
| 1828 | } | ||
| 1829 | rq->rt.push_cpu = cpu; | ||
| 1830 | |||
| 1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
| 1832 | return cpu; | ||
| 1833 | } | ||
| 1834 | |||
| 1835 | static int find_next_push_cpu(struct rq *rq) | ||
| 1836 | { | ||
| 1837 | struct rq *next_rq; | ||
| 1838 | int cpu; | ||
| 1839 | |||
| 1840 | while (1) { | ||
| 1841 | cpu = rto_next_cpu(rq); | ||
| 1842 | if (cpu >= nr_cpu_ids) | ||
| 1843 | break; | ||
| 1844 | next_rq = cpu_rq(cpu); | ||
| 1845 | |||
| 1846 | /* Make sure the next rq can push to this rq */ | ||
| 1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
| 1848 | break; | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | return cpu; | ||
| 1852 | } | ||
| 1853 | |||
| 1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
| 1855 | #define RT_PUSH_IPI_RESTART 2 | ||
| 1856 | |||
| 1857 | static void tell_cpu_to_push(struct rq *rq) | ||
| 1858 | { | ||
| 1859 | int cpu; | ||
| 1860 | |||
| 1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1862 | raw_spin_lock(&rq->rt.push_lock); | ||
| 1863 | /* Make sure it's still executing */ | ||
| 1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1865 | /* | ||
| 1866 | * Tell the IPI to restart the loop as things have | ||
| 1867 | * changed since it started. | ||
| 1868 | */ | ||
| 1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
| 1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1871 | return; | ||
| 1872 | } | ||
| 1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1874 | } | ||
| 1875 | |||
| 1876 | /* When here, there's no IPI going around */ | ||
| 1877 | |||
| 1878 | rq->rt.push_cpu = rq->cpu; | ||
| 1879 | cpu = find_next_push_cpu(rq); | ||
| 1880 | if (cpu >= nr_cpu_ids) | ||
| 1881 | return; | ||
| 1882 | |||
| 1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
| 1884 | |||
| 1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
| 1886 | } | ||
| 1887 | |||
| 1888 | /* Called from hardirq context */ | ||
| 1889 | static void try_to_push_tasks(void *arg) | ||
| 1890 | { | ||
| 1891 | struct rt_rq *rt_rq = arg; | ||
| 1892 | struct rq *rq, *src_rq; | ||
| 1893 | int this_cpu; | ||
| 1894 | int cpu; | ||
| 1895 | |||
| 1896 | this_cpu = rt_rq->push_cpu; | ||
| 1897 | |||
| 1898 | /* Paranoid check */ | ||
| 1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
| 1900 | |||
| 1901 | rq = cpu_rq(this_cpu); | ||
| 1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
| 1903 | |||
| 1904 | again: | ||
| 1905 | if (has_pushable_tasks(rq)) { | ||
| 1906 | raw_spin_lock(&rq->lock); | ||
| 1907 | push_rt_task(rq); | ||
| 1908 | raw_spin_unlock(&rq->lock); | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
| 1912 | raw_spin_lock(&rt_rq->push_lock); | ||
| 1913 | /* | ||
| 1914 | * If the source queue changed since the IPI went out, | ||
| 1915 | * we need to restart the search from that CPU again. | ||
| 1916 | */ | ||
| 1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
| 1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
| 1919 | rt_rq->push_cpu = src_rq->cpu; | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | cpu = find_next_push_cpu(src_rq); | ||
| 1923 | |||
| 1924 | if (cpu >= nr_cpu_ids) | ||
| 1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
| 1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
| 1927 | |||
| 1928 | if (cpu >= nr_cpu_ids) | ||
| 1929 | return; | ||
| 1930 | |||
| 1931 | /* | ||
| 1932 | * It is possible that a restart caused this CPU to be | ||
| 1933 | * chosen again. Don't bother with an IPI, just see if we | ||
| 1934 | * have more to push. | ||
| 1935 | */ | ||
| 1936 | if (unlikely(cpu == rq->cpu)) | ||
| 1937 | goto again; | ||
| 1938 | |||
| 1939 | /* Try the next RT overloaded CPU */ | ||
| 1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | static void push_irq_work_func(struct irq_work *work) | ||
| 1944 | { | ||
| 1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
| 1946 | |||
| 1947 | try_to_push_tasks(rt_rq); | ||
| 1948 | } | ||
| 1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
| 1950 | |||
| 1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) |
| 1782 | { | 1952 | { |
| 1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; |
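`rto_next_cpu()` above walks the root domain's `rto_mask` as a ring: the scan starts just after the requesting CPU, wraps past the end of the mask, and stops once it would reach or pass that CPU again, which is what guarantees each IPI chain visits every other RT-overloaded CPU at most once. The standalone sketch below models that wrap-around walk over a plain bitmask; the array-based mask and helper names are illustrative stand-ins, not kernel API.

```c
/* Userspace model of the ring-style walk over an "RT overloaded" CPU mask.
 * A plain bool array stands in for the kernel's rto_mask cpumask. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Next set bit strictly after 'prev', or NR_CPUS if none. */
static int next_set_cpu(const bool *mask, int prev)
{
	for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
		if (mask[cpu])
			return cpu;
	return NR_CPUS;
}

/*
 * Ring walk mirroring the rto_next_cpu() rules: start from 'cursor'
 * (initially the requesting CPU), wrap once past the end of the mask,
 * and stop as soon as the walk would come back around to 'this_cpu'.
 */
static int ring_next(const bool *mask, int this_cpu, int cursor)
{
	int cpu = next_set_cpu(mask, cursor);

	if (cursor < this_cpu)		/* already wrapped on a previous step */
		return cpu >= this_cpu ? NR_CPUS : cpu;

	if (cpu >= NR_CPUS) {		/* hit the end: wrap to the beginning */
		cpu = next_set_cpu(mask, -1);
		return cpu >= this_cpu ? NR_CPUS : cpu;
	}
	return cpu;
}

int main(void)
{
	bool rto[NR_CPUS] = { [1] = true, [2] = true, [6] = true };
	int this_cpu = 3, cursor = this_cpu;

	/* Expected visiting order starting after CPU 3: 6, then 1, then 2. */
	while ((cursor = ring_next(rto, this_cpu, cursor)) < NR_CPUS)
		printf("visit cpu %d\n", cursor);
	return 0;
}
```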
| @@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1793 | */ | 1963 | */ |
| 1794 | smp_rmb(); | 1964 | smp_rmb(); |
| 1795 | 1965 | ||
| 1966 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
| 1968 | tell_cpu_to_push(this_rq); | ||
| 1969 | return 0; | ||
| 1970 | } | ||
| 1971 | #endif | ||
| 1972 | |||
| 1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
| 1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) |
| 1798 | continue; | 1975 | continue; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
| 7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
| 8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
| 9 | #include <linux/irq_work.h> | ||
| 9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> |
| 10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
| 11 | 12 | ||
| @@ -362,8 +363,14 @@ struct cfs_rq { | |||
| 362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
| 363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in |
| 364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). |
| 366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
| 367 | * sched_entities on the rq. | ||
| 368 | * blocked_load_avg is similar to runnable_load_avg except that it | ||
| 369 | * is summed over the blocked sched_entities on the rq. | ||
| 370 | * utilization_load_avg is the sum of the average running time of the | ||
| 371 | * sched_entities on the rq. | ||
| 365 | */ | 372 | */ |
| 366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; |
| 367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; |
| 368 | u64 last_decay; | 375 | u64 last_decay; |
| 369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; |
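With `utilization_load_avg` added next to `runnable_load_avg` and `blocked_load_avg`, a cfs_rq now aggregates two different per-entity numbers: a weight-scaled load contribution and a running-time-based utilization contribution. The sketch below is only a toy aggregation showing the distinction between the two sums; the per-entity contribution values are made up, and in the kernel they come from the decayed per-entity averages, which are outside this hunk.

```c
/* Toy aggregation distinguishing load from utilization on a runqueue.
 * The struct and its values are illustrative stand-ins only. */
#include <stdio.h>

struct entity {
	unsigned long load_contrib;	/* weight-scaled runnable load  */
	unsigned long util_contrib;	/* time actually spent running  */
	int queued;			/* on the rq (vs. blocked)      */
};

int main(void)
{
	struct entity se[] = {
		{ .load_contrib = 2048, .util_contrib = 300, .queued = 1 },
		{ .load_contrib = 1024, .util_contrib = 450, .queued = 1 },
		{ .load_contrib =  512, .util_contrib = 100, .queued = 0 },
	};
	unsigned long runnable_load_avg = 0, blocked_load_avg = 0;
	unsigned long utilization_load_avg = 0;

	for (unsigned int i = 0; i < sizeof(se) / sizeof(se[0]); i++) {
		if (se[i].queued) {
			runnable_load_avg += se[i].load_contrib;
			utilization_load_avg += se[i].util_contrib;
		} else {
			blocked_load_avg += se[i].load_contrib;
		}
	}
	printf("runnable=%lu blocked=%lu utilization=%lu\n",
	       runnable_load_avg, blocked_load_avg, utilization_load_avg);
	return 0;
}
```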
| @@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
| 418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; |
| 419 | } | 426 | } |
| 420 | 427 | ||
| 428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
| 429 | #ifdef CONFIG_IRQ_WORK | ||
| 430 | # define HAVE_RT_PUSH_IPI | ||
| 431 | #endif | ||
| 432 | |||
| 421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ |
| 422 | struct rt_rq { | 434 | struct rt_rq { |
| 423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; |
| @@ -435,7 +447,13 @@ struct rt_rq { | |||
| 435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; |
| 436 | int overloaded; | 448 | int overloaded; |
| 437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; |
| 450 | #ifdef HAVE_RT_PUSH_IPI | ||
| 451 | int push_flags; | ||
| 452 | int push_cpu; | ||
| 453 | struct irq_work push_work; | ||
| 454 | raw_spinlock_t push_lock; | ||
| 438 | #endif | 455 | #endif |
| 456 | #endif /* CONFIG_SMP */ | ||
| 439 | int rt_queued; | 457 | int rt_queued; |
| 440 | 458 | ||
| 441 | int rt_throttled; | 459 | int rt_throttled; |
| @@ -597,6 +615,7 @@ struct rq { | |||
| 597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; |
| 598 | 616 | ||
| 599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; |
| 618 | unsigned long cpu_capacity_orig; | ||
| 600 | 619 | ||
| 601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; |
| 602 | /* For active balancing */ | 621 | /* For active balancing */ |
| @@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
| 807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
| 808 | * for a single CPU. | 827 | * for a single CPU. |
| 809 | */ | 828 | */ |
| 810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; |
| 811 | unsigned long next_update; | 830 | unsigned long next_update; |
| 812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
| 813 | /* | 832 | /* |
| @@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
| 1368 | 1387 | ||
| 1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
| 1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); |
| 1390 | |||
| 1391 | #ifndef arch_scale_freq_capacity | ||
| 1392 | static __always_inline | ||
| 1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 1394 | { | ||
| 1395 | return SCHED_CAPACITY_SCALE; | ||
| 1396 | } | ||
| 1397 | #endif | ||
| 1398 | |||
| 1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
| 1372 | { | 1400 | { |
| 1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
| 1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); |
| 1375 | } | 1403 | } |
| 1376 | #else | 1404 | #else |
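The default `arch_scale_freq_capacity()` returns `SCHED_CAPACITY_SCALE`, so architectures that do not override it see the old behaviour of `sched_rt_avg_update()`; an override that returns, say, half the scale at half the current frequency makes the accumulated RT time frequency-invariant. A numeric sketch follows; the normalization by `SCHED_CAPACITY_SCALE` is folded into the example only to express the result in "full-speed" time, whereas in the kernel it happens where `rt_avg` is consumed.

```c
/* Numeric sketch of frequency-invariant RT time accounting.
 * SCHED_CAPACITY_SCALE = 1024 as in the stock kernel; the final division
 * by the scale is done here only to make the units obvious. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

/* Stand-in for an arch override: capacity proportional to current freq. */
static unsigned long long freq_capacity(unsigned int cur_khz, unsigned int max_khz)
{
	return (unsigned long long)cur_khz * SCHED_CAPACITY_SCALE / max_khz;
}

int main(void)
{
	unsigned long long rt_delta_ns = 2000000;	/* 2ms of RT execution */

	/* At half the maximum frequency the same wall-clock RT time only
	 * represents half as much work, i.e. ~1ms at full speed. */
	unsigned long long scaled = rt_delta_ns * freq_capacity(1000000, 2000000)
				    / SCHED_CAPACITY_SCALE;

	printf("%llu ns of full-speed-equivalent RT time\n", scaled);
	return 0;
}
```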
| @@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); |
| 1644 | 1672 | ||
| 1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); |
| 1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); |
| 1648 | 1676 | ||
| 1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); |
| 1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); |
