Diffstat (limited to 'kernel/sched')
 -rw-r--r--  kernel/sched/core.c      | 121
 -rw-r--r--  kernel/sched/deadline.c  |  77
 -rw-r--r--  kernel/sched/debug.c     |  12
 -rw-r--r--  kernel/sched/fair.c      | 437
 -rw-r--r--  kernel/sched/features.h  |  13
 -rw-r--r--  kernel/sched/idle.c      |  14
 -rw-r--r--  kernel/sched/rt.c        | 181
 -rw-r--r--  kernel/sched/sched.h     |  38
 8 files changed, 652 insertions, 241 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..f9123a82cbb6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
| @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; | |||
| 306 | */ | 306 | */ |
| 307 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; |
| 308 | 308 | ||
| 309 | /* cpus with isolated domains */ | ||
| 310 | cpumask_var_t cpu_isolated_map; | ||
| 311 | |||
| 309 | /* | 312 | /* |
| 310 | * this_rq_lock - lock this runqueue and disable interrupts. | 313 | * this_rq_lock - lock this runqueue and disable interrupts. |
| 311 | */ | 314 | */ |
| @@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) | |||
| 690 | bool sched_can_stop_tick(void) | 693 | bool sched_can_stop_tick(void) |
| 691 | { | 694 | { |
| 692 | /* | 695 | /* |
| 696 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
| 697 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
| 698 | */ | ||
| 699 | if (current->policy == SCHED_FIFO) | ||
| 700 | return true; | ||
| 701 | |||
| 702 | /* | ||
| 703 | * Round-robin realtime tasks time slice with other tasks at the same | ||
| 704 | * realtime priority. Is this task the only one at this priority? | ||
| 705 | */ | ||
| 706 | if (current->policy == SCHED_RR) { | ||
| 707 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
| 708 | |||
| 709 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 693 | * More than one running task need preemption. | 713 | * More than one running task need preemption. |
| 694 | * nr_running update is assumed to be visible | 714 | * nr_running update is assumed to be visible |
| 695 | * after IPI is sent from wakers. | 715 | * after IPI is sent from wakers. |
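Note on the hunk above (illustration, not part of the patch): a SCHED_FIFO task is never rotated among equal-priority peers, so the tick has nothing to do for it; a SCHED_RR task only needs the tick for time-slicing when another task is queued at the same priority, which the patch detects by checking whether its run_list node is the only entry on that priority queue (prev == next on a circular list that then holds just the list head and this node). A minimal userspace model of that test, with my own list type standing in for the kernel's struct list_head:

#include <assert.h>
#include <stdbool.h>

/* Minimal circular doubly linked list, modelled on the kernel's list_head. */
struct list_node {
    struct list_node *prev, *next;
};

static void list_init(struct list_node *head)
{
    head->prev = head->next = head;
}

static void list_add_tail(struct list_node *n, struct list_node *head)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

/*
 * Seen from an entry that is queued, prev == next exactly when the list
 * holds only the head and this entry: the task is alone at its priority,
 * so the RR tick has nothing to rotate and may be stopped.
 */
static bool only_task_at_prio(const struct list_node *entry)
{
    return entry->prev == entry->next;
}

int main(void)
{
    struct list_node queue, a, b;

    list_init(&queue);
    list_add_tail(&a, &queue);
    assert(only_task_at_prio(&a));      /* alone: tick can stop */

    list_add_tail(&b, &queue);
    assert(!only_task_at_prio(&a));     /* shares the slice: keep the tick */
    return 0;
}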
| @@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 996 | rq_clock_skip_update(rq, true); | 1016 | rq_clock_skip_update(rq, true); |
| 997 | } | 1017 | } |
| 998 | 1018 | ||
| 1019 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
| 1020 | |||
| 1021 | void register_task_migration_notifier(struct notifier_block *n) | ||
| 1022 | { | ||
| 1023 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
| 1024 | } | ||
| 1025 | |||
| 999 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP |
| 1000 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1027 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 1001 | { | 1028 | { |
| @@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1026 | trace_sched_migrate_task(p, new_cpu); | 1053 | trace_sched_migrate_task(p, new_cpu); |
| 1027 | 1054 | ||
| 1028 | if (task_cpu(p) != new_cpu) { | 1055 | if (task_cpu(p) != new_cpu) { |
| 1056 | struct task_migration_notifier tmn; | ||
| 1057 | |||
| 1029 | if (p->sched_class->migrate_task_rq) | 1058 | if (p->sched_class->migrate_task_rq) |
| 1030 | p->sched_class->migrate_task_rq(p, new_cpu); | 1059 | p->sched_class->migrate_task_rq(p, new_cpu); |
| 1031 | p->se.nr_migrations++; | 1060 | p->se.nr_migrations++; |
| 1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 1061 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
| 1062 | |||
| 1063 | tmn.task = p; | ||
| 1064 | tmn.from_cpu = task_cpu(p); | ||
| 1065 | tmn.to_cpu = new_cpu; | ||
| 1066 | |||
| 1067 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
| 1033 | } | 1068 | } |
| 1034 | 1069 | ||
| 1035 | __set_task_cpu(p, new_cpu); | 1070 | __set_task_cpu(p, new_cpu); |
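Aside (illustrative sketch, not from the patch): the new atomic notifier chain gives interested code a hook at migration time; set_task_cpu() fills a struct task_migration_notifier with the task and the source and destination CPUs, then walks the chain. The userspace sketch below only mirrors that publish/subscribe shape; migration_event and register_migration_cb are hypothetical names, not the kernel API:

#include <stdio.h>

/* Hypothetical stand-ins for struct task_migration_notifier and the chain. */
struct migration_event {
    int pid;
    int from_cpu;
    int to_cpu;
};

typedef void (*migration_cb)(const struct migration_event *ev);

#define MAX_SUBSCRIBERS 4
static migration_cb subscribers[MAX_SUBSCRIBERS];
static int nr_subscribers;

/* Plays the role of register_task_migration_notifier(). */
static void register_migration_cb(migration_cb cb)
{
    if (nr_subscribers < MAX_SUBSCRIBERS)
        subscribers[nr_subscribers++] = cb;
}

/* Plays the role of the atomic_notifier_call_chain() call in set_task_cpu(). */
static void notify_migration(int pid, int from_cpu, int to_cpu)
{
    struct migration_event ev = { pid, from_cpu, to_cpu };

    for (int i = 0; i < nr_subscribers; i++)
        subscribers[i](&ev);
}

static void print_migration(const struct migration_event *ev)
{
    printf("task %d moved from CPU%d to CPU%d\n",
           ev->pid, ev->from_cpu, ev->to_cpu);
}

int main(void)
{
    register_migration_cb(print_migration);
    notify_migration(1234, 0, 3);   /* would fire on a real migration */
    return 0;
}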
| @@ -2818,7 +2853,7 @@ asmlinkage __visible void __sched schedule_user(void) | |||
| 2818 | * we find a better solution. | 2853 | * we find a better solution. |
| 2819 | * | 2854 | * |
| 2820 | * NB: There are buggy callers of this function. Ideally we | 2855 | * NB: There are buggy callers of this function. Ideally we |
| 2821 | * should warn if prev_state != IN_USER, but that will trigger | 2856 | * should warn if prev_state != CONTEXT_USER, but that will trigger |
| 2822 | * too frequently to make sense yet. | 2857 | * too frequently to make sense yet. |
| 2823 | */ | 2858 | */ |
| 2824 | enum ctx_state prev_state = exception_enter(); | 2859 | enum ctx_state prev_state = exception_enter(); |
| @@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3034 | } else { | 3069 | } else { |
| 3035 | if (dl_prio(oldprio)) | 3070 | if (dl_prio(oldprio)) |
| 3036 | p->dl.dl_boosted = 0; | 3071 | p->dl.dl_boosted = 0; |
| 3072 | if (rt_prio(oldprio)) | ||
| 3073 | p->rt.timeout = 0; | ||
| 3037 | p->sched_class = &fair_sched_class; | 3074 | p->sched_class = &fair_sched_class; |
| 3038 | } | 3075 | } |
| 3039 | 3076 | ||
| @@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
| 5318 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5355 | static int sched_cpu_inactive(struct notifier_block *nfb, |
| 5319 | unsigned long action, void *hcpu) | 5356 | unsigned long action, void *hcpu) |
| 5320 | { | 5357 | { |
| 5321 | unsigned long flags; | ||
| 5322 | long cpu = (long)hcpu; | ||
| 5323 | struct dl_bw *dl_b; | ||
| 5324 | |||
| 5325 | switch (action & ~CPU_TASKS_FROZEN) { | 5358 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5326 | case CPU_DOWN_PREPARE: | 5359 | case CPU_DOWN_PREPARE: |
| 5327 | set_cpu_active(cpu, false); | 5360 | set_cpu_active((long)hcpu, false); |
| 5328 | |||
| 5329 | /* explicitly allow suspend */ | ||
| 5330 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 5331 | bool overflow; | ||
| 5332 | int cpus; | ||
| 5333 | |||
| 5334 | rcu_read_lock_sched(); | ||
| 5335 | dl_b = dl_bw_of(cpu); | ||
| 5336 | |||
| 5337 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5338 | cpus = dl_bw_cpus(cpu); | ||
| 5339 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 5340 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5341 | |||
| 5342 | rcu_read_unlock_sched(); | ||
| 5343 | |||
| 5344 | if (overflow) | ||
| 5345 | return notifier_from_errno(-EBUSY); | ||
| 5346 | } | ||
| 5347 | return NOTIFY_OK; | 5361 | return NOTIFY_OK; |
| 5362 | default: | ||
| 5363 | return NOTIFY_DONE; | ||
| 5348 | } | 5364 | } |
| 5349 | |||
| 5350 | return NOTIFY_DONE; | ||
| 5351 | } | 5365 | } |
| 5352 | 5366 | ||
| 5353 | static int __init migration_init(void) | 5367 | static int __init migration_init(void) |
| @@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5428 | break; | 5442 | break; |
| 5429 | } | 5443 | } |
| 5430 | 5444 | ||
| 5431 | /* | ||
| 5432 | * Even though we initialize ->capacity to something semi-sane, | ||
| 5433 | * we leave capacity_orig unset. This allows us to detect if | ||
| 5434 | * domain iteration is still funny without causing /0 traps. | ||
| 5435 | */ | ||
| 5436 | if (!group->sgc->capacity_orig) { | ||
| 5437 | printk(KERN_CONT "\n"); | ||
| 5438 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
| 5439 | break; | ||
| 5440 | } | ||
| 5441 | |||
| 5442 | if (!cpumask_weight(sched_group_cpus(group))) { | 5445 | if (!cpumask_weight(sched_group_cpus(group))) { |
| 5443 | printk(KERN_CONT "\n"); | 5446 | printk(KERN_CONT "\n"); |
| 5444 | printk(KERN_ERR "ERROR: empty group\n"); | 5447 | printk(KERN_ERR "ERROR: empty group\n"); |
| @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5811 | update_top_cache_domain(cpu); | 5814 | update_top_cache_domain(cpu); |
| 5812 | } | 5815 | } |
| 5813 | 5816 | ||
| 5814 | /* cpus with isolated domains */ | ||
| 5815 | static cpumask_var_t cpu_isolated_map; | ||
| 5816 | |||
| 5817 | /* Setup the mask of cpus configured for isolated domains */ | 5817 | /* Setup the mask of cpus configured for isolated domains */ |
| 5818 | static int __init isolated_cpu_setup(char *str) | 5818 | static int __init isolated_cpu_setup(char *str) |
| 5819 | { | 5819 | { |
| @@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5922 | * die on a /0 trap. | 5922 | * die on a /0 trap. |
| 5923 | */ | 5923 | */ |
| 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
| 5925 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
| 5926 | 5925 | ||
| 5927 | /* | 5926 | /* |
| 5928 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the |
| @@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6233 | */ | 6232 | */ |
| 6234 | 6233 | ||
| 6235 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
| 6235 | sd->flags |= SD_PREFER_SIBLING; | ||
| 6236 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; |
| 6237 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ |
| 6238 | 6238 | ||
| @@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 6998 | */ | 6998 | */ |
| 6999 | 6999 | ||
| 7000 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: |
| 7001 | case CPU_DOWN_FAILED: | ||
| 7002 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); |
| 7003 | break; | 7002 | break; |
| 7004 | default: | 7003 | default: |
| @@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 7010 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
| 7011 | void *hcpu) | 7010 | void *hcpu) |
| 7012 | { | 7011 | { |
| 7013 | switch (action) { | 7012 | unsigned long flags; |
| 7013 | long cpu = (long)hcpu; | ||
| 7014 | struct dl_bw *dl_b; | ||
| 7015 | |||
| 7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 7014 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: |
| 7018 | /* explicitly allow suspend */ | ||
| 7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 7020 | bool overflow; | ||
| 7021 | int cpus; | ||
| 7022 | |||
| 7023 | rcu_read_lock_sched(); | ||
| 7024 | dl_b = dl_bw_of(cpu); | ||
| 7025 | |||
| 7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 7027 | cpus = dl_bw_cpus(cpu); | ||
| 7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 7030 | |||
| 7031 | rcu_read_unlock_sched(); | ||
| 7032 | |||
| 7033 | if (overflow) | ||
| 7034 | return notifier_from_errno(-EBUSY); | ||
| 7035 | } | ||
| 7015 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); |
| 7016 | break; | 7037 | break; |
| 7017 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: |
| @@ -7156,8 +7177,8 @@ void __init sched_init(void) | |||
| 7156 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; |
| 7157 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; |
| 7158 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); |
| 7159 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); |
| 7160 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); |
| 7161 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7162 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
| 7163 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| @@ -7197,7 +7218,7 @@ void __init sched_init(void) | |||
| 7197 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP |
| 7198 | rq->sd = NULL; | 7219 | rq->sd = NULL; |
| 7199 | rq->rd = NULL; | 7220 | rq->rd = NULL; |
| 7200 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; |
| 7201 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; |
| 7202 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; |
| 7203 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; |
| @@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
| 7796 | } | 7817 | } |
| 7797 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7798 | 7819 | ||
| 7799 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) |
| 7800 | { | 7821 | { |
| 7801 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); |
| 7802 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); |
| @@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 7897 | if (ret) | 7918 | if (ret) |
| 7898 | goto undo; | 7919 | goto undo; |
| 7899 | 7920 | ||
| 7900 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); |
| 7901 | if (ret) | 7922 | if (ret) |
| 7902 | goto undo; | 7923 | goto undo; |
| 7903 | 7924 | ||
| 7904 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); |
| 7905 | if (ret) | 7926 | if (ret) |
| 7906 | goto undo; | 7927 | goto undo; |
| 7907 | 7928 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6d9403..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
| @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
| 69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) |
| 73 | { | 73 | { |
| 74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; |
| 75 | 75 | ||
| @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
| 218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
| 222 | |||
| 223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
| 224 | { | ||
| 225 | struct rq *later_rq = NULL; | ||
| 226 | bool fallback = false; | ||
| 227 | |||
| 228 | later_rq = find_lock_later_rq(p, rq); | ||
| 229 | |||
| 230 | if (!later_rq) { | ||
| 231 | int cpu; | ||
| 232 | |||
| 233 | /* | ||
| 234 | * If we cannot preempt any rq, fall back to pick any | ||
| 235 | * online cpu. | ||
| 236 | */ | ||
| 237 | fallback = true; | ||
| 238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
| 239 | if (cpu >= nr_cpu_ids) { | ||
| 240 | /* | ||
| 241 | * Fail to find any suitable cpu. | ||
| 242 | * The task will never come back! | ||
| 243 | */ | ||
| 244 | BUG_ON(dl_bandwidth_enabled()); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * If admission control is disabled we | ||
| 248 | * try a little harder to let the task | ||
| 249 | * run. | ||
| 250 | */ | ||
| 251 | cpu = cpumask_any(cpu_active_mask); | ||
| 252 | } | ||
| 253 | later_rq = cpu_rq(cpu); | ||
| 254 | double_lock_balance(rq, later_rq); | ||
| 255 | } | ||
| 256 | |||
| 257 | deactivate_task(rq, p, 0); | ||
| 258 | set_task_cpu(p, later_rq->cpu); | ||
| 259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
| 260 | |||
| 261 | if (!fallback) | ||
| 262 | resched_curr(later_rq); | ||
| 263 | |||
| 264 | double_unlock_balance(rq, later_rq); | ||
| 265 | } | ||
| 266 | |||
| 221 | #else | 267 | #else |
| 222 | 268 | ||
| 223 | static inline | 269 | static inline |
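For illustration (not part of the patch): dl_task_offline_migration() handles a replenishment timer firing for a task whose runqueue has gone offline. It first tries find_lock_later_rq() to find a runqueue it can preempt; failing that, it falls back to any active CPU in the task's affinity mask, and if even that is empty (which should not happen with admission control enabled, hence the BUG_ON) it picks any active CPU at all. A toy model of that fallback order over plain bitmasks; the helpers are mine, not the kernel's cpumask API:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical 8-CPU masks; the kernel uses cpumask_t and nr_cpu_ids. */
static int first_set(uint8_t mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        if (mask & (1u << cpu))
            return cpu;
    return NR_CPUS;     /* mirrors "cpu >= nr_cpu_ids": nothing found */
}

/*
 * Fallback order modelled on dl_task_offline_migration():
 *  1. any active CPU the task is allowed to run on,
 *  2. failing that, any active CPU at all (the "try a little harder"
 *     path that is only expected when admission control is disabled).
 */
static int pick_fallback_cpu(uint8_t active_mask, uint8_t allowed_mask)
{
    int cpu = first_set(active_mask & allowed_mask);

    if (cpu < NR_CPUS)
        return cpu;
    return first_set(active_mask);
}

int main(void)
{
    /* CPUs 2 and 5 online, task affined to CPUs 0-1 which are offline. */
    printf("fallback cpu: %d\n", pick_fallback_cpu(0x24, 0x03));  /* 2 */
    return 0;
}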
| @@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 514 | unsigned long flags; | 560 | unsigned long flags; |
| 515 | struct rq *rq; | 561 | struct rq *rq; |
| 516 | 562 | ||
| 517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); |
| 518 | 564 | ||
| 519 | /* | 565 | /* |
| 520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: |
| @@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 536 | sched_clock_tick(); | 582 | sched_clock_tick(); |
| 537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); |
| 538 | 584 | ||
| 585 | #ifdef CONFIG_SMP | ||
| 586 | /* | ||
| 587 | * If we find that the rq the task was on is no longer | ||
| 588 | * available, we need to select a new rq. | ||
| 589 | */ | ||
| 590 | if (unlikely(!rq->online)) { | ||
| 591 | dl_task_offline_migration(rq, p); | ||
| 592 | goto unlock; | ||
| 593 | } | ||
| 594 | #endif | ||
| 595 | |||
| 539 | /* | 596 | /* |
| 540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: |
| 541 | * | 598 | * |
| @@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 569 | push_dl_task(rq); | 626 | push_dl_task(rq); |
| 570 | #endif | 627 | #endif |
| 571 | unlock: | 628 | unlock: |
| 572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); |
| 573 | 630 | ||
| 574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; |
| 575 | } | 632 | } |
| @@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
| 914 | } | 971 | } |
| 915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); |
| 916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); |
| 974 | /* | ||
| 975 | * Tell update_rq_clock() that we've just updated, | ||
| 976 | * so we don't do microscopic update in schedule() | ||
| 977 | * and double the fastpath cost. | ||
| 978 | */ | ||
| 979 | rq_clock_skip_update(rq, true); | ||
| 917 | } | 980 | } |
| 918 | 981 | ||
| 919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP |
| @@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1659 | { | 1722 | { |
| 1660 | int check_resched = 1; | 1723 | int check_resched = 1; |
| 1661 | 1724 | ||
| 1662 | /* | ||
| 1663 | * If p is throttled, don't consider the possibility | ||
| 1664 | * of preempting rq->curr, the check will be done right | ||
| 1665 | * after its runtime will get replenished. | ||
| 1666 | */ | ||
| 1667 | if (unlikely(p->dl.dl_throttled)) | ||
| 1668 | return; | ||
| 1669 | |||
| 1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { |
| 1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP |
| 1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf858d25c..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
| @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 71 | if (!se) { | 71 | if (!se) { |
| 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; |
| 73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); |
| 74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); |
| 75 | return; | 75 | return; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 94 | P(se->load.weight); | 94 | P(se->load.weight); |
| 95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
| 96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); |
| 97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); |
| 98 | P(se->avg.avg_period); | ||
| 98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); |
| 100 | P(se->avg.utilization_avg_contrib); | ||
| 99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); |
| 100 | #endif | 102 | #endif |
| 101 | #undef PN | 103 | #undef PN |
| @@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); |
| 215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
| 216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); |
| 219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
| 220 | cfs_rq->utilization_load_avg); | ||
| 217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
| 219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); |
| @@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 636 | P(se.load.weight); | 640 | P(se.load.weight); |
| 637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP |
| 638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); |
| 639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); |
| 644 | P(se.avg.avg_period); | ||
| 640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); |
| 646 | P(se.avg.utilization_avg_contrib); | ||
| 641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); |
| 642 | #endif | 648 | #endif |
| 643 | P(policy); | 649 | P(policy); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
</diff>
| @@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
| 670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
| 671 | 671 | ||
| 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| 673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
| 673 | 674 | ||
| 674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ |
| 675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) |
| @@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
| 677 | u32 slice; | 678 | u32 slice; |
| 678 | 679 | ||
| 679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
| 680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; |
| 681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; |
| 682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); |
| 684 | __update_task_entity_utilization(&p->se); | ||
| 683 | } | 685 | } |
| 684 | #else | 686 | #else |
| 685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) |
| @@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
| 1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, |
| 1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) |
| 1198 | { | 1200 | { |
| 1199 | long imb, old_imb; | ||
| 1200 | long orig_src_load, orig_dst_load; | ||
| 1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; |
| 1202 | long orig_src_load; | ||
| 1203 | long load_a, load_b; | ||
| 1204 | long moved_load; | ||
| 1205 | long imb; | ||
| 1202 | 1206 | ||
| 1203 | /* | 1207 | /* |
| 1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. |
| @@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
| 1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; |
| 1212 | 1216 | ||
| 1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ |
| 1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; |
| 1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; |
| 1220 | if (load_a < load_b) | ||
| 1221 | swap(load_a, load_b); | ||
| 1216 | 1222 | ||
| 1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ |
| 1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - |
| 1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; |
| 1220 | if (imb <= 0) | 1226 | if (imb <= 0) |
| 1221 | return false; | 1227 | return false; |
| 1222 | 1228 | ||
| 1223 | /* | 1229 | /* |
| 1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. |
| 1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, |
| 1232 | * without moving things past the point of balance. | ||
| 1226 | */ | 1233 | */ |
| 1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; |
| 1228 | orig_dst_load = env->dst_stats.load; | ||
| 1229 | 1235 | ||
| 1230 | if (orig_dst_load < orig_src_load) | 1236 | /* |
| 1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, |
| 1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
| 1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. |
| 1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced |
| 1241 | * situation, without crossing over the balance point. | ||
| 1242 | */ | ||
| 1243 | moved_load = orig_src_load - src_load; | ||
| 1235 | 1244 | ||
| 1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) |
| 1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ |
| 1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
| 1248 | else | ||
| 1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
| 1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
| 1238 | } | 1251 | } |
| 1239 | 1252 | ||
| 1240 | /* | 1253 | /* |
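Worked example for the hunk above (not part of the patch): once the percentage threshold has flagged the proposed NUMA move as imbalanced, the rewritten tail no longer compares against the old imbalance. moved_load = orig_src_load - src_load gives the direction of the move, and the capacity-scaled comparison rejects it only if the side that gave up load would end up the lighter one, i.e. the move overshoots the balance point. A small standalone model with concrete numbers (function and variable names are mine):

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the rewritten load_too_imbalanced() tail.  Loads are the values
 * *after* the proposed move; orig_src_load is the source load before it.
 * Capacities scale the comparison so a bigger node may carry more load.
 */
static bool overshoots_balance(long src_load, long dst_load,
                               long orig_src_load,
                               long src_cap, long dst_cap)
{
    long moved_load = orig_src_load - src_load;

    if (moved_load > 0)     /* moving src -> dst */
        return src_load * dst_cap < dst_load * src_cap;
    else                    /* moving dst -> src */
        return dst_load * src_cap < src_load * dst_cap;
}

int main(void)
{
    long cap = 1024;    /* equal-capacity nodes, 60/20 load before the move */

    /* Moving 10 units: 50/30 afterwards, still tilted to src: allowed. */
    printf("%d\n", overshoots_balance(50, 30, 60, cap, cap));  /* 0 */

    /* Moving 30 units: 30/50 afterwards, now tilted to dst: rejected. */
    printf("%d\n", overshoots_balance(30, 50, 60, cap, cap));  /* 1 */
    return 0;
}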
| @@ -1609,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1609 | /* | 1622 | /* |
| 1610 | * If there were no record hinting faults then either the task is | 1623 | * If there were no record hinting faults then either the task is |
| 1611 | * completely idle or all activity is areas that are not of interest | 1624 | * completely idle or all activity is areas that are not of interest |
| 1612 | * to automatic numa balancing. Scan slower | 1625 | * to automatic numa balancing. Related to that, if there were failed |
| 1626 | * migration then it implies we are migrating too quickly or the local | ||
| 1627 | * node is overloaded. In either case, scan slower | ||
| 1613 | */ | 1628 | */ |
| 1614 | if (local + shared == 0) { | 1629 | if (local + shared == 0 || p->numa_faults_locality[2]) { |
| 1615 | p->numa_scan_period = min(p->numa_scan_period_max, | 1630 | p->numa_scan_period = min(p->numa_scan_period_max, |
| 1616 | p->numa_scan_period << 1); | 1631 | p->numa_scan_period << 1); |
| 1617 | 1632 | ||
| @@ -1673,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
| 1673 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; |
| 1674 | } else { | 1689 | } else { |
| 1675 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; |
| 1676 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; |
| 1677 | } | 1692 | } |
| 1678 | 1693 | ||
| 1679 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; |
| @@ -1763,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
| 1763 | } | 1778 | } |
| 1764 | } | 1779 | } |
| 1765 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ |
| 1781 | if (!max_faults) | ||
| 1782 | break; | ||
| 1766 | nodes = max_group; | 1783 | nodes = max_group; |
| 1767 | } | 1784 | } |
| 1768 | return nid; | 1785 | return nid; |
| @@ -2080,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 2080 | 2097 | ||
| 2081 | if (migrated) | 2098 | if (migrated) |
| 2082 | p->numa_pages_migrated += pages; | 2099 | p->numa_pages_migrated += pages; |
| 2100 | if (flags & TNF_MIGRATE_FAIL) | ||
| 2101 | p->numa_faults_locality[2] += pages; | ||
| 2083 | 2102 | ||
| 2084 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; | 2103 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; |
| 2085 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; | 2104 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; |
| @@ -2161,8 +2180,10 @@ void task_numa_work(struct callback_head *work) | |||
| 2161 | vma = mm->mmap; | 2180 | vma = mm->mmap; |
| 2162 | } | 2181 | } |
| 2163 | for (; vma; vma = vma->vm_next) { | 2182 | for (; vma; vma = vma->vm_next) { |
| 2164 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) | 2183 | if (!vma_migratable(vma) || !vma_policy_mof(vma) || |
| 2184 | is_vm_hugetlb_page(vma)) { | ||
| 2165 | continue; | 2185 | continue; |
| 2186 | } | ||
| 2166 | 2187 | ||
| 2167 | /* | 2188 | /* |
| 2168 | * Shared library pages mapped by multiple processes are not | 2189 | * Shared library pages mapped by multiple processes are not |
| @@ -2497,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
| 2497 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
| 2498 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
| 2499 | */ | 2520 | */ |
| 2500 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, |
| 2501 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, |
| 2502 | int runnable) | 2523 | int runnable, |
| 2524 | int running) | ||
| 2503 | { | 2525 | { |
| 2504 | u64 delta, periods; | 2526 | u64 delta, periods; |
| 2505 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; |
| 2506 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; |
| 2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
| 2507 | 2530 | ||
| 2508 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; |
| 2509 | /* | 2532 | /* |
| @@ -2525,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2525 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; |
| 2526 | 2549 | ||
| 2527 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ |
| 2528 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; |
| 2529 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { |
| 2530 | /* period roll-over */ | 2553 | /* period roll-over */ |
| 2531 | decayed = 1; | 2554 | decayed = 1; |
| @@ -2538,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2538 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; |
| 2539 | if (runnable) | 2562 | if (runnable) |
| 2540 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; |
| 2541 | sa->runnable_avg_period += delta_w; | 2564 | if (running) |
| 2565 | sa->running_avg_sum += delta_w * scale_freq | ||
| 2566 | >> SCHED_CAPACITY_SHIFT; | ||
| 2567 | sa->avg_period += delta_w; | ||
| 2542 | 2568 | ||
| 2543 | delta -= delta_w; | 2569 | delta -= delta_w; |
| 2544 | 2570 | ||
| @@ -2548,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2548 | 2574 | ||
| 2549 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, |
| 2550 | periods + 1); | 2576 | periods + 1); |
| 2551 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, |
| 2578 | periods + 1); | ||
| 2579 | sa->avg_period = decay_load(sa->avg_period, | ||
| 2552 | periods + 1); | 2580 | periods + 1); |
| 2553 | 2581 | ||
| 2554 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
| 2555 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); |
| 2556 | if (runnable) | 2584 | if (runnable) |
| 2557 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; |
| 2558 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) |
| 2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
| 2588 | >> SCHED_CAPACITY_SHIFT; | ||
| 2589 | sa->avg_period += runnable_contrib; | ||
| 2559 | } | 2590 | } |
| 2560 | 2591 | ||
| 2561 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ |
| 2562 | if (runnable) | 2593 | if (runnable) |
| 2563 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; |
| 2564 | sa->runnable_avg_period += delta; | 2595 | if (running) |
| 2596 | sa->running_avg_sum += delta * scale_freq | ||
| 2597 | >> SCHED_CAPACITY_SHIFT; | ||
| 2598 | sa->avg_period += delta; | ||
| 2565 | 2599 | ||
| 2566 | return decayed; | 2600 | return decayed; |
| 2567 | } | 2601 | } |
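Toy model for the hunks above (not the kernel implementation): the per-entity average now accumulates, besides the runnable sum, a running sum that is scaled by arch_scale_freq_capacity(), so utilization reflects work done at the current frequency rather than wall-clock running time. All sums decay geometrically per 1024 us period with y chosen so that y^32 = 1/2. The sketch below mimics that shape with my own names and an approximate decay constant in place of the kernel's decay_load() lookup tables:

#include <stdio.h>

#define PERIOD_US               1024    /* one averaging period, ~1 ms */
#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

struct toy_avg {
    unsigned long runnable_sum;   /* time runnable, decayed */
    unsigned long running_sum;    /* time running, frequency-scaled, decayed */
    unsigned long period_sum;     /* total elapsed time, decayed */
};

/* Approximates decay_load(): multiply by y^periods with y ~ 1002/1024. */
static unsigned long decay(unsigned long v, int periods)
{
    while (periods--)
        v = v * 1002 >> 10;
    return v;
}

static void account_period(struct toy_avg *a, int runnable, int running,
                           unsigned long scale_freq)
{
    a->runnable_sum = decay(a->runnable_sum, 1);
    a->running_sum  = decay(a->running_sum, 1);
    a->period_sum   = decay(a->period_sum, 1);

    if (runnable)
        a->runnable_sum += PERIOD_US;
    if (running)    /* this scaling is what the patch adds */
        a->running_sum += PERIOD_US * scale_freq >> SCHED_CAPACITY_SHIFT;
    a->period_sum += PERIOD_US;
}

int main(void)
{
    struct toy_avg a = { 0, 0, 0 };

    /* 100 periods always running, but at half the maximum frequency. */
    for (int i = 0; i < 100; i++)
        account_period(&a, 1, 1, SCHED_CAPACITY_SCALE / 2);

    /* the running sum settles at about half of the runnable sum */
    printf("runnable %lu running %lu period %lu\n",
           a.runnable_sum, a.running_sum, a.period_sum);
    return 0;
}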
| @@ -2578,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
| 2578 | return 0; | 2612 | return 0; |
| 2579 | 2613 | ||
| 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
| 2615 | se->avg.utilization_avg_contrib = | ||
| 2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
| 2581 | 2617 | ||
| 2582 | return decays; | 2618 | return decays; |
| 2583 | } | 2619 | } |
| @@ -2613,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
| 2613 | 2649 | ||
| 2614 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ |
| 2615 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
| 2616 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); |
| 2617 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; |
| 2618 | 2654 | ||
| 2619 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { |
| @@ -2666,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
| 2666 | 2702 | ||
| 2667 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
| 2668 | { | 2704 | { |
| 2669 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, |
| 2706 | runnable, runnable); | ||
| 2670 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
| 2671 | } | 2708 | } |
| 2672 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
| @@ -2684,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
| 2684 | 2721 | ||
| 2685 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ |
| 2686 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); |
| 2687 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); |
| 2688 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); |
| 2689 | } | 2726 | } |
| 2690 | 2727 | ||
| @@ -2703,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
| 2703 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; |
| 2704 | } | 2741 | } |
| 2705 | 2742 | ||
| 2743 | |||
| 2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
| 2745 | { | ||
| 2746 | u32 contrib; | ||
| 2747 | |||
| 2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
| 2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
| 2750 | contrib /= (se->avg.avg_period + 1); | ||
| 2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
| 2752 | } | ||
| 2753 | |||
| 2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
| 2755 | { | ||
| 2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
| 2757 | |||
| 2758 | if (entity_is_task(se)) | ||
| 2759 | __update_task_entity_utilization(se); | ||
| 2760 | else | ||
| 2761 | se->avg.utilization_avg_contrib = | ||
| 2762 | group_cfs_rq(se)->utilization_load_avg; | ||
| 2763 | |||
| 2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
| 2765 | } | ||
| 2766 | |||
| 2706 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, |
| 2707 | long load_contrib) | 2768 | long load_contrib) |
| 2708 | { | 2769 | { |
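Worked arithmetic for the new helper above: __update_task_entity_utilization() is the utilization counterpart of the load contrib, i.e. the frequency-scaled running sum over the decayed period, scaled to SCHED_LOAD_SCALE. Standalone, with the scale_load()/scale_load_down() no-ops dropped:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/* Same arithmetic as __update_task_entity_utilization(), on plain integers. */
static unsigned long utilization_contrib(unsigned long running_avg_sum,
                                         unsigned long avg_period)
{
    return running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1);
}

int main(void)
{
    /* A task that was running for about half of its decayed window. */
    printf("%lu\n", utilization_contrib(23871, 47741));    /* ~512 */
    return 0;
}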
| @@ -2719,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2719 | int update_cfs_rq) | 2780 | int update_cfs_rq) |
| 2720 | { | 2781 | { |
| 2721 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 2722 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; |
| 2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
| 2723 | u64 now; | 2785 | u64 now; |
| 2724 | 2786 | ||
| 2725 | /* | 2787 | /* |
| @@ -2731,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2731 | else | 2793 | else |
| 2732 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); |
| 2733 | 2795 | ||
| 2734 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, |
| 2797 | cfs_rq->curr == se)) | ||
| 2735 | return; | 2798 | return; |
| 2736 | 2799 | ||
| 2737 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); |
| 2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
| 2738 | 2802 | ||
| 2739 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) |
| 2740 | return; | 2804 | return; |
| 2741 | 2805 | ||
| 2742 | if (se->on_rq) | 2806 | if (se->on_rq) { |
| 2743 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; |
| 2744 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; |
| 2809 | } else { | ||
| 2745 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); |
| 2811 | } | ||
| 2746 | } | 2812 | } |
| 2747 | 2813 | ||
| 2748 | /* | 2814 | /* |
| @@ -2817,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2817 | } | 2883 | } |
| 2818 | 2884 | ||
| 2819 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
| 2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
| 2820 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ |
| 2821 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); |
| 2822 | } | 2889 | } |
| @@ -2835,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2835 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); |
| 2836 | 2903 | ||
| 2837 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
| 2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
| 2838 | if (sleep) { | 2906 | if (sleep) { |
| 2839 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
| 2840 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
| @@ -3172,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3172 | */ | 3240 | */ |
| 3173 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); |
| 3174 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); |
| 3243 | update_entity_load_avg(se, 1); | ||
| 3175 | } | 3244 | } |
| 3176 | 3245 | ||
| 3177 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); |
| @@ -4298,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
| 4298 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; |
| 4299 | } | 4368 | } |
| 4300 | 4369 | ||
| 4370 | static unsigned long capacity_orig_of(int cpu) | ||
| 4371 | { | ||
| 4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
| 4373 | } | ||
| 4374 | |||
| 4301 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 4302 | { | 4376 | { |
| 4303 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); |
| @@ -4711,6 +4785,33 @@ next: | |||
| 4711 | done: | 4785 | done: |
| 4712 | return target; | 4786 | return target; |
| 4713 | } | 4787 | } |
| 4788 | /* | ||
| 4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
| 4790 | * tasks. The unit of the return value must be the one of capacity so we can | ||
| 4791 | * compare the usage with the capacity of the CPU that is available for CFS | ||
| 4792 | * task (ie cpu_capacity). | ||
| 4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
| 4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
| 4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
| 4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
| 4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
| 4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
| 4799 | * after migrating tasks until the average stabilizes with the new running | ||
| 4800 | * time. So we need to check that the usage stays into the range | ||
| 4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
| 4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
| 4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
| 4804 | */ | ||
| 4805 | static int get_cpu_usage(int cpu) | ||
| 4806 | { | ||
| 4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
| 4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
| 4809 | |||
| 4810 | if (usage >= SCHED_LOAD_SCALE) | ||
| 4811 | return capacity; | ||
| 4812 | |||
| 4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
| 4814 | } | ||
| 4714 | 4815 | ||
| 4715 | /* | 4816 | /* |
| 4716 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
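Illustration of get_cpu_usage() above: it converts cfs.utilization_load_avg, which lives in the [0..SCHED_LOAD_SCALE] range, into capacity units and caps the result at the CPU's original capacity, exactly as the comment describes. The same clamp-and-scale on plain integers:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* Standalone copy of the get_cpu_usage() arithmetic. */
static unsigned long cpu_usage(unsigned long utilization_load_avg,
                               unsigned long capacity_orig)
{
    if (utilization_load_avg >= SCHED_LOAD_SCALE)
        return capacity_orig;

    return (utilization_load_avg * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
    /* A CPU whose original capacity is 800 (e.g. a smaller core). */
    printf("%lu\n", cpu_usage(512, 800));   /* half used -> 400 */
    printf("%lu\n", cpu_usage(1240, 800));  /* transiently >100% -> capped at 800 */
    return 0;
}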
| @@ -5837,12 +5938,12 @@ struct sg_lb_stats { | |||
| 5837 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 5838 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; |
| 5839 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; |
| 5941 | unsigned long group_usage; /* Total usage of the group */ | ||
| 5840 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
| 5841 | unsigned int group_capacity_factor; | ||
| 5842 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; |
| 5843 | unsigned int group_weight; | 5944 | unsigned int group_weight; |
| 5844 | enum group_type group_type; | 5945 | enum group_type group_type; |
| 5845 | int group_has_free_capacity; | 5946 | int group_no_capacity; |
| 5846 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING |
| 5847 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; |
| 5848 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; |
| @@ -5913,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
| 5913 | return load_idx; | 6014 | return load_idx; |
| 5914 | } | 6015 | } |
| 5915 | 6016 | ||
| 5916 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
| 5917 | { | ||
| 5918 | return SCHED_CAPACITY_SCALE; | ||
| 5919 | } | ||
| 5920 | |||
| 5921 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 5922 | { | ||
| 5923 | return default_scale_capacity(sd, cpu); | ||
| 5924 | } | ||
| 5925 | |||
| 5926 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
| 5927 | { | 6018 | { |
| 5928 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
| @@ -5939,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5939 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) |
| 5940 | { | 6031 | { |
| 5941 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); |
| 5942 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; |
| 5943 | s64 delta; | 6034 | s64 delta; |
| 5944 | 6035 | ||
| 5945 | /* | 6036 | /* |
| @@ -5955,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5955 | 6046 | ||
| 5956 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; |
| 5957 | 6048 | ||
| 5958 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); |
| 5959 | /* Ensures that capacity won't end up being negative */ | ||
| 5960 | available = 0; | ||
| 5961 | } else { | ||
| 5962 | available = total - avg; | ||
| 5963 | } | ||
| 5964 | |||
| 5965 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | ||
| 5966 | total = SCHED_CAPACITY_SCALE; | ||
| 5967 | 6050 | ||
| 5968 | total >>= SCHED_CAPACITY_SHIFT; | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) |
| 6052 | return SCHED_CAPACITY_SCALE - used; | ||
| 5969 | 6053 | ||
| 5970 | return div_u64(available, total); | 6054 | return 1; |
| 5971 | } | 6055 | } |
| 5972 | 6056 | ||
| 5973 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
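Toy numbers for the simplified scale_rt_capacity() above: the new code appears to assume rq->rt_avg is already accumulated in capacity-scaled units (that accumulation side is touched by a companion change), so avg/period directly yields the capacity consumed by RT/DL/irq work, and the remainder is what CFS gets; it never returns 0, so later divisions by capacity stay safe. A standalone model under that assumption:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024ULL

static unsigned long long cfs_capacity_left(unsigned long long rt_avg,
                                            unsigned long long period_ns)
{
    unsigned long long used = rt_avg / period_ns;   /* capacity units */

    if (used < SCHED_CAPACITY_SCALE)
        return SCHED_CAPACITY_SCALE - used;
    return 1;   /* never report zero capacity */
}

int main(void)
{
    unsigned long long period = 1000000000ULL;      /* ~1 s window */

    /* RT work ran for a quarter of the window at full frequency. */
    unsigned long long rt_avg = (period / 4) * SCHED_CAPACITY_SCALE;

    printf("%llu\n", cfs_capacity_left(rt_avg, period));    /* 768 */
    return 0;
}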
| @@ -5982,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5982 | 6066 | ||
| 5983 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; |
| 5984 | 6068 | ||
| 5985 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
| 5986 | |||
| 5987 | if (sched_feat(ARCH_CAPACITY)) | ||
| 5988 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
| 5989 | else | ||
| 5990 | capacity *= default_scale_capacity(sd, cpu); | ||
| 5991 | |||
| 5992 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
| 5993 | 6070 | ||
| 5994 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); |
| 5995 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; |
| @@ -6005,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6005 | { | 6082 | { |
| 6006 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; |
| 6007 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; |
| 6008 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; |
| 6009 | unsigned long interval; | 6086 | unsigned long interval; |
| 6010 | 6087 | ||
| 6011 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -6017,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6017 | return; | 6094 | return; |
| 6018 | } | 6095 | } |
| 6019 | 6096 | ||
| 6020 | capacity_orig = capacity = 0; | 6097 | capacity = 0; |
| 6021 | 6098 | ||
| 6022 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { |
| 6023 | /* | 6100 | /* |
| @@ -6037,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6037 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains |
| 6038 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). |
| 6039 | * | 6116 | * |
| 6040 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and |
| 6041 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. |
| 6042 | * | ||
| 6043 | * Runtime updates will correct capacity_orig. | ||
| 6044 | */ | 6119 | */ |
| 6045 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { |
| 6046 | capacity_orig += capacity_of(cpu); | ||
| 6047 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); |
| 6048 | continue; | 6122 | continue; |
| 6049 | } | 6123 | } |
| 6050 | 6124 | ||
| 6051 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; |
| 6052 | capacity_orig += sgc->capacity_orig; | ||
| 6053 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; |
| 6054 | } | 6127 | } |
| 6055 | } else { | 6128 | } else { |
| @@ -6060,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6060 | 6133 | ||
| 6061 | group = child->groups; | 6134 | group = child->groups; |
| 6062 | do { | 6135 | do { |
| 6063 | capacity_orig += group->sgc->capacity_orig; | ||
| 6064 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; |
| 6065 | group = group->next; | 6137 | group = group->next; |
| 6066 | } while (group != child->groups); | 6138 | } while (group != child->groups); |
| 6067 | } | 6139 | } |
| 6068 | 6140 | ||
| 6069 | sdg->sgc->capacity_orig = capacity_orig; | ||
| 6070 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; |
| 6071 | } | 6142 | } |
| 6072 | 6143 | ||
| 6073 | /* | 6144 | /* |
| 6074 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side |
| 6075 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. |
| 6076 | * which on its own isn't powerful enough. | 6147 | * Return true is the capacity is reduced |
| 6077 | * | ||
| 6078 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
| 6079 | */ | 6148 | */ |
| 6080 | static inline int | 6149 | static inline int |
| 6081 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
| 6082 | { | 6151 | { |
| 6083 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < |
| 6084 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); |
| 6085 | */ | ||
| 6086 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
| 6087 | return 0; | ||
| 6088 | |||
| 6089 | /* | ||
| 6090 | * If ~90% of the cpu_capacity is still there, we're good. | ||
| 6091 | */ | ||
| 6092 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
| 6093 | return 1; | ||
| 6094 | |||
| 6095 | return 0; | ||
| 6096 | } | 6154 | } |
| 6097 | 6155 | ||
| 6098 | /* | 6156 | /* |
| @@ -6130,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
| 6130 | } | 6188 | } |
| 6131 | 6189 | ||
| 6132 | /* | 6190 | /* |
| 6133 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could |
| 6134 | * | 6192 | * be used by some tasks. |
| 6135 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the * number of task is |
| 6136 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available |
| 6137 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. |
| 6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
| 6197 | * account the variance of the tasks' load and to return true if the available | ||
| 6198 | * capacity in meaningful for the load balancer. | ||
| 6199 | * As an example, an available capacity of 1% can appear but it doesn't make | ||
| 6200 | * any benefit for the load balance. | ||
| 6138 | */ | 6201 | */ |
| 6139 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool |
| 6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6140 | { | 6204 | { |
| 6141 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) |
| 6142 | unsigned int capacity, capacity_orig; | 6206 | return true; |
| 6143 | 6207 | ||
| 6144 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > |
| 6145 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) |
| 6146 | cpus = group->group_weight; | 6210 | return true; |
| 6211 | |||
| 6212 | return false; | ||
| 6213 | } | ||
| 6147 | 6214 | ||
| 6148 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6215 | /* |
| 6149 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6216 | * group_is_overloaded returns true if the group has more tasks than it can |
| 6150 | capacity_factor = cpus / smt; /* cores */ | 6217 | * handle. |
| 6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
| 6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
| 6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
| 6221 | * false. | ||
| 6222 | */ | ||
| 6223 | static inline bool | ||
| 6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6225 | { | ||
| 6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
| 6227 | return false; | ||
| 6151 | 6228 | ||
| 6152 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < |
| 6153 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) |
| 6154 | if (!capacity_factor) | 6231 | return true; |
| 6155 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6156 | 6232 | ||
| 6157 | return capacity_factor; | 6233 | return false; |
| 6158 | } | 6234 | } |
| 6159 | 6235 | ||
| 6160 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, |
| 6161 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, |
| 6238 | struct sg_lb_stats *sgs) | ||
| 6162 | { | 6239 | { |
| 6163 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) |
| 6164 | return group_overloaded; | 6241 | return group_overloaded; |
| 6165 | 6242 | ||
| 6166 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) |
| @@ -6198,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6198 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); |
| 6199 | 6276 | ||
| 6200 | sgs->group_load += load; | 6277 | sgs->group_load += load; |
| 6278 | sgs->group_usage += get_cpu_usage(i); | ||
| 6201 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 6202 | 6280 | ||
| 6203 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) |
| @@ -6220,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6220 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 6221 | 6299 | ||
| 6222 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; |
| 6223 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
| 6224 | sgs->group_type = group_classify(group, sgs); | ||
| 6225 | 6301 | ||
| 6226 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
| 6227 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); |
| 6228 | } | 6304 | } |
| 6229 | 6305 | ||
| 6230 | /** | 6306 | /** |
| @@ -6346,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6346 | 6422 | ||
| 6347 | /* | 6423 | /* |
| 6348 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings |
| 6349 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try |
| 6350 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity |
| 6351 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit |
| 6352 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where |
| 6353 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already |
| 6354 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible with a large weight task outweighs |
| 6355 | * with a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). |
| 6356 | */ | 6432 | */ |
| 6357 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && |
| 6358 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && |
| 6359 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { |
| 6360 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; |
| 6437 | sgs->group_type = group_overloaded; | ||
| 6361 | } | 6438 | } |
| 6362 | 6439 | ||
| 6363 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| @@ -6537,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6537 | */ | 6614 | */ |
| 6538 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && |
| 6539 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { |
| 6540 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * |
| 6541 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; |
| 6542 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
| 6543 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; |
| 6544 | load_above_capacity /= busiest->group_capacity; | 6621 | else |
| 6622 | load_above_capacity = ~0UL; | ||
| 6545 | } | 6623 | } |
| 6546 | 6624 | ||
| 6547 | /* | 6625 | /* |
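A quick numeric model of the rewritten load_above_capacity computation in the hunk above. SCHED_LOAD_SCALE is assumed to be 1024 here (one nice-0 task), and the surrounding calculate_imbalance() logic that consumes this value is not reproduced:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* assumed nice-0 load unit */

/* Load carried by the busiest group beyond what its capacity can absorb. */
static unsigned long load_above_capacity(unsigned long sum_nr_running,
					 unsigned long group_capacity)
{
	unsigned long load = sum_nr_running * SCHED_LOAD_SCALE;

	if (load > group_capacity)
		return load - group_capacity;

	return ~0UL;	/* no excess; mirrors the ~0UL fallback above */
}

int main(void)
{
	/* 6 runnable tasks on a group of 4 CPUs at full capacity: 2048. */
	printf("%lu\n", load_above_capacity(6, 4 * 1024));
	/* 3 tasks on the same group: no load above capacity (~0UL). */
	printf("%lu\n", load_above_capacity(3, 4 * 1024));
	return 0;
}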
| @@ -6604,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6604 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; |
| 6605 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; |
| 6606 | 6684 | ||
| 6685 | /* ASYM feature bypasses nice load balance check */ | ||
| 6607 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
| 6608 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) |
| 6609 | return sds.busiest; | 6688 | return sds.busiest; |
| @@ -6624,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6624 | goto force_balance; | 6703 | goto force_balance; |
| 6625 | 6704 | ||
| 6626 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| 6627 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && |
| 6628 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) |
| 6629 | goto force_balance; | 6708 | goto force_balance; |
| 6630 | 6709 | ||
| 6631 | /* | 6710 | /* |
| @@ -6684,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6684 | int i; | 6763 | int i; |
| 6685 | 6764 | ||
| 6686 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 6687 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; |
| 6688 | enum fbq_type rt; | 6767 | enum fbq_type rt; |
| 6689 | 6768 | ||
| 6690 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); |
| @@ -6713,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6713 | continue; | 6792 | continue; |
| 6714 | 6793 | ||
| 6715 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); |
| 6716 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
| 6717 | if (!capacity_factor) | ||
| 6718 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6719 | 6795 | ||
| 6720 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); |
| 6721 | 6797 | ||
| @@ -6723,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6723 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() |
| 6724 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. |
| 6725 | */ | 6801 | */ |
| 6726 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
| 6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
| 6804 | !check_cpu_capacity(rq, env->sd)) | ||
| 6727 | continue; | 6805 | continue; |
| 6728 | 6806 | ||
| 6729 | /* | 6807 | /* |
| @@ -6771,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
| 6771 | return 1; | 6849 | return 1; |
| 6772 | } | 6850 | } |
| 6773 | 6851 | ||
| 6852 | /* | ||
| 6853 | * The dst_cpu is idle and the src_cpu has only 1 CFS task. | ||
| 6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
| 6855 | * because of other sched_class tasks or IRQs, while more capacity stays | ||
| 6856 | * available on dst_cpu. | ||
| 6857 | */ | ||
| 6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
| 6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
| 6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
| 6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
| 6862 | return 1; | ||
| 6863 | } | ||
| 6864 | |||
| 6774 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
| 6775 | } | 6866 | } |
| 6776 | 6867 | ||
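check_cpu_capacity() is introduced elsewhere in this patch and is not visible here; judging from its uses in find_busiest_queue() and in the hunk above, it presumably compares the CPU's remaining capacity against its original capacity using the imbalance_pct margin. The sketch below models that assumed check together with the new src/dst comparison; it is illustrative only, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

/*
 * Assumed model of check_cpu_capacity(): true when the capacity left for
 * CFS tasks has dropped noticeably (by more than the imbalance_pct margin)
 * below the CPU's original capacity.
 */
static bool cpu_capacity_reduced(unsigned long capacity,
				 unsigned long capacity_orig,
				 unsigned int imb_pct)
{
	return capacity * imb_pct < capacity_orig * 100;
}

/*
 * Model of the new need_active_balance() condition: the source CPU is
 * pressured by RT/IRQ work and the destination offers noticeably more
 * capacity for the single CFS task.
 */
static bool worth_active_migration(unsigned long src_cap, unsigned long src_cap_orig,
				   unsigned long dst_cap, unsigned int imb_pct)
{
	return cpu_capacity_reduced(src_cap, src_cap_orig, imb_pct) &&
	       src_cap * imb_pct < dst_cap * 100;
}

int main(void)
{
	/* src squeezed to 600/1024 by IRQs, dst idle at 1024, pct = 125. */
	printf("%d\n", worth_active_migration(600, 1024, 1024, 125));	/* 1 */
	/* dst barely better than src: not worth an active migration. */
	printf("%d\n", worth_active_migration(900, 1024, 1000, 125));	/* 0 */
	return 0;
}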
| @@ -6870,6 +6961,9 @@ redo: | |||
| 6870 | 6961 | ||
| 6871 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
| 6872 | 6963 | ||
| 6964 | env.src_cpu = busiest->cpu; | ||
| 6965 | env.src_rq = busiest; | ||
| 6966 | |||
| 6873 | ld_moved = 0; | 6967 | ld_moved = 0; |
| 6874 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { |
| 6875 | /* | 6969 | /* |
| @@ -6879,8 +6973,6 @@ redo: | |||
| 6879 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. |
| 6880 | */ | 6974 | */ |
| 6881 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; |
| 6882 | env.src_cpu = busiest->cpu; | ||
| 6883 | env.src_rq = busiest; | ||
| 6884 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
| 6885 | 6977 | ||
| 6886 | more_balance: | 6978 | more_balance: |
| @@ -7580,22 +7672,25 @@ end: | |||
| 7580 | 7672 | ||
| 7581 | /* | 7673 | /* |
| 7582 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence |
| 7583 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. |
| 7584 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. |
| 7585 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is |
| 7586 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. |
| 7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
| 7680 | * group has multiple busy cpus. | ||
| 7587 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
| 7588 | * domain span are idle. | 7682 | * domain span are idle. |
| 7589 | */ | 7683 | */ |
| 7590 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) |
| 7591 | { | 7685 | { |
| 7592 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; |
| 7593 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; |
| 7594 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; |
| 7595 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; |
| 7690 | bool kick = false; | ||
| 7596 | 7691 | ||
| 7597 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) |
| 7598 | return 0; | 7693 | return false; |
| 7599 | 7694 | ||
| 7600 | /* | 7695 | /* |
| 7601 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first |
| @@ -7609,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
| 7609 | * balancing. | 7704 | * balancing. |
| 7610 | */ | 7705 | */ |
| 7611 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) |
| 7612 | return 0; | 7707 | return false; |
| 7613 | 7708 | ||
| 7614 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) |
| 7615 | return 0; | 7710 | return false; |
| 7616 | 7711 | ||
| 7617 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) |
| 7618 | goto need_kick; | 7713 | return true; |
| 7619 | 7714 | ||
| 7620 | rcu_read_lock(); | 7715 | rcu_read_lock(); |
| 7621 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 7622 | |||
| 7623 | if (sd) { | 7717 | if (sd) { |
| 7624 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; |
| 7625 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
| 7626 | 7720 | ||
| 7627 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { |
| 7628 | goto need_kick_unlock; | 7722 | kick = true; |
| 7723 | goto unlock; | ||
| 7724 | } | ||
| 7725 | |||
| 7629 | } | 7726 | } |
| 7630 | 7727 | ||
| 7631 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); |
| 7729 | if (sd) { | ||
| 7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
| 7731 | check_cpu_capacity(rq, sd)) { | ||
| 7732 | kick = true; | ||
| 7733 | goto unlock; | ||
| 7734 | } | ||
| 7735 | } | ||
| 7632 | 7736 | ||
| 7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 7633 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, |
| 7634 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { |
| 7635 | goto need_kick_unlock; | 7740 | kick = true; |
| 7636 | 7741 | goto unlock; | |
| 7637 | rcu_read_unlock(); | 7742 | } |
| 7638 | return 0; | ||
| 7639 | 7743 | ||
| 7640 | need_kick_unlock: | 7744 | unlock: |
| 7641 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); |
| 7642 | need_kick: | 7746 | return kick; |
| 7643 | return 1; | ||
| 7644 | } | 7747 | } |
| 7645 | #else | 7748 | #else |
| 7646 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
| @@ -7656,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 7656 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? |
| 7657 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; |
| 7658 | 7761 | ||
| 7659 | rebalance_domains(this_rq, idle); | ||
| 7660 | |||
| 7661 | /* | 7762 | /* |
| 7662 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the |
| 7663 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are |
| 7664 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
| 7766 | * give the idle cpus a chance to load balance. Else we may | ||
| 7767 | * load balance only within the local sched_domain hierarchy | ||
| 7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
| 7665 | */ | 7769 | */ |
| 7666 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); |
| 7771 | rebalance_domains(this_rq, idle); | ||
| 7667 | } | 7772 | } |
| 7668 | 7773 | ||
| 7669 | /* | 7774 | /* |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
| 56 | */ | 56 | */ |
| 57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) |
| 58 | 58 | ||
| 59 | #ifdef HAVE_RT_PUSH_IPI | ||
| 60 | /* | ||
| 61 | * In order to avoid a thundering herd attack of CPUs that are | ||
| 62 | * lowering their priorities at the same time, and there being | ||
| 63 | * a single CPU that has an RT task that can migrate and is waiting | ||
| 65 | * to run, where the other CPUs will try to take that CPU's | ||
| 66 | * rq lock and possibly create a large contention, sending an | ||
| 67 | * IPI to that CPU and letting that CPU push the RT task to where | ||
| 67 | * it should go may be a better scenario. | ||
| 68 | */ | ||
| 69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
| 70 | #endif | ||
| 71 | |||
| 59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
| 60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
| 61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80014a178342..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) | |||
| 158 | * is used from another cpu as a broadcast timer, this call may | 158 | * is used from another cpu as a broadcast timer, this call may |
| 159 | * fail if it is not available | 159 | * fail if it is not available |
| 160 | */ | 160 | */ |
| 161 | if (broadcast && | 161 | if (broadcast && tick_broadcast_enter()) |
| 162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
| 163 | goto use_default; | 162 | goto use_default; |
| 164 | 163 | ||
| 165 | /* Take note of the planned idle state. */ | 164 | /* Take note of the planned idle state. */ |
| @@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) | |||
| 176 | idle_set_state(this_rq(), NULL); | 175 | idle_set_state(this_rq(), NULL); |
| 177 | 176 | ||
| 178 | if (broadcast) | 177 | if (broadcast) |
| 179 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 178 | tick_broadcast_exit(); |
| 180 | 179 | ||
| 181 | /* | 180 | /* |
| 182 | * Give the governor an opportunity to reflect on the outcome | 181 | * Give the governor an opportunity to reflect on the outcome |
| @@ -210,6 +209,8 @@ use_default: | |||
| 210 | goto exit_idle; | 209 | goto exit_idle; |
| 211 | } | 210 | } |
| 212 | 211 | ||
| 212 | DEFINE_PER_CPU(bool, cpu_dead_idle); | ||
| 213 | |||
| 213 | /* | 214 | /* |
| 214 | * Generic idle loop implementation | 215 | * Generic idle loop implementation |
| 215 | * | 216 | * |
| @@ -234,8 +235,13 @@ static void cpu_idle_loop(void) | |||
| 234 | check_pgt_cache(); | 235 | check_pgt_cache(); |
| 235 | rmb(); | 236 | rmb(); |
| 236 | 237 | ||
| 237 | if (cpu_is_offline(smp_processor_id())) | 238 | if (cpu_is_offline(smp_processor_id())) { |
| 239 | rcu_cpu_notify(NULL, CPU_DYING_IDLE, | ||
| 240 | (void *)(long)smp_processor_id()); | ||
| 241 | smp_mb(); /* all activity before dead. */ | ||
| 242 | this_cpu_write(cpu_dead_idle, true); | ||
| 238 | arch_cpu_idle_dead(); | 243 | arch_cpu_idle_dead(); |
| 244 | } | ||
| 239 | 245 | ||
| 240 | local_irq_disable(); | 246 | local_irq_disable(); |
| 241 | arch_cpu_idle_enter(); | 247 | arch_cpu_idle_enter(); |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include "sched.h" | 6 | #include "sched.h" |
| 7 | 7 | ||
| 8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
| 9 | #include <linux/irq_work.h> | ||
| 9 | 10 | ||
| 10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
| 11 | 12 | ||
| @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
| 60 | } | 61 | } |
| 61 | 62 | ||
| 62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP |
| 64 | static void push_irq_work_func(struct irq_work *work); | ||
| 65 | #endif | ||
| 66 | |||
| 67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
| 63 | { | 68 | { |
| 64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; |
| 65 | int i; | 70 | int i; |
| @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; |
| 79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; |
| 80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); |
| 86 | |||
| 87 | #ifdef HAVE_RT_PUSH_IPI | ||
| 88 | rt_rq->push_flags = 0; | ||
| 89 | rt_rq->push_cpu = nr_cpu_ids; | ||
| 90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
| 91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
| 81 | #endif | 92 | #endif |
| 93 | #endif /* CONFIG_SMP */ | ||
| 82 | /* We start in dequeued state, because no RT tasks are queued */ | 94 |
| 83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; |
| 84 | 96 | ||
| @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 193 | if (!rt_se) | 205 | if (!rt_se) |
| 194 | goto err_free_rq; | 206 | goto err_free_rq; |
| 195 | 207 | ||
| 196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); |
| 197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
| 198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
| 199 | } | 211 | } |
| @@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
| 1778 | ; | 1790 | ; |
| 1779 | } | 1791 | } |
| 1780 | 1792 | ||
| 1793 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1794 | /* | ||
| 1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
| 1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
| 1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
| 1798 | * is complete. | ||
| 1799 | * | ||
| 1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
| 1801 | * or if this is the first instance, it must hold rq->cpu. | ||
| 1802 | */ | ||
| 1803 | static int rto_next_cpu(struct rq *rq) | ||
| 1804 | { | ||
| 1805 | int prev_cpu = rq->rt.push_cpu; | ||
| 1806 | int cpu; | ||
| 1807 | |||
| 1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
| 1812 | * passed the end of the mask, and has started from the beginning. | ||
| 1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
| 1814 | */ | ||
| 1815 | if (prev_cpu < rq->cpu) { | ||
| 1816 | if (cpu >= rq->cpu) | ||
| 1817 | return nr_cpu_ids; | ||
| 1818 | |||
| 1819 | } else if (cpu >= nr_cpu_ids) { | ||
| 1820 | /* | ||
| 1821 | * We passed the end of the mask, start at the beginning. | ||
| 1822 | * If the result is greater or equal to the rq's CPU, then | ||
| 1823 | * the loop is finished. | ||
| 1824 | */ | ||
| 1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
| 1826 | if (cpu >= rq->cpu) | ||
| 1827 | return nr_cpu_ids; | ||
| 1828 | } | ||
| 1829 | rq->rt.push_cpu = cpu; | ||
| 1830 | |||
| 1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
| 1832 | return cpu; | ||
| 1833 | } | ||
| 1834 | |||
| 1835 | static int find_next_push_cpu(struct rq *rq) | ||
| 1836 | { | ||
| 1837 | struct rq *next_rq; | ||
| 1838 | int cpu; | ||
| 1839 | |||
| 1840 | while (1) { | ||
| 1841 | cpu = rto_next_cpu(rq); | ||
| 1842 | if (cpu >= nr_cpu_ids) | ||
| 1843 | break; | ||
| 1844 | next_rq = cpu_rq(cpu); | ||
| 1845 | |||
| 1846 | /* Make sure the next rq can push to this rq */ | ||
| 1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
| 1848 | break; | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | return cpu; | ||
| 1852 | } | ||
| 1853 | |||
| 1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
| 1855 | #define RT_PUSH_IPI_RESTART 2 | ||
| 1856 | |||
| 1857 | static void tell_cpu_to_push(struct rq *rq) | ||
| 1858 | { | ||
| 1859 | int cpu; | ||
| 1860 | |||
| 1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1862 | raw_spin_lock(&rq->rt.push_lock); | ||
| 1863 | /* Make sure it's still executing */ | ||
| 1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1865 | /* | ||
| 1866 | * Tell the IPI to restart the loop as things have | ||
| 1867 | * changed since it started. | ||
| 1868 | */ | ||
| 1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
| 1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1871 | return; | ||
| 1872 | } | ||
| 1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1874 | } | ||
| 1875 | |||
| 1876 | /* When here, there's no IPI going around */ | ||
| 1877 | |||
| 1878 | rq->rt.push_cpu = rq->cpu; | ||
| 1879 | cpu = find_next_push_cpu(rq); | ||
| 1880 | if (cpu >= nr_cpu_ids) | ||
| 1881 | return; | ||
| 1882 | |||
| 1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
| 1884 | |||
| 1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
| 1886 | } | ||
| 1887 | |||
| 1888 | /* Called from hardirq context */ | ||
| 1889 | static void try_to_push_tasks(void *arg) | ||
| 1890 | { | ||
| 1891 | struct rt_rq *rt_rq = arg; | ||
| 1892 | struct rq *rq, *src_rq; | ||
| 1893 | int this_cpu; | ||
| 1894 | int cpu; | ||
| 1895 | |||
| 1896 | this_cpu = rt_rq->push_cpu; | ||
| 1897 | |||
| 1898 | /* Paranoid check */ | ||
| 1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
| 1900 | |||
| 1901 | rq = cpu_rq(this_cpu); | ||
| 1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
| 1903 | |||
| 1904 | again: | ||
| 1905 | if (has_pushable_tasks(rq)) { | ||
| 1906 | raw_spin_lock(&rq->lock); | ||
| 1907 | push_rt_task(rq); | ||
| 1908 | raw_spin_unlock(&rq->lock); | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
| 1912 | raw_spin_lock(&rt_rq->push_lock); | ||
| 1913 | /* | ||
| 1914 | * If the source queue changed since the IPI went out, | ||
| 1915 | * we need to restart the search from that CPU again. | ||
| 1916 | */ | ||
| 1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
| 1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
| 1919 | rt_rq->push_cpu = src_rq->cpu; | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | cpu = find_next_push_cpu(src_rq); | ||
| 1923 | |||
| 1924 | if (cpu >= nr_cpu_ids) | ||
| 1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
| 1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
| 1927 | |||
| 1928 | if (cpu >= nr_cpu_ids) | ||
| 1929 | return; | ||
| 1930 | |||
| 1931 | /* | ||
| 1932 | * It is possible that a restart caused this CPU to be | ||
| 1933 | * chosen again. Don't bother with an IPI, just see if we | ||
| 1934 | * have more to push. | ||
| 1935 | */ | ||
| 1936 | if (unlikely(cpu == rq->cpu)) | ||
| 1937 | goto again; | ||
| 1938 | |||
| 1939 | /* Try the next RT overloaded CPU */ | ||
| 1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | static void push_irq_work_func(struct irq_work *work) | ||
| 1944 | { | ||
| 1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
| 1946 | |||
| 1947 | try_to_push_tasks(rt_rq); | ||
| 1948 | } | ||
| 1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
| 1950 | |||
| 1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) |
| 1782 | { | 1952 | { |
| 1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; |
| @@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1793 | */ | 1963 | */ |
| 1794 | smp_rmb(); | 1964 | smp_rmb(); |
| 1795 | 1965 | ||
| 1966 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
| 1968 | tell_cpu_to_push(this_rq); | ||
| 1969 | return 0; | ||
| 1970 | } | ||
| 1971 | #endif | ||
| 1972 | |||
| 1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
| 1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) |
| 1798 | continue; | 1975 | continue; |
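The core of the new IPI chain in rt.c is the wrap-around walk implemented by rto_next_cpu(): start just after the requesting CPU, visit every CPU in rd->rto_mask at most once, and stop as soon as the walk would reach or pass the requesting CPU again. Below is a user-space model of that walk with the overload mask reduced to a plain bitmask; the priority filtering done by find_next_push_cpu() and the RESTART handling are left out, and NR_CPUS, the mask and the starting CPU are made up for the example.

#include <stdio.h>

#define NR_CPUS 8

/* Next CPU set in @mask strictly after @cpu, or NR_CPUS if none is left. */
static int mask_next(unsigned int mask, int cpu)
{
	for (int i = cpu + 1; i < NR_CPUS; i++)
		if (mask & (1u << i))
			return i;
	return NR_CPUS;
}

/* First CPU set in @mask, or NR_CPUS if the mask is empty. */
static int mask_first(unsigned int mask)
{
	return mask_next(mask, -1);
}

/*
 * Model of rto_next_cpu(): @push_cpu is the CPU returned by the previous
 * call (or @this_cpu on the first call).  Returns the next CPU of the
 * circular walk, or NR_CPUS once the walk has gone all the way around.
 */
static int rto_next_cpu(unsigned int rto_mask, int this_cpu, int push_cpu)
{
	int cpu = mask_next(rto_mask, push_cpu);

	if (push_cpu < this_cpu)	/* already wrapped around once */
		return cpu >= this_cpu ? NR_CPUS : cpu;

	if (cpu >= NR_CPUS) {		/* hit the end: wrap to the start */
		cpu = mask_first(rto_mask);
		if (cpu >= this_cpu)
			return NR_CPUS;
	}
	return cpu;
}

int main(void)
{
	unsigned int rto_mask = 0xAA;	/* CPUs 1, 3, 5, 7 are RT overloaded */
	int this_cpu = 4, cpu = this_cpu;

	/* Visiting order starting from CPU 4: 5, 7, 1, 3 - each exactly once. */
	while ((cpu = rto_next_cpu(rto_mask, this_cpu, cpu)) < NR_CPUS)
		printf("push IPI to CPU %d\n", cpu);
	return 0;
}

In the patch, each visited CPU runs try_to_push_tasks() from irq_work context, pushes what it can from its own runqueue, and forwards the IPI to the next CPU returned by this walk, so the lowered-priority CPU never has to contend for every overloaded runqueue's lock itself.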
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
| 7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
| 8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
| 9 | #include <linux/irq_work.h> | ||
| 9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> |
| 10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
| 11 | 12 | ||
| @@ -362,8 +363,14 @@ struct cfs_rq { | |||
| 362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
| 363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in |
| 364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). |
| 366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
| 367 | * sched_entities on the rq. | ||
| 368 | * blocked_load_avg is similar to runnable_load_avg except that it is summed | ||
| 369 | * over the blocked sched_entities on the rq. | ||
| 370 | * utilization_load_avg is the sum of the average running time of the | ||
| 371 | * sched_entities on the rq. | ||
| 365 | */ | 372 | */ |
| 366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; |
| 367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; |
| 368 | u64 last_decay; | 375 | u64 last_decay; |
| 369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; |
| @@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
| 418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; |
| 419 | } | 426 | } |
| 420 | 427 | ||
| 428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
| 429 | #ifdef CONFIG_IRQ_WORK | ||
| 430 | # define HAVE_RT_PUSH_IPI | ||
| 431 | #endif | ||
| 432 | |||
| 421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ |
| 422 | struct rt_rq { | 434 | struct rt_rq { |
| 423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; |
| @@ -435,7 +447,13 @@ struct rt_rq { | |||
| 435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; |
| 436 | int overloaded; | 448 | int overloaded; |
| 437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; |
| 450 | #ifdef HAVE_RT_PUSH_IPI | ||
| 451 | int push_flags; | ||
| 452 | int push_cpu; | ||
| 453 | struct irq_work push_work; | ||
| 454 | raw_spinlock_t push_lock; | ||
| 438 | #endif | 455 | #endif |
| 456 | #endif /* CONFIG_SMP */ | ||
| 439 | int rt_queued; | 457 | int rt_queued; |
| 440 | 458 | ||
| 441 | int rt_throttled; | 459 | int rt_throttled; |
| @@ -597,6 +615,7 @@ struct rq { | |||
| 597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; |
| 598 | 616 | ||
| 599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; |
| 618 | unsigned long cpu_capacity_orig; | ||
| 600 | 619 | ||
| 601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; |
| 602 | /* For active balancing */ | 621 | /* For active balancing */ |
| @@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
| 807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
| 808 | * for a single CPU. | 827 | * for a single CPU. |
| 809 | */ | 828 | */ |
| 810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; |
| 811 | unsigned long next_update; | 830 | unsigned long next_update; |
| 812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
| 813 | /* | 832 | /* |
| @@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
| 1368 | 1387 | ||
| 1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
| 1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); |
| 1390 | |||
| 1391 | #ifndef arch_scale_freq_capacity | ||
| 1392 | static __always_inline | ||
| 1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 1394 | { | ||
| 1395 | return SCHED_CAPACITY_SCALE; | ||
| 1396 | } | ||
| 1397 | #endif | ||
| 1398 | |||
| 1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
| 1372 | { | 1400 | { |
| 1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
| 1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); |
| 1375 | } | 1403 | } |
| 1376 | #else | 1404 | #else |
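A small model of the change to sched_rt_avg_update() above: rt_delta is now scaled by arch_scale_freq_capacity() before being accumulated, so an architecture that knows the CPU is running below its maximum frequency can account RT/IRQ time in capacity units rather than raw wall-clock time. The default hook added above simply returns SCHED_CAPACITY_SCALE, which keeps the old behaviour up to a constant factor. The sketch drops the sched_domain argument and the decay done by sched_avg_update(); the 50% frequency value is an assumption for the example.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Pretend the arch reports the CPU running at half of its max frequency. */
static unsigned long arch_scale_freq_capacity(int cpu)
{
	(void)cpu;
	return SCHED_CAPACITY_SCALE / 2;	/* 512 == 50% */
}

int main(void)
{
	unsigned long rt_avg = 0;
	unsigned long rt_delta = 2000000;	/* 2ms of RT/IRQ time, in ns */

	/* Model of sched_rt_avg_update(): accumulate frequency-scaled time. */
	rt_avg += rt_delta * arch_scale_freq_capacity(0);
	printf("rt_avg=%lu\n", rt_avg);
	return 0;
}

At half frequency the same 2ms of RT time contributes half of what it would at full speed, which matches how the fair class later converts rq->rt_avg into capacity lost to RT/IRQ work.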
| @@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); |
| 1644 | 1672 | ||
| 1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); |
| 1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); |
| 1648 | 1676 | ||
| 1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); |
| 1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); |
