-rw-r--r--   arch/x86/include/asm/mwait.h |   8
-rw-r--r--   arch/x86/kernel/process.c    |  51
-rw-r--r--   include/linux/irq_work.h     |   3
-rw-r--r--   include/linux/sched.h        |  21
-rw-r--r--   kernel/sched/core.c          |  96
-rw-r--r--   kernel/sched/deadline.c      |  77
-rw-r--r--   kernel/sched/debug.c         |  12
-rw-r--r--   kernel/sched/fair.c          | 425
-rw-r--r--   kernel/sched/features.h      |  13
-rw-r--r--   kernel/sched/rt.c            | 181
-rw-r--r--   kernel/sched/sched.h         |  38
11 files changed, 690 insertions, 235 deletions
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index a1410db38a1a..653dfa7662e1 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
30 | :: "a" (eax), "c" (ecx)); | 30 | :: "a" (eax), "c" (ecx)); |
31 | } | 31 | } |
32 | 32 | ||
33 | static inline void __sti_mwait(unsigned long eax, unsigned long ecx) | ||
34 | { | ||
35 | trace_hardirqs_on(); | ||
36 | /* "mwait %eax, %ecx;" */ | ||
37 | asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" | ||
38 | :: "a" (eax), "c" (ecx)); | ||
39 | } | ||
40 | |||
33 | /* | 41 | /* |
34 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | 42 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, |
35 | * which can obviate IPI to trigger checking of need_resched. | 43 | * which can obviate IPI to trigger checking of need_resched. |
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..a388bb883128 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -24,6 +24,7 @@
24 | #include <asm/syscalls.h> | 24 | #include <asm/syscalls.h> |
25 | #include <asm/idle.h> | 25 | #include <asm/idle.h> |
26 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
27 | #include <asm/mwait.h> | ||
27 | #include <asm/i387.h> | 28 | #include <asm/i387.h> |
28 | #include <asm/fpu-internal.h> | 29 | #include <asm/fpu-internal.h> |
29 | #include <asm/debugreg.h> | 30 | #include <asm/debugreg.h> |
@@ -399,6 +400,53 @@ static void amd_e400_idle(void) | |||
399 | default_idle(); | 400 | default_idle(); |
400 | } | 401 | } |
401 | 402 | ||
403 | /* | ||
404 | * Intel Core2 and older machines prefer MWAIT over HALT for C1. | ||
405 | * We can't rely on cpuidle installing MWAIT, because it will not load | ||
406 | * on systems that support only C1 -- so the boot default must be MWAIT. | ||
407 | * | ||
408 | * Some AMD machines are the opposite, they depend on using HALT. | ||
409 | * | ||
410 | * So for default C1, which is used during boot until cpuidle loads, | ||
411 | * use MWAIT-C1 on Intel HW that has it, else use HALT. | ||
412 | */ | ||
413 | static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) | ||
414 | { | ||
415 | if (c->x86_vendor != X86_VENDOR_INTEL) | ||
416 | return 0; | ||
417 | |||
418 | if (!cpu_has(c, X86_FEATURE_MWAIT)) | ||
419 | return 0; | ||
420 | |||
421 | return 1; | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * MONITOR/MWAIT with no hints, used for default default C1 state. | ||
426 | * This invokes MWAIT with interrutps enabled and no flags, | ||
427 | * which is backwards compatible with the original MWAIT implementation. | ||
428 | */ | ||
429 | |||
430 | static void mwait_idle(void) | ||
431 | { | ||
432 | if (!current_set_polling_and_test()) { | ||
433 | if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { | ||
434 | smp_mb(); /* quirk */ | ||
435 | clflush((void *)¤t_thread_info()->flags); | ||
436 | smp_mb(); /* quirk */ | ||
437 | } | ||
438 | |||
439 | __monitor((void *)¤t_thread_info()->flags, 0, 0); | ||
440 | if (!need_resched()) | ||
441 | __sti_mwait(0, 0); | ||
442 | else | ||
443 | local_irq_enable(); | ||
444 | } else { | ||
445 | local_irq_enable(); | ||
446 | } | ||
447 | __current_clr_polling(); | ||
448 | } | ||
449 | |||
402 | void select_idle_routine(const struct cpuinfo_x86 *c) | 450 | void select_idle_routine(const struct cpuinfo_x86 *c) |
403 | { | 451 | { |
404 | #ifdef CONFIG_SMP | 452 | #ifdef CONFIG_SMP |
@@ -412,6 +460,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) | |||
412 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ | 460 | /* E400: APIC timer interrupt does not wake up CPU from C1e */ |
413 | pr_info("using AMD E400 aware idle routine\n"); | 461 | pr_info("using AMD E400 aware idle routine\n"); |
414 | x86_idle = amd_e400_idle; | 462 | x86_idle = amd_e400_idle; |
463 | } else if (prefer_mwait_c1_over_halt(c)) { | ||
464 | pr_info("using mwait in idle threads\n"); | ||
465 | x86_idle = mwait_idle; | ||
415 | } else | 466 | } else |
416 | x86_idle = default_idle; | 467 | x86_idle = default_idle; |
417 | } | 468 | } |
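The mwait_idle() path relies on a publish/re-check handshake: the idle CPU first advertises that it is polling (current_set_polling_and_test()), arms MONITOR on its flags word, re-checks need_resched(), and only then drops into MWAIT with interrupts re-enabled by the preceding STI, whose one-instruction shadow keeps a wakeup interrupt from slipping in between. The waker, for its part, can skip the IPI when it sees the polling flag. The sketch below is only a user-space analogy of that ordering, built from C11 atomics and pthreads; the names and the busy-wait standing in for MWAIT are inventions for illustration, not the kernel API.

/*
 * User-space analogy of the idle/wake handshake (assumption: this mirrors
 * the idea only).  The idle side publishes POLLING before it inspects the
 * work flag; the wake side sets the work flag before it inspects POLLING.
 * With sequentially consistent atomics either the idler sees the new work
 * and skips its "mwait", or the waker sees POLLING and knows a plain store
 * is enough to wake it (no IPI equivalent needed).
 */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_int need_resched_flag;	/* "work pending" */
static atomic_int polling;		/* idle CPU is watching the flag */

static void *idle_cpu(void *arg)
{
	(void)arg;
	atomic_store(&polling, 1);			/* ~ current_set_polling_and_test() */
	if (!atomic_load(&need_resched_flag)) {
		/* ~ __monitor() + final need_resched() check + __sti_mwait() */
		while (!atomic_load(&need_resched_flag))
			;				/* spin stands in for MWAIT */
	}
	atomic_store(&polling, 0);			/* ~ __current_clr_polling() */
	puts("idle: woke up, going to schedule()");
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	atomic_store(&need_resched_flag, 1);		/* ~ set_tsk_need_resched() */
	if (atomic_load(&polling))
		puts("waker: target is polling, the store alone wakes it");
	else
		puts("waker: target not polling, would send an IPI");
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, idle_cpu, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}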
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index bf3fe719c7ce..47b9ebd4a74f 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -38,16 +38,17 @@ bool irq_work_queue(struct irq_work *work);
38 | bool irq_work_queue_on(struct irq_work *work, int cpu); | 38 | bool irq_work_queue_on(struct irq_work *work, int cpu); |
39 | #endif | 39 | #endif |
40 | 40 | ||
41 | void irq_work_run(void); | ||
42 | void irq_work_tick(void); | 41 | void irq_work_tick(void); |
43 | void irq_work_sync(struct irq_work *work); | 42 | void irq_work_sync(struct irq_work *work); |
44 | 43 | ||
45 | #ifdef CONFIG_IRQ_WORK | 44 | #ifdef CONFIG_IRQ_WORK |
46 | #include <asm/irq_work.h> | 45 | #include <asm/irq_work.h> |
47 | 46 | ||
47 | void irq_work_run(void); | ||
48 | bool irq_work_needs_cpu(void); | 48 | bool irq_work_needs_cpu(void); |
49 | #else | 49 | #else |
50 | static inline bool irq_work_needs_cpu(void) { return false; } | 50 | static inline bool irq_work_needs_cpu(void) { return false; } |
51 | static inline void irq_work_run(void) { } | ||
51 | #endif | 52 | #endif |
52 | 53 | ||
53 | #endif /* _LINUX_IRQ_WORK_H */ | 54 | #endif /* _LINUX_IRQ_WORK_H */ |
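The header change moves the irq_work_run() declaration under CONFIG_IRQ_WORK and gives !CONFIG_IRQ_WORK builds an empty static inline stub, so call sites compile to nothing instead of growing their own #ifdefs. A minimal standalone illustration of that pattern (HAVE_FEATURE and feature_run() are made-up names for the example):

#include <stdio.h>

/* Toggle the feature at compile time, e.g.:  cc -DHAVE_FEATURE demo.c */
#ifdef HAVE_FEATURE
static void feature_run(void)
{
	puts("feature: doing real work");
}
#else
/* Stub: callers need no #ifdef of their own and the call folds away. */
static inline void feature_run(void) { }
#endif

int main(void)
{
	feature_run();	/* compiles and links either way */
	return 0;
}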
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 51348f77e431..3f3308824fa4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1123,15 +1123,28 @@ struct load_weight {
1123 | }; | 1123 | }; |
1124 | 1124 | ||
1125 | struct sched_avg { | 1125 | struct sched_avg { |
1126 | u64 last_runnable_update; | ||
1127 | s64 decay_count; | ||
1128 | /* | ||
1129 | * utilization_avg_contrib describes the amount of time that a | ||
1130 | * sched_entity is running on a CPU. It is based on running_avg_sum | ||
1131 | * and is scaled in the range [0..SCHED_LOAD_SCALE]. | ||
1132 | * load_avg_contrib described the amount of time that a sched_entity | ||
1133 | * is runnable on a rq. It is based on both runnable_avg_sum and the | ||
1134 | * weight of the task. | ||
1135 | */ | ||
1136 | unsigned long load_avg_contrib, utilization_avg_contrib; | ||
1126 | /* | 1137 | /* |
1127 | * These sums represent an infinite geometric series and so are bound | 1138 | * These sums represent an infinite geometric series and so are bound |
1128 | * above by 1024/(1-y). Thus we only need a u32 to store them for all | 1139 | * above by 1024/(1-y). Thus we only need a u32 to store them for all |
1129 | * choices of y < 1-2^(-32)*1024. | 1140 | * choices of y < 1-2^(-32)*1024. |
1141 | * running_avg_sum reflects the time that the sched_entity is | ||
1142 | * effectively running on the CPU. | ||
1143 | * runnable_avg_sum represents the amount of time a sched_entity is on | ||
1144 | * a runqueue which includes the running time that is monitored by | ||
1145 | * running_avg_sum. | ||
1130 | */ | 1146 | */ |
1131 | u32 runnable_avg_sum, runnable_avg_period; | 1147 | u32 runnable_avg_sum, avg_period, running_avg_sum; |
1132 | u64 last_runnable_update; | ||
1133 | s64 decay_count; | ||
1134 | unsigned long load_avg_contrib; | ||
1135 | }; | 1148 | }; |
1136 | 1149 | ||
1137 | #ifdef CONFIG_SCHEDSTATS | 1150 | #ifdef CONFIG_SCHEDSTATS |
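With the new fields, load_avg_contrib is still derived from runnable_avg_sum weighted by the entity's load, while utilization_avg_contrib is derived from running_avg_sum scaled into [0..SCHED_LOAD_SCALE]; both are normalized by avg_period. The arithmetic sketch below mirrors __update_task_entity_contrib() and __update_task_entity_utilization() from later in this diff; SCHED_LOAD_SCALE = 1024 and the converged period sum of roughly 47742 are assumed constants.

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define LOAD_AVG_MAX		47742UL	/* ~ 1024/(1-y) with y^32 = 1/2 */

/* runnable share of the period, scaled by the entity's weight */
static unsigned long load_avg_contrib(unsigned long runnable_avg_sum,
				      unsigned long avg_period,
				      unsigned long weight)
{
	return runnable_avg_sum * weight / (avg_period + 1);
}

/* running (on-CPU) share of the period, scaled to SCHED_LOAD_SCALE */
static unsigned long utilization_avg_contrib(unsigned long running_avg_sum,
					     unsigned long avg_period)
{
	return running_avg_sum * SCHED_LOAD_SCALE / (avg_period + 1);
}

int main(void)
{
	/* a task runnable ~50% and running ~25% of a fully decayed window */
	unsigned long period = LOAD_AVG_MAX;

	printf("load_avg_contrib        = %lu\n",
	       load_avg_contrib(period / 2, period, 1024));	/* ~512 */
	printf("utilization_avg_contrib = %lu\n",
	       utilization_avg_contrib(period / 4, period));	/* ~256 */
	return 0;
}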
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d5f6f6d14c2..261af7bfcb67 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -690,6 +690,23 @@ static inline bool got_nohz_idle_kick(void)
690 | bool sched_can_stop_tick(void) | 690 | bool sched_can_stop_tick(void) |
691 | { | 691 | { |
692 | /* | 692 | /* |
693 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
694 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
695 | */ | ||
696 | if (current->policy == SCHED_FIFO) | ||
697 | return true; | ||
698 | |||
699 | /* | ||
700 | * Round-robin realtime tasks time slice with other tasks at the same | ||
701 | * realtime priority. Is this task the only one at this priority? | ||
702 | */ | ||
703 | if (current->policy == SCHED_RR) { | ||
704 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
705 | |||
706 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
707 | } | ||
708 | |||
709 | /* | ||
693 | * More than one running task need preemption. | 710 | * More than one running task need preemption. |
694 | * nr_running update is assumed to be visible | 711 | * nr_running update is assumed to be visible |
695 | * after IPI is sent from wakers. | 712 | * after IPI is sent from wakers. |
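The SCHED_RR branch decides whether the tick can be stopped by testing run_list.prev == run_list.next: on a circular doubly linked list, an entry whose prev and next point to the same node is the only entry at that priority. A standalone sketch of that test follows (a plain struct rather than the kernel's list_head API; the helper names are invented for the example):

#include <stdio.h>

struct node {
	struct node *prev, *next;
};

/* Head of a circular doubly linked list; empty when it points to itself. */
static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *head, struct node *n)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* Mirrors the rt_se->run_list.prev == rt_se->run_list.next idea. */
static int is_only_entry(const struct node *entry)
{
	return entry->prev == entry->next;
}

int main(void)
{
	struct node head, a, b;

	list_init(&head);
	list_add_tail(&head, &a);
	printf("one entry  : %d\n", is_only_entry(&a));	/* 1: prev == next == head */

	list_add_tail(&head, &b);
	printf("two entries: %d\n", is_only_entry(&a));	/* 0 */
	return 0;
}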
@@ -5335,36 +5352,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
5335 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5352 | static int sched_cpu_inactive(struct notifier_block *nfb, |
5336 | unsigned long action, void *hcpu) | 5353 | unsigned long action, void *hcpu) |
5337 | { | 5354 | { |
5338 | unsigned long flags; | ||
5339 | long cpu = (long)hcpu; | ||
5340 | struct dl_bw *dl_b; | ||
5341 | |||
5342 | switch (action & ~CPU_TASKS_FROZEN) { | 5355 | switch (action & ~CPU_TASKS_FROZEN) { |
5343 | case CPU_DOWN_PREPARE: | 5356 | case CPU_DOWN_PREPARE: |
5344 | set_cpu_active(cpu, false); | 5357 | set_cpu_active((long)hcpu, false); |
5345 | |||
5346 | /* explicitly allow suspend */ | ||
5347 | if (!(action & CPU_TASKS_FROZEN)) { | ||
5348 | bool overflow; | ||
5349 | int cpus; | ||
5350 | |||
5351 | rcu_read_lock_sched(); | ||
5352 | dl_b = dl_bw_of(cpu); | ||
5353 | |||
5354 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
5355 | cpus = dl_bw_cpus(cpu); | ||
5356 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
5357 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
5358 | |||
5359 | rcu_read_unlock_sched(); | ||
5360 | |||
5361 | if (overflow) | ||
5362 | return notifier_from_errno(-EBUSY); | ||
5363 | } | ||
5364 | return NOTIFY_OK; | 5358 | return NOTIFY_OK; |
5359 | default: | ||
5360 | return NOTIFY_DONE; | ||
5365 | } | 5361 | } |
5366 | |||
5367 | return NOTIFY_DONE; | ||
5368 | } | 5362 | } |
5369 | 5363 | ||
5370 | static int __init migration_init(void) | 5364 | static int __init migration_init(void) |
@@ -5445,17 +5439,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5445 | break; | 5439 | break; |
5446 | } | 5440 | } |
5447 | 5441 | ||
5448 | /* | ||
5449 | * Even though we initialize ->capacity to something semi-sane, | ||
5450 | * we leave capacity_orig unset. This allows us to detect if | ||
5451 | * domain iteration is still funny without causing /0 traps. | ||
5452 | */ | ||
5453 | if (!group->sgc->capacity_orig) { | ||
5454 | printk(KERN_CONT "\n"); | ||
5455 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
5456 | break; | ||
5457 | } | ||
5458 | |||
5459 | if (!cpumask_weight(sched_group_cpus(group))) { | 5442 | if (!cpumask_weight(sched_group_cpus(group))) { |
5460 | printk(KERN_CONT "\n"); | 5443 | printk(KERN_CONT "\n"); |
5461 | printk(KERN_ERR "ERROR: empty group\n"); | 5444 | printk(KERN_ERR "ERROR: empty group\n"); |
@@ -5939,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5939 | * die on a /0 trap. | 5922 | * die on a /0 trap. |
5940 | */ | 5923 | */ |
5941 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
5942 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
5943 | 5925 | ||
5944 | /* | 5926 | /* |
5945 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the |
@@ -6250,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
6250 | */ | 6232 | */ |
6251 | 6233 | ||
6252 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
6235 | sd->flags |= SD_PREFER_SIBLING; | ||
6253 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; |
6254 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ |
6255 | 6238 | ||
@@ -7015,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
7015 | */ | 6998 | */ |
7016 | 6999 | ||
7017 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: |
7018 | case CPU_DOWN_FAILED: | ||
7019 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); |
7020 | break; | 7002 | break; |
7021 | default: | 7003 | default: |
@@ -7027,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
7027 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, |
7028 | void *hcpu) | 7010 | void *hcpu) |
7029 | { | 7011 | { |
7030 | switch (action) { | 7012 | unsigned long flags; |
7013 | long cpu = (long)hcpu; | ||
7014 | struct dl_bw *dl_b; | ||
7015 | |||
7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
7031 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: |
7018 | /* explicitly allow suspend */ | ||
7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
7020 | bool overflow; | ||
7021 | int cpus; | ||
7022 | |||
7023 | rcu_read_lock_sched(); | ||
7024 | dl_b = dl_bw_of(cpu); | ||
7025 | |||
7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
7027 | cpus = dl_bw_cpus(cpu); | ||
7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
7030 | |||
7031 | rcu_read_unlock_sched(); | ||
7032 | |||
7033 | if (overflow) | ||
7034 | return notifier_from_errno(-EBUSY); | ||
7035 | } | ||
7032 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); |
7033 | break; | 7037 | break; |
7034 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: |
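The DOWN_PREPARE path now refuses (outside of suspend) to take a CPU offline when the remaining CPUs could no longer carry the admitted SCHED_DEADLINE bandwidth. The check below is modeled on __dl_overflow() but simplified: the real helper also accounts for bandwidth being added or removed in the same operation. The 1<<20 bandwidth unit and the 95% per-CPU limit are assumptions used only to make the numbers concrete.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Bandwidths as fixed-point fractions of a CPU; 1 CPU == 1 << 20. */
#define BW_UNIT		(1ULL << 20)

/*
 * Sketch of the admission test used before dropping a CPU:
 * with "cpus_left" CPUs remaining, does "total_bw" still fit?
 */
static bool dl_overflow_after_removal(uint64_t total_bw, uint64_t bw_per_cpu,
				      int cpus_left)
{
	return total_bw > (uint64_t)cpus_left * bw_per_cpu;
}

int main(void)
{
	uint64_t total_bw = 3 * BW_UNIT / 2;	/* 1.5 CPUs of admitted DL load */
	uint64_t cap = 95 * BW_UNIT / 100;	/* example 95% rt/dl limit per CPU */

	printf("4 -> 3 CPUs: overflow=%d\n",
	       dl_overflow_after_removal(total_bw, cap, 3));	/* 0: still fits */
	printf("2 -> 1 CPU : overflow=%d\n",
	       dl_overflow_after_removal(total_bw, cap, 1));	/* 1: -EBUSY */
	return 0;
}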
@@ -7173,8 +7177,8 @@ void __init sched_init(void) | |||
7173 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; |
7174 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; |
7175 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); |
7176 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); |
7177 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); |
7178 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7179 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
7180 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
@@ -7214,7 +7218,7 @@ void __init sched_init(void) | |||
7214 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP |
7215 | rq->sd = NULL; | 7219 | rq->sd = NULL; |
7216 | rq->rd = NULL; | 7220 | rq->rd = NULL; |
7217 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; |
7218 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; |
7219 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; |
7220 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; |
@@ -7813,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
7813 | } | 7817 | } |
7814 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7815 | 7819 | ||
7816 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) |
7817 | { | 7821 | { |
7818 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); |
7819 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); |
@@ -7914,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7914 | if (ret) | 7918 | if (ret) |
7915 | goto undo; | 7919 | goto undo; |
7916 | 7920 | ||
7917 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); |
7918 | if (ret) | 7922 | if (ret) |
7919 | goto undo; | 7923 | goto undo; |
7920 | 7924 | ||
7921 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); |
7922 | if (ret) | 7926 | if (ret) |
7923 | goto undo; | 7927 | goto undo; |
7924 | 7928 | ||
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6d9403..5e95145088fd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b)
69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; |
70 | } | 70 | } |
71 | 71 | ||
72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) |
73 | { | 73 | { |
74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; |
75 | 75 | ||
@@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); |
219 | } | 219 | } |
220 | 220 | ||
221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
222 | |||
223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
224 | { | ||
225 | struct rq *later_rq = NULL; | ||
226 | bool fallback = false; | ||
227 | |||
228 | later_rq = find_lock_later_rq(p, rq); | ||
229 | |||
230 | if (!later_rq) { | ||
231 | int cpu; | ||
232 | |||
233 | /* | ||
234 | * If we cannot preempt any rq, fall back to pick any | ||
235 | * online cpu. | ||
236 | */ | ||
237 | fallback = true; | ||
238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
239 | if (cpu >= nr_cpu_ids) { | ||
240 | /* | ||
241 | * Fail to find any suitable cpu. | ||
242 | * The task will never come back! | ||
243 | */ | ||
244 | BUG_ON(dl_bandwidth_enabled()); | ||
245 | |||
246 | /* | ||
247 | * If admission control is disabled we | ||
248 | * try a little harder to let the task | ||
249 | * run. | ||
250 | */ | ||
251 | cpu = cpumask_any(cpu_active_mask); | ||
252 | } | ||
253 | later_rq = cpu_rq(cpu); | ||
254 | double_lock_balance(rq, later_rq); | ||
255 | } | ||
256 | |||
257 | deactivate_task(rq, p, 0); | ||
258 | set_task_cpu(p, later_rq->cpu); | ||
259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
260 | |||
261 | if (!fallback) | ||
262 | resched_curr(later_rq); | ||
263 | |||
264 | double_unlock_balance(rq, later_rq); | ||
265 | } | ||
266 | |||
221 | #else | 267 | #else |
222 | 268 | ||
223 | static inline | 269 | static inline |
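dl_task_offline_migration() first tries to find a later_rq it can preempt; failing that it falls back to any still-active CPU the task is allowed on, and only if even that intersection is empty (which the BUG_ON rules out while admission control is enabled) does it settle for an arbitrary active CPU. The sketch below models that selection order with plain bitmasks instead of cpumask_t; the CPU sets in main() are made up.

#include <stdio.h>

/* Return the lowest set bit's index, or -1 if the mask is empty. */
static int first_cpu(unsigned int mask)
{
	for (int cpu = 0; cpu < 32; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return -1;
}

static int pick_fallback_cpu(unsigned int active_mask, unsigned int allowed_mask)
{
	int cpu = first_cpu(active_mask & allowed_mask);	/* preferred: allowed & online */

	if (cpu < 0)
		cpu = first_cpu(active_mask);			/* last resort: any online CPU */
	return cpu;
}

int main(void)
{
	/* CPUs 0-3 exist; CPU2 is going down; the task is allowed on {2,3}. */
	printf("fallback = %d\n", pick_fallback_cpu(0xB /* {0,1,3} */, 0xC /* {2,3} */));	/* 3 */
	/* task only allowed on the dying CPU: pick any active one */
	printf("fallback = %d\n", pick_fallback_cpu(0xB, 0x4 /* {2} */));			/* 0 */
	return 0;
}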
@@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
514 | unsigned long flags; | 560 | unsigned long flags; |
515 | struct rq *rq; | 561 | struct rq *rq; |
516 | 562 | ||
517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); |
518 | 564 | ||
519 | /* | 565 | /* |
520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: |
@@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
536 | sched_clock_tick(); | 582 | sched_clock_tick(); |
537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); |
538 | 584 | ||
585 | #ifdef CONFIG_SMP | ||
586 | /* | ||
587 | * If we find that the rq the task was on is no longer | ||
588 | * available, we need to select a new rq. | ||
589 | */ | ||
590 | if (unlikely(!rq->online)) { | ||
591 | dl_task_offline_migration(rq, p); | ||
592 | goto unlock; | ||
593 | } | ||
594 | #endif | ||
595 | |||
539 | /* | 596 | /* |
540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: |
541 | * | 598 | * |
@@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
569 | push_dl_task(rq); | 626 | push_dl_task(rq); |
570 | #endif | 627 | #endif |
571 | unlock: | 628 | unlock: |
572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); |
573 | 630 | ||
574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; |
575 | } | 632 | } |
@@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
914 | } | 971 | } |
915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); |
916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); |
974 | /* | ||
975 | * Tell update_rq_clock() that we've just updated, | ||
976 | * so we don't do microscopic update in schedule() | ||
977 | * and double the fastpath cost. | ||
978 | */ | ||
979 | rq_clock_skip_update(rq, true); | ||
917 | } | 980 | } |
918 | 981 | ||
919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP |
@@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1659 | { | 1722 | { |
1660 | int check_resched = 1; | 1723 | int check_resched = 1; |
1661 | 1724 | ||
1662 | /* | ||
1663 | * If p is throttled, don't consider the possibility | ||
1664 | * of preempting rq->curr, the check will be done right | ||
1665 | * after its runtime will get replenished. | ||
1666 | */ | ||
1667 | if (unlikely(p->dl.dl_throttled)) | ||
1668 | return; | ||
1669 | |||
1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { |
1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP |
1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8baaf858d25c..a245c1fc6f0a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
71 | if (!se) { | 71 | if (!se) { |
72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; |
73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); |
74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); |
75 | return; | 75 | return; |
76 | } | 76 | } |
77 | 77 | ||
@@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
94 | P(se->load.weight); | 94 | P(se->load.weight); |
95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP |
96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); |
97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); |
98 | P(se->avg.avg_period); | ||
98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); |
100 | P(se->avg.utilization_avg_contrib); | ||
99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); |
100 | #endif | 102 | #endif |
101 | #undef PN | 103 | #undef PN |
@@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); |
215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", |
216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); |
219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
220 | cfs_rq->utilization_load_avg); | ||
217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED |
218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", |
219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); |
@@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
636 | P(se.load.weight); | 640 | P(se.load.weight); |
637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP |
638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); |
639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); |
644 | P(se.avg.avg_period); | ||
640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); |
646 | P(se.avg.utilization_avg_contrib); | ||
641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); |
642 | #endif | 648 | #endif |
643 | P(policy); | 649 | P(policy); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 241213be507c..ffeaa4105e48 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); |
671 | 671 | ||
672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
673 | 674 | ||
674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ |
675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) |
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
677 | u32 slice; | 678 | u32 slice; |
678 | 679 | ||
679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; |
680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; |
681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; |
682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); |
684 | __update_task_entity_utilization(&p->se); | ||
683 | } | 685 | } |
684 | #else | 686 | #else |
685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) |
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, |
1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) |
1198 | { | 1200 | { |
1199 | long imb, old_imb; | ||
1200 | long orig_src_load, orig_dst_load; | ||
1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; |
1202 | long orig_src_load; | ||
1203 | long load_a, load_b; | ||
1204 | long moved_load; | ||
1205 | long imb; | ||
1202 | 1206 | ||
1203 | /* | 1207 | /* |
1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. |
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; |
1212 | 1216 | ||
1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ |
1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; |
1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; |
1220 | if (load_a < load_b) | ||
1221 | swap(load_a, load_b); | ||
1216 | 1222 | ||
1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ |
1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - |
1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; |
1220 | if (imb <= 0) | 1226 | if (imb <= 0) |
1221 | return false; | 1227 | return false; |
1222 | 1228 | ||
1223 | /* | 1229 | /* |
1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. |
1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, |
1232 | * without moving things past the point of balance. | ||
1226 | */ | 1233 | */ |
1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; |
1228 | orig_dst_load = env->dst_stats.load; | ||
1229 | 1235 | ||
1230 | if (orig_dst_load < orig_src_load) | 1236 | /* |
1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, |
1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. |
1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced |
1241 | * situation, without crossing over the balance point. | ||
1242 | */ | ||
1243 | moved_load = orig_src_load - src_load; | ||
1235 | 1244 | ||
1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) |
1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ |
1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
1248 | else | ||
1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
1238 | } | 1251 | } |
1239 | 1252 | ||
1240 | /* | 1253 | /* |
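The rewritten load_too_imbalanced() first asks whether the capacity-corrected difference between the nodes exceeds imbalance_pct, and if so only permits moves that head toward balance without overshooting it: for a net src-to-dst move the cross-multiplied test is src_load * dst_capacity < dst_load * src_capacity after the move. The standalone sketch below reuses the patch's field names, but the numa_env struct, the imbalance_pct of 125, and the loads in main() are stand-ins chosen for illustration.

#include <stdio.h>
#include <stdbool.h>

struct numa_env {
	long src_capacity, dst_capacity;
	long orig_src_load;		/* src node load before the proposed move */
	int imbalance_pct;		/* e.g. 125 == allow ~25% slack */
};

static bool load_too_imbalanced(long src_load, long dst_load, struct numa_env *env)
{
	long load_a = dst_load, load_b = src_load;
	long moved_load;

	/* slope of the imbalance, capacity-corrected, direction ignored */
	if (load_a < load_b) {
		long tmp = load_a; load_a = load_b; load_b = tmp;
	}
	if (load_a * env->src_capacity * 100 -
	    load_b * env->dst_capacity * env->imbalance_pct <= 0)
		return false;		/* difference small enough, allow the move */

	/* above threshold: only allow moves toward balance that don't overshoot */
	moved_load = env->orig_src_load - src_load;
	if (moved_load > 0)	/* net move src -> dst */
		return src_load * env->dst_capacity < dst_load * env->src_capacity;
	else			/* net move dst -> src */
		return dst_load * env->src_capacity < src_load * env->dst_capacity;
}

int main(void)
{
	struct numa_env env = { 1024, 1024, 1000, 125 };

	/* moving 200 from a 1000/600 split gives 800/800: allowed */
	printf("%d\n", load_too_imbalanced(800, 800, &env));	/* 0 */
	/* moving 600 gives 400/1200: overshoots the balance point */
	printf("%d\n", load_too_imbalanced(400, 1200, &env));	/* 1 */
	return 0;
}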
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
1675 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; |
1676 | } else { | 1689 | } else { |
1677 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; |
1678 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; |
1679 | } | 1692 | } |
1680 | 1693 | ||
1681 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; |
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
1765 | } | 1778 | } |
1766 | } | 1779 | } |
1767 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ |
1781 | if (!max_faults) | ||
1782 | break; | ||
1768 | nodes = max_group; | 1783 | nodes = max_group; |
1769 | } | 1784 | } |
1770 | return nid; | 1785 | return nid; |
@@ -2503,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2503 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) |
2504 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2505 | */ | 2520 | */ |
2506 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, |
2507 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, |
2508 | int runnable) | 2523 | int runnable, |
2524 | int running) | ||
2509 | { | 2525 | { |
2510 | u64 delta, periods; | 2526 | u64 delta, periods; |
2511 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; |
2512 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; |
2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2513 | 2530 | ||
2514 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; |
2515 | /* | 2532 | /* |
@@ -2531,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2531 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; |
2532 | 2549 | ||
2533 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ |
2534 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; |
2535 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { |
2536 | /* period roll-over */ | 2553 | /* period roll-over */ |
2537 | decayed = 1; | 2554 | decayed = 1; |
@@ -2544,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2544 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; |
2545 | if (runnable) | 2562 | if (runnable) |
2546 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; |
2547 | sa->runnable_avg_period += delta_w; | 2564 | if (running) |
2565 | sa->running_avg_sum += delta_w * scale_freq | ||
2566 | >> SCHED_CAPACITY_SHIFT; | ||
2567 | sa->avg_period += delta_w; | ||
2548 | 2568 | ||
2549 | delta -= delta_w; | 2569 | delta -= delta_w; |
2550 | 2570 | ||
@@ -2554,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
2554 | 2574 | ||
2555 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, |
2556 | periods + 1); | 2576 | periods + 1); |
2557 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, |
2578 | periods + 1); | ||
2579 | sa->avg_period = decay_load(sa->avg_period, | ||
2558 | periods + 1); | 2580 | periods + 1); |
2559 | 2581 | ||
2560 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2561 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); |
2562 | if (runnable) | 2584 | if (runnable) |
2563 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; |
2564 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) |
2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
2588 | >> SCHED_CAPACITY_SHIFT; | ||
2589 | sa->avg_period += runnable_contrib; | ||
2565 | } | 2590 | } |
2566 | 2591 | ||
2567 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ |
2568 | if (runnable) | 2593 | if (runnable) |
2569 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; |
2570 | sa->runnable_avg_period += delta; | 2595 | if (running) |
2596 | sa->running_avg_sum += delta * scale_freq | ||
2597 | >> SCHED_CAPACITY_SHIFT; | ||
2598 | sa->avg_period += delta; | ||
2571 | 2599 | ||
2572 | return decayed; | 2600 | return decayed; |
2573 | } | 2601 | } |
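__update_entity_runnable_avg() accumulates three sums over 1024 us segments, decaying older contributions by y per segment with y^32 = 1/2, and the new running sum is additionally scaled by the frequency capacity (the arch_scale_freq_capacity() value shifted by SCHED_CAPACITY_SHIFT = 10). The numeric sketch below reproduces the same geometric series with pow() instead of the kernel's lookup tables; compile with -lm. The 345-period cutoff and the 80% capacity figure are arbitrary example values.

#include <stdio.h>
#include <math.h>

#define PERIOD_US	1024.0
#define HALFLIFE	32.0		/* y^32 == 1/2 */

/*
 * Decayed average after n consecutive periods spent in a given state:
 * sum_{i=0..n-1} 1024 * y^i, with y = 2^(-1/32); converges near the
 * kernel's LOAD_AVG_MAX of 47742.
 */
static double pelt_sum(int periods_in_state)
{
	double y = pow(0.5, 1.0 / HALFLIFE);
	double sum = 0.0;

	for (int i = 0; i < periods_in_state; i++)
		sum += PERIOD_US * pow(y, i);
	return sum;
}

int main(void)
{
	double full = pelt_sum(345);	/* effectively the converged maximum */

	printf("converged sum              ~ %.0f\n", full);
	printf("32 busy periods            ~ %.0f (half of the maximum)\n",
	       pelt_sum(32));
	printf("freq-scaled running period : a segment at 80%% capacity counts as %.0f us\n",
	       PERIOD_US * 819.0 / 1024.0);	/* the >> SCHED_CAPACITY_SHIFT scaling */
	return 0;
}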
@@ -2584,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
2584 | return 0; | 2612 | return 0; |
2585 | 2613 | ||
2586 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); |
2615 | se->avg.utilization_avg_contrib = | ||
2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
2587 | 2617 | ||
2588 | return decays; | 2618 | return decays; |
2589 | } | 2619 | } |
@@ -2619,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
2619 | 2649 | ||
2620 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ |
2621 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, |
2622 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); |
2623 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; |
2624 | 2654 | ||
2625 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { |
@@ -2672,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
2672 | 2702 | ||
2673 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
2674 | { | 2704 | { |
2675 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, |
2706 | runnable, runnable); | ||
2676 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); |
2677 | } | 2708 | } |
2678 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -2690,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
2690 | 2721 | ||
2691 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ |
2692 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); |
2693 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); |
2694 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); |
2695 | } | 2726 | } |
2696 | 2727 | ||
@@ -2709,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
2709 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; |
2710 | } | 2741 | } |
2711 | 2742 | ||
2743 | |||
2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
2745 | { | ||
2746 | u32 contrib; | ||
2747 | |||
2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
2750 | contrib /= (se->avg.avg_period + 1); | ||
2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
2752 | } | ||
2753 | |||
2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
2755 | { | ||
2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
2757 | |||
2758 | if (entity_is_task(se)) | ||
2759 | __update_task_entity_utilization(se); | ||
2760 | else | ||
2761 | se->avg.utilization_avg_contrib = | ||
2762 | group_cfs_rq(se)->utilization_load_avg; | ||
2763 | |||
2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
2765 | } | ||
2766 | |||
2712 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, |
2713 | long load_contrib) | 2768 | long load_contrib) |
2714 | { | 2769 | { |
@@ -2725,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2725 | int update_cfs_rq) | 2780 | int update_cfs_rq) |
2726 | { | 2781 | { |
2727 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2728 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; |
2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2729 | u64 now; | 2785 | u64 now; |
2730 | 2786 | ||
2731 | /* | 2787 | /* |
@@ -2737,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
2737 | else | 2793 | else |
2738 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); |
2739 | 2795 | ||
2740 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, |
2797 | cfs_rq->curr == se)) | ||
2741 | return; | 2798 | return; |
2742 | 2799 | ||
2743 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); |
2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
2744 | 2802 | ||
2745 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) |
2746 | return; | 2804 | return; |
2747 | 2805 | ||
2748 | if (se->on_rq) | 2806 | if (se->on_rq) { |
2749 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; |
2750 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; |
2809 | } else { | ||
2751 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); |
2811 | } | ||
2752 | } | 2812 | } |
2753 | 2813 | ||
2754 | /* | 2814 | /* |
@@ -2823,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2823 | } | 2883 | } |
2824 | 2884 | ||
2825 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
2826 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ |
2827 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); |
2828 | } | 2889 | } |
@@ -2841,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
2841 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); |
2842 | 2903 | ||
2843 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
2844 | if (sleep) { | 2906 | if (sleep) { |
2845 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; |
2846 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
@@ -3178,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3178 | */ | 3240 | */ |
3179 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); |
3180 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); |
3243 | update_entity_load_avg(se, 1); | ||
3181 | } | 3244 | } |
3182 | 3245 | ||
3183 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); |
@@ -4304,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
4304 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; |
4305 | } | 4368 | } |
4306 | 4369 | ||
4370 | static unsigned long capacity_orig_of(int cpu) | ||
4371 | { | ||
4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
4373 | } | ||
4374 | |||
4307 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) |
4308 | { | 4376 | { |
4309 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); |
@@ -4717,6 +4785,33 @@ next: | |||
4717 | done: | 4785 | done: |
4718 | return target; | 4786 | return target; |
4719 | } | 4787 | } |
4788 | /* | ||
4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
4790 | * tasks. The unit of the return value must be the one of capacity so we can | ||
4791 | * compare the usage with the capacity of the CPU that is available for CFS | ||
4792 | * task (ie cpu_capacity). | ||
4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
4799 | * after migrating tasks until the average stabilizes with the new running | ||
4800 | * time. So we need to check that the usage stays into the range | ||
4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
4804 | */ | ||
4805 | static int get_cpu_usage(int cpu) | ||
4806 | { | ||
4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
4809 | |||
4810 | if (usage >= SCHED_LOAD_SCALE) | ||
4811 | return capacity; | ||
4812 | |||
4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
4814 | } | ||
4720 | 4815 | ||
4721 | /* | 4816 | /* |
4722 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
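get_cpu_usage() converts the cfs.utilization_load_avg signal (range [0..SCHED_LOAD_SCALE]) into capacity units and caps it at the CPU's original capacity, since rounding in avg_period or freshly migrated tasks can make the raw signal overshoot. A small arithmetic sketch, assuming the usual SCHED_LOAD_SCALE/SHIFT of 1024/10 and a made-up little-CPU capacity of 430:

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

static unsigned long get_cpu_usage(unsigned long utilization_load_avg,
				   unsigned long capacity_orig)
{
	if (utilization_load_avg >= SCHED_LOAD_SCALE)
		return capacity_orig;		/* cap transient overshoot */

	return (utilization_load_avg * capacity_orig) >> SCHED_LOAD_SHIFT;
}

int main(void)
{
	/* a little CPU with 430/1024 of the big CPU's capacity */
	printf("50%% busy little CPU -> usage %lu\n", get_cpu_usage(512, 430));	/* ~215 */
	printf("overshooting signal -> usage %lu\n", get_cpu_usage(1240, 430));	/* 430 */
	return 0;
}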
@@ -5843,12 +5938,12 @@ struct sg_lb_stats { | |||
5843 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5844 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; |
5845 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; |
5941 | unsigned long group_usage; /* Total usage of the group */ | ||
5846 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5847 | unsigned int group_capacity_factor; | ||
5848 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; |
5849 | unsigned int group_weight; | 5944 | unsigned int group_weight; |
5850 | enum group_type group_type; | 5945 | enum group_type group_type; |
5851 | int group_has_free_capacity; | 5946 | int group_no_capacity; |
5852 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING |
5853 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; |
5854 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; |
@@ -5919,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
5919 | return load_idx; | 6014 | return load_idx; |
5920 | } | 6015 | } |
5921 | 6016 | ||
5922 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
5923 | { | ||
5924 | return SCHED_CAPACITY_SCALE; | ||
5925 | } | ||
5926 | |||
5927 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
5928 | { | ||
5929 | return default_scale_capacity(sd, cpu); | ||
5930 | } | ||
5931 | |||
5932 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
5933 | { | 6018 | { |
5934 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) |
@@ -5945,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
5945 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) |
5946 | { | 6031 | { |
5947 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); |
5948 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; |
5949 | s64 delta; | 6034 | s64 delta; |
5950 | 6035 | ||
5951 | /* | 6036 | /* |
@@ -5961,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
5961 | 6046 | ||
5962 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; |
5963 | 6048 | ||
5964 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); |
5965 | /* Ensures that capacity won't end up being negative */ | ||
5966 | available = 0; | ||
5967 | } else { | ||
5968 | available = total - avg; | ||
5969 | } | ||
5970 | 6050 | ||
5971 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) |
5972 | total = SCHED_CAPACITY_SCALE; | 6052 | return SCHED_CAPACITY_SCALE - used; |
5973 | 6053 | ||
5974 | total >>= SCHED_CAPACITY_SHIFT; | 6054 | return 1; |
5975 | |||
5976 | return div_u64(available, total); | ||
5977 | } | 6055 | } |
5978 | 6056 | ||
5979 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
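The simplified scale_rt_capacity() returns the fraction of SCHED_CAPACITY_SCALE left over after RT/IRQ time, with a floor of 1 so later divisions never see zero, and update_cpu_capacity() multiplies the architecture-scaled capacity by that fraction. The sketch below treats "used" as a value already normalized to [0..1024], which is an assumption about how rq->rt_avg relates to the averaging window rather than a copy of the kernel arithmetic.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

/* Fraction of the CPU left for CFS after RT/IRQ pressure; never 0. */
static unsigned long scale_rt_capacity(unsigned long used)
{
	if (used < SCHED_CAPACITY_SCALE)
		return SCHED_CAPACITY_SCALE - used;
	return 1;
}

static unsigned long cpu_capacity_after_rt(unsigned long capacity_orig,
					   unsigned long rt_used)
{
	unsigned long capacity = capacity_orig;

	capacity *= scale_rt_capacity(rt_used);
	capacity >>= SCHED_CAPACITY_SHIFT;
	if (!capacity)
		capacity = 1;
	return capacity;
}

int main(void)
{
	/* 25% of a full-capacity CPU eaten by RT work */
	printf("capacity = %lu\n", cpu_capacity_after_rt(1024, 256));	/* 768 */
	/* RT hog: capacity collapses but never reaches zero */
	printf("capacity = %lu\n", cpu_capacity_after_rt(1024, 1024));	/* 1 */
	return 0;
}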
@@ -5988,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
5988 | 6066 | ||
5989 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; |
5990 | 6068 | ||
5991 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
5992 | |||
5993 | if (sched_feat(ARCH_CAPACITY)) | ||
5994 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
5995 | else | ||
5996 | capacity *= default_scale_capacity(sd, cpu); | ||
5997 | |||
5998 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
5999 | 6070 | ||
6000 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); |
6001 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; |
@@ -6011,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6011 | { | 6082 | { |
6012 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; |
6013 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; |
6014 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; |
6015 | unsigned long interval; | 6086 | unsigned long interval; |
6016 | 6087 | ||
6017 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -6023,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6023 | return; | 6094 | return; |
6024 | } | 6095 | } |
6025 | 6096 | ||
6026 | capacity_orig = capacity = 0; | 6097 | capacity = 0; |
6027 | 6098 | ||
6028 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { |
6029 | /* | 6100 | /* |
@@ -6043,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6043 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains |
6044 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). |
6045 | * | 6116 | * |
6046 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and |
6047 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. |
6048 | * | ||
6049 | * Runtime updates will correct capacity_orig. | ||
6050 | */ | 6119 | */ |
6051 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { |
6052 | capacity_orig += capacity_of(cpu); | ||
6053 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); |
6054 | continue; | 6122 | continue; |
6055 | } | 6123 | } |
6056 | 6124 | ||
6057 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; |
6058 | capacity_orig += sgc->capacity_orig; | ||
6059 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; |
6060 | } | 6127 | } |
6061 | } else { | 6128 | } else { |
@@ -6066,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6066 | 6133 | ||
6067 | group = child->groups; | 6134 | group = child->groups; |
6068 | do { | 6135 | do { |
6069 | capacity_orig += group->sgc->capacity_orig; | ||
6070 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; |
6071 | group = group->next; | 6137 | group = group->next; |
6072 | } while (group != child->groups); | 6138 | } while (group != child->groups); |
6073 | } | 6139 | } |
6074 | 6140 | ||
6075 | sdg->sgc->capacity_orig = capacity_orig; | ||
6076 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; |
6077 | } | 6142 | } |
6078 | 6143 | ||
6079 | /* | 6144 | /* |
6080 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side |
6081 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. |
6082 | * which on its own isn't powerful enough. | 6147 | * Return true is the capacity is reduced |
6083 | * | ||
6084 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
6085 | */ | 6148 | */ |
6086 | static inline int | 6149 | static inline int |
6087 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
6088 | { | 6151 | { |
6089 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < |
6090 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); |
6091 | */ | ||
6092 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
6093 | return 0; | ||
6094 | |||
6095 | /* | ||
6096 | * If ~90% of the cpu_capacity is still there, we're good. | ||
6097 | */ | ||
6098 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
6099 | return 1; | ||
6100 | |||
6101 | return 0; | ||
6102 | } | 6154 | } |
6103 | 6155 | ||
6104 | /* | 6156 | /* |
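fix_small_capacity() gives way to check_cpu_capacity(), a plain threshold: a runqueue's capacity is considered noticeably reduced once capacity * imbalance_pct < capacity_orig * 100. Worked numbers below; the imbalance_pct of 117 is just an example in the range typical domain values fall into.

#include <stdio.h>
#include <stdbool.h>

/* true when side activity has eaten more than the imbalance_pct margin */
static bool check_cpu_capacity(unsigned long capacity, unsigned long capacity_orig,
			       unsigned int imbalance_pct)
{
	return capacity * imbalance_pct < capacity_orig * 100;
}

int main(void)
{
	printf("%d\n", check_cpu_capacity(950, 1024, 117));	/* 0: ~93% left, fine */
	printf("%d\n", check_cpu_capacity(700, 1024, 117));	/* 1: noticeably reduced */
	return 0;
}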
@@ -6136,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
6136 | } | 6188 | } |
6137 | 6189 | ||
6138 | /* | 6190 | /* |
6139 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could |
6140 | * | 6192 | * be used by some tasks. |
6141 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the * number of task is |
6142 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available |
6143 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. |
6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
6197 | * account the variance of the tasks' load and to return true if the available | ||
6198 | * capacity in meaningful for the load balancer. | ||
6199 | * As an example, an available capacity of 1% can appear but it doesn't make | ||
6200 | * any benefit for the load balance. | ||
6144 | */ | 6201 | */ |
6145 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool |
6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6146 | { | 6204 | { |
6147 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) |
6148 | unsigned int capacity, capacity_orig; | 6206 | return true; |
6149 | 6207 | ||
6150 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > |
6151 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) |
6152 | cpus = group->group_weight; | 6210 | return true; |
6153 | 6211 | ||
6154 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6212 | return false; |
6155 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6213 | } |
6156 | capacity_factor = cpus / smt; /* cores */ | 6214 | |
6215 | /* | ||
6216 | * group_is_overloaded returns true if the group has more tasks than it can | ||
6217 | * handle. | ||
6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
6221 | * false. | ||
6222 | */ | ||
6223 | static inline bool | ||
6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
6225 | { | ||
6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
6227 | return false; | ||
6157 | 6228 | ||
6158 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < |
6159 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) |
6160 | if (!capacity_factor) | 6231 | return true; |
6161 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6162 | 6232 | ||
6163 | return capacity_factor; | 6233 | return false; |
6164 | } | 6234 | } |
6165 | 6235 | ||
6166 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, |
6167 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, |
6238 | struct sg_lb_stats *sgs) | ||
6168 | { | 6239 | { |
6169 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) |
6170 | return group_overloaded; | 6241 | return group_overloaded; |
6171 | 6242 | ||
6172 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) |
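The capacity_factor heuristics are replaced by two predicates over the same statistics: a group has spare capacity if it runs fewer tasks than CPUs or if its usage sits comfortably below its capacity (imbalance_pct acting as hysteresis), and it is overloaded only if it runs more tasks than CPUs and usage exceeds capacity by that same margin, so a group with exactly as many tasks as CPUs can be neither. The sketch below mirrors the relevant sg_lb_stats fields; the struct layout, the imbalance_pct of 117, and the sample numbers are stand-ins for illustration.

#include <stdio.h>
#include <stdbool.h>

struct sg_stats {
	unsigned long group_capacity;	/* capacity units, ~1024 per unloaded CPU */
	unsigned long group_usage;	/* summed get_cpu_usage() of the group */
	unsigned int sum_nr_running;
	unsigned int group_weight;	/* number of CPUs in the group */
};

static bool group_has_capacity(const struct sg_stats *s, unsigned int imbalance_pct)
{
	if (s->sum_nr_running < s->group_weight)
		return true;
	return s->group_capacity * 100 > s->group_usage * imbalance_pct;
}

static bool group_is_overloaded(const struct sg_stats *s, unsigned int imbalance_pct)
{
	if (s->sum_nr_running <= s->group_weight)
		return false;
	return s->group_capacity * 100 < s->group_usage * imbalance_pct;
}

int main(void)
{
	/* 2 CPUs, 2 tasks, usage close to capacity: the in-between state */
	struct sg_stats s = { 2048, 1900, 2, 2 };
	unsigned int pct = 117;

	printf("has_capacity=%d overloaded=%d\n",
	       group_has_capacity(&s, pct), group_is_overloaded(&s, pct));	/* 0 0 */

	s.sum_nr_running = 4;
	s.group_usage = 2048;
	printf("has_capacity=%d overloaded=%d\n",
	       group_has_capacity(&s, pct), group_is_overloaded(&s, pct));	/* 0 1 */
	return 0;
}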
@@ -6204,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6204 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); |
6205 | 6276 | ||
6206 | sgs->group_load += load; | 6277 | sgs->group_load += load; |
6278 | sgs->group_usage += get_cpu_usage(i); | ||
6207 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6208 | 6280 | ||
6209 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) |
@@ -6226,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6226 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
6227 | 6299 | ||
6228 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; |
6229 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
6230 | sgs->group_type = group_classify(group, sgs); | ||
6231 | 6301 | ||
6232 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
6233 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); |
6234 | } | 6304 | } |
6235 | 6305 | ||
6236 | /** | 6306 | /** |
@@ -6352,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6352 | 6422 | ||
6353 | /* | 6423 | /* |
6354 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings |
6355 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try |
6356 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity |
6357 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit |
6358 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where |
6359 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already |
6360 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible with a large weight task outweighs |
6361 | * with a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). |
6362 | */ | 6432 | */ |
6363 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && |
6364 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && |
6365 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { |
6366 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; |
6437 | sgs->group_type = group_overloaded; | ||
6367 | } | 6438 | } |
6368 | 6439 | ||
6369 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
@@ -6543,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6543 | */ | 6614 | */ |
6544 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && |
6545 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { |
6546 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * |
6547 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; |
6548 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
6549 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; |
6550 | load_above_capacity /= busiest->group_capacity; | 6621 | else |
6622 | load_above_capacity = ~0UL; | ||
6551 | } | 6623 | } |
6552 | 6624 | ||
6553 | /* | 6625 | /* |
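With capacity_factor gone, calculate_imbalance() expresses the overload surplus directly in load units: the busiest group's task count scaled by SCHED_LOAD_SCALE minus its capacity, saturated to ~0UL when there is no surplus. A rough stand-alone illustration, assuming SCHED_LOAD_SCALE is 1024 as in the stock headers and using made-up group numbers:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL   /* assumed, as in the stock kernel headers */

    int main(void)
    {
        unsigned long sum_nr_running = 6;      /* tasks in the busiest group */
        unsigned long group_capacity = 4096;   /* roughly four full CPUs */
        unsigned long load_above_capacity;

        load_above_capacity = sum_nr_running * SCHED_LOAD_SCALE;   /* 6144 */
        if (load_above_capacity > group_capacity)
            load_above_capacity -= group_capacity;                 /* 2048 */
        else
            load_above_capacity = ~0UL;   /* saturate: no surplus to cap on */

        printf("load_above_capacity=%lu\n", load_above_capacity);
        return 0;
    }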
@@ -6610,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6610 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; |
6611 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; |
6612 | 6684 | ||
6685 | /* ASYM feature bypasses nice load balance check */ | ||
6613 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
6614 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) |
6615 | return sds.busiest; | 6688 | return sds.busiest; |
@@ -6630,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6630 | goto force_balance; | 6703 | goto force_balance; |
6631 | 6704 | ||
6632 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
6633 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && |
6634 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) |
6635 | goto force_balance; | 6708 | goto force_balance; |
6636 | 6709 | ||
6637 | /* | 6710 | /* |
@@ -6690,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6690 | int i; | 6763 | int i; |
6691 | 6764 | ||
6692 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
6693 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; |
6694 | enum fbq_type rt; | 6767 | enum fbq_type rt; |
6695 | 6768 | ||
6696 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); |
@@ -6719,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6719 | continue; | 6792 | continue; |
6720 | 6793 | ||
6721 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); |
6722 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
6723 | if (!capacity_factor) | ||
6724 | capacity_factor = fix_small_capacity(env->sd, group); | ||
6725 | 6795 | ||
6726 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); |
6727 | 6797 | ||
@@ -6729,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6729 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() |
6730 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. |
6731 | */ | 6801 | */ |
6732 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
6804 | !check_cpu_capacity(rq, env->sd)) | ||
6733 | continue; | 6805 | continue; |
6734 | 6806 | ||
6735 | /* | 6807 | /* |
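check_cpu_capacity() is added earlier in this series and is not visible in these hunks; judging from its uses here and in need_active_balance() below, it flags a CPU whose remaining CFS capacity has dropped noticeably below its original capacity because of RT/DL or IRQ pressure. A stand-alone sketch of that kind of test follows; the helper name cpu_capacity_reduced() and the exact comparison are assumptions for illustration, not the kernel's definition:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Sketch of a capacity-reduction test in the spirit of the uses above:
     * the remaining CFS capacity is compared against the original capacity
     * with the sched domain's imbalance_pct as margin.
     */
    static bool cpu_capacity_reduced(unsigned long cpu_capacity,
                                     unsigned long cpu_capacity_orig,
                                     unsigned int imbalance_pct)
    {
        return (cpu_capacity * imbalance_pct) < (cpu_capacity_orig * 100);
    }

    int main(void)
    {
        /* Example: RT tasks and IRQs ate about 20% of a 1024-unit CPU. */
        printf("reduced=%d\n", cpu_capacity_reduced(819, 1024, 115));
        return 0;
    }

find_busiest_queue() then keeps a single-task CPU as a candidate only when such a reduction is detected, instead of relying on the old capacity_factor.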
@@ -6777,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
6777 | return 1; | 6849 | return 1; |
6778 | } | 6850 | } |
6779 | 6851 | ||
6852 | /* | ||
6853 | * The dst_cpu is idle and the src_cpu has only 1 CFS task. | ||
6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
6855 | * because of other sched classes or IRQs, and more capacity stays | ||
6856 | * available on dst_cpu. | ||
6857 | */ | ||
6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
6862 | return 1; | ||
6863 | } | ||
6864 | |||
6780 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
6781 | } | 6866 | } |
6782 | 6867 | ||
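As a quick numeric reading of the new active-balance condition above: with an illustrative imbalance_pct of 115, a source CPU squeezed down to a CFS capacity of 700 and an idle destination CPU at 1024 give 700 * 115 = 80500 on the left and 1024 * 100 = 102400 on the right, so migrating the lone CFS task is allowed; with both CPUs at full capacity the inequality fails and the decision falls back to the nr_balance_failed heuristic.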
@@ -6876,6 +6961,9 @@ redo: | |||
6876 | 6961 | ||
6877 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
6878 | 6963 | ||
6964 | env.src_cpu = busiest->cpu; | ||
6965 | env.src_rq = busiest; | ||
6966 | |||
6879 | ld_moved = 0; | 6967 | ld_moved = 0; |
6880 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { |
6881 | /* | 6969 | /* |
@@ -6885,8 +6973,6 @@ redo: | |||
6885 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. |
6886 | */ | 6974 | */ |
6887 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; |
6888 | env.src_cpu = busiest->cpu; | ||
6889 | env.src_rq = busiest; | ||
6890 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
6891 | 6977 | ||
6892 | more_balance: | 6978 | more_balance: |
@@ -7586,22 +7672,25 @@ end: | |||
7586 | 7672 | ||
7587 | /* | 7673 | /* |
7588 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence |
7589 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. |
7590 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. |
7591 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is |
7592 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. |
7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
7680 | * group has multiple busy cpus. | ||
7593 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
7594 | * domain span are idle. | 7682 | * domain span are idle. |
7595 | */ | 7683 | */ |
7596 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) |
7597 | { | 7685 | { |
7598 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; |
7599 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; |
7600 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; |
7601 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; |
7690 | bool kick = false; | ||
7602 | 7691 | ||
7603 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) |
7604 | return 0; | 7693 | return false; |
7605 | 7694 | ||
7606 | /* | 7695 | /* |
7607 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first |
@@ -7615,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7615 | * balancing. | 7704 | * balancing. |
7616 | */ | 7705 | */ |
7617 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) |
7618 | return 0; | 7707 | return false; |
7619 | 7708 | ||
7620 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) |
7621 | return 0; | 7710 | return false; |
7622 | 7711 | ||
7623 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) |
7624 | goto need_kick; | 7713 | return true; |
7625 | 7714 | ||
7626 | rcu_read_lock(); | 7715 | rcu_read_lock(); |
7627 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
7628 | |||
7629 | if (sd) { | 7717 | if (sd) { |
7630 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; |
7631 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
7632 | 7720 | ||
7633 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { |
7634 | goto need_kick_unlock; | 7722 | kick = true; |
7723 | goto unlock; | ||
7724 | } | ||
7725 | |||
7635 | } | 7726 | } |
7636 | 7727 | ||
7637 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); |
7729 | if (sd) { | ||
7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
7731 | check_cpu_capacity(rq, sd)) { | ||
7732 | kick = true; | ||
7733 | goto unlock; | ||
7734 | } | ||
7735 | } | ||
7638 | 7736 | ||
7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
7639 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, |
7640 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { |
7641 | goto need_kick_unlock; | 7740 | kick = true; |
7642 | 7741 | goto unlock; | |
7643 | rcu_read_unlock(); | 7742 | } |
7644 | return 0; | ||
7645 | 7743 | ||
7646 | need_kick_unlock: | 7744 | unlock: |
7647 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); |
7648 | need_kick: | 7746 | return kick; |
7649 | return 1; | ||
7650 | } | 7747 | } |
7651 | #else | 7748 | #else |
7652 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } |
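Taken together, the rewritten nohz_kick_needed() returns a bool and walks its conditions in a fixed order: more than one runnable task, a busy sibling in the LLC group, a single CFS task on a capacity-reduced CPU, and finally the SD_ASYM_PACKING case. A condensed user-space rendering of that decision order, with the kernel-internal lookups replaced by plain booleans (illustrative only, not a drop-in):

    #include <stdbool.h>
    #include <stdio.h>

    /* Condensed decision order of the rewritten nohz_kick_needed(), with the
     * kernel-internal lookups replaced by plain booleans. */
    static bool nohz_kick_needed_sketch(bool this_cpu_idle_balancing,
                                        bool any_nohz_idle_cpu,
                                        bool balance_due,
                                        unsigned int nr_running,
                                        bool llc_group_has_other_busy_cpu,
                                        bool cfs_task_present,
                                        bool cpu_capacity_reduced,
                                        bool asym_lower_cpu_idle)
    {
        if (this_cpu_idle_balancing || !any_nohz_idle_cpu || !balance_due)
            return false;
        if (nr_running >= 2)
            return true;
        if (llc_group_has_other_busy_cpu)
            return true;
        if (cfs_task_present && cpu_capacity_reduced)
            return true;
        return asym_lower_cpu_idle;
    }

    int main(void)
    {
        /* A single CFS task on a CPU squeezed by RT/IRQ time: kick the ilb. */
        printf("kick=%d\n",
               nohz_kick_needed_sketch(false, true, true, 1,
                                       false, true, true, false));
        return 0;
    }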
@@ -7662,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
7662 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? |
7663 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; |
7664 | 7761 | ||
7665 | rebalance_domains(this_rq, idle); | ||
7666 | |||
7667 | /* | 7762 | /* |
7668 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the |
7669 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are |
7670 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
7766 | * give the idle cpus a chance to load balance. Else we may | ||
7767 | * load balance only within the local sched_domain hierarchy | ||
7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
7671 | */ | 7769 | */ |
7672 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); |
7771 | rebalance_domains(this_rq, idle); | ||
7673 | } | 7772 | } |
7674 | 7773 | ||
7675 | /* | 7774 | /* |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
56 | */ | 56 | */ |
57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) |
58 | 58 | ||
59 | #ifdef HAVE_RT_PUSH_IPI | ||
60 | /* | ||
61 | * In order to avoid a thundering herd attack of CPUs that are | ||
62 | * lowering their priorities at the same time, and there being | ||
63 | * a single CPU that has an RT task that can migrate and is waiting | ||
64 | * to run, where the other CPUs will try to take that CPUs | ||
65 | * rq lock and possibly create a large contention, sending an | ||
66 | * IPI to that CPU and let that CPU push the RT task to where | ||
67 | * it should go may be a better scenario. | ||
68 | */ | ||
69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
70 | #endif | ||
71 | |||
59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) |
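RT_PUSH_IPI defaults to true but is only compiled in when HAVE_RT_PUSH_IPI is defined (see the sched.h hunk below, which requires CONFIG_IRQ_WORK), and the rt.c hunk below shows pull_rt_task() short-circuiting into tell_cpu_to_push() whenever the feature is enabled. On kernels built with CONFIG_SCHED_DEBUG, scheduler feature bits like this can typically be inspected and toggled at runtime through /sys/kernel/debug/sched_features, for example by writing NO_RT_PUSH_IPI to fall back to the direct rq-lock pull path.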
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include "sched.h" | 6 | #include "sched.h" |
7 | 7 | ||
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/irq_work.h> | ||
9 | 10 | ||
10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
11 | 12 | ||
@@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
60 | } | 61 | } |
61 | 62 | ||
62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP |
64 | static void push_irq_work_func(struct irq_work *work); | ||
65 | #endif | ||
66 | |||
67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
63 | { | 68 | { |
64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; |
65 | int i; | 70 | int i; |
@@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; |
79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; |
80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); |
86 | |||
87 | #ifdef HAVE_RT_PUSH_IPI | ||
88 | rt_rq->push_flags = 0; | ||
89 | rt_rq->push_cpu = nr_cpu_ids; | ||
90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
81 | #endif | 92 | #endif |
93 | #endif /* CONFIG_SMP */ | ||
82 | /* We start in dequeued state, because no RT tasks are queued */ | 94 |
83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; |
84 | 96 | ||
@@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
193 | if (!rt_se) | 205 | if (!rt_se) |
194 | goto err_free_rq; | 206 | goto err_free_rq; |
195 | 207 | ||
196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); |
197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
199 | } | 211 | } |
@@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
1778 | ; | 1790 | ; |
1779 | } | 1791 | } |
1780 | 1792 | ||
1793 | #ifdef HAVE_RT_PUSH_IPI | ||
1794 | /* | ||
1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
1798 | * is complete. | ||
1799 | * | ||
1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
1801 | * or if this is the first instance, it must hold rq->cpu. | ||
1802 | */ | ||
1803 | static int rto_next_cpu(struct rq *rq) | ||
1804 | { | ||
1805 | int prev_cpu = rq->rt.push_cpu; | ||
1806 | int cpu; | ||
1807 | |||
1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
1809 | |||
1810 | /* | ||
1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
1812 | * passed the end of the mask, and has started from the beginning. | ||
1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
1814 | */ | ||
1815 | if (prev_cpu < rq->cpu) { | ||
1816 | if (cpu >= rq->cpu) | ||
1817 | return nr_cpu_ids; | ||
1818 | |||
1819 | } else if (cpu >= nr_cpu_ids) { | ||
1820 | /* | ||
1821 | * We passed the end of the mask, start at the beginning. | ||
1822 | * If the result is greater or equal to the rq's CPU, then | ||
1823 | * the loop is finished. | ||
1824 | */ | ||
1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
1826 | if (cpu >= rq->cpu) | ||
1827 | return nr_cpu_ids; | ||
1828 | } | ||
1829 | rq->rt.push_cpu = cpu; | ||
1830 | |||
1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
1832 | return cpu; | ||
1833 | } | ||
1834 | |||
1835 | static int find_next_push_cpu(struct rq *rq) | ||
1836 | { | ||
1837 | struct rq *next_rq; | ||
1838 | int cpu; | ||
1839 | |||
1840 | while (1) { | ||
1841 | cpu = rto_next_cpu(rq); | ||
1842 | if (cpu >= nr_cpu_ids) | ||
1843 | break; | ||
1844 | next_rq = cpu_rq(cpu); | ||
1845 | |||
1846 | /* Make sure the next rq can push to this rq */ | ||
1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
1848 | break; | ||
1849 | } | ||
1850 | |||
1851 | return cpu; | ||
1852 | } | ||
1853 | |||
1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
1855 | #define RT_PUSH_IPI_RESTART 2 | ||
1856 | |||
1857 | static void tell_cpu_to_push(struct rq *rq) | ||
1858 | { | ||
1859 | int cpu; | ||
1860 | |||
1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1862 | raw_spin_lock(&rq->rt.push_lock); | ||
1863 | /* Make sure it's still executing */ | ||
1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
1865 | /* | ||
1866 | * Tell the IPI to restart the loop as things have | ||
1867 | * changed since it started. | ||
1868 | */ | ||
1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
1871 | return; | ||
1872 | } | ||
1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
1874 | } | ||
1875 | |||
1876 | /* When here, there's no IPI going around */ | ||
1877 | |||
1878 | rq->rt.push_cpu = rq->cpu; | ||
1879 | cpu = find_next_push_cpu(rq); | ||
1880 | if (cpu >= nr_cpu_ids) | ||
1881 | return; | ||
1882 | |||
1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
1884 | |||
1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
1886 | } | ||
1887 | |||
1888 | /* Called from hardirq context */ | ||
1889 | static void try_to_push_tasks(void *arg) | ||
1890 | { | ||
1891 | struct rt_rq *rt_rq = arg; | ||
1892 | struct rq *rq, *src_rq; | ||
1893 | int this_cpu; | ||
1894 | int cpu; | ||
1895 | |||
1896 | this_cpu = rt_rq->push_cpu; | ||
1897 | |||
1898 | /* Paranoid check */ | ||
1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
1900 | |||
1901 | rq = cpu_rq(this_cpu); | ||
1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
1903 | |||
1904 | again: | ||
1905 | if (has_pushable_tasks(rq)) { | ||
1906 | raw_spin_lock(&rq->lock); | ||
1907 | push_rt_task(rq); | ||
1908 | raw_spin_unlock(&rq->lock); | ||
1909 | } | ||
1910 | |||
1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
1912 | raw_spin_lock(&rt_rq->push_lock); | ||
1913 | /* | ||
1914 | * If the source queue changed since the IPI went out, | ||
1915 | * we need to restart the search from that CPU again. | ||
1916 | */ | ||
1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
1919 | rt_rq->push_cpu = src_rq->cpu; | ||
1920 | } | ||
1921 | |||
1922 | cpu = find_next_push_cpu(src_rq); | ||
1923 | |||
1924 | if (cpu >= nr_cpu_ids) | ||
1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
1927 | |||
1928 | if (cpu >= nr_cpu_ids) | ||
1929 | return; | ||
1930 | |||
1931 | /* | ||
1932 | * It is possible that a restart caused this CPU to be | ||
1933 | * chosen again. Don't bother with an IPI, just see if we | ||
1934 | * have more to push. | ||
1935 | */ | ||
1936 | if (unlikely(cpu == rq->cpu)) | ||
1937 | goto again; | ||
1938 | |||
1939 | /* Try the next RT overloaded CPU */ | ||
1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
1941 | } | ||
1942 | |||
1943 | static void push_irq_work_func(struct irq_work *work) | ||
1944 | { | ||
1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
1946 | |||
1947 | try_to_push_tasks(rt_rq); | ||
1948 | } | ||
1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
1950 | |||
1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) |
1782 | { | 1952 | { |
1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; |
@@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
1793 | */ | 1963 | */ |
1794 | smp_rmb(); | 1964 | smp_rmb(); |
1795 | 1965 | ||
1966 | #ifdef HAVE_RT_PUSH_IPI | ||
1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
1968 | tell_cpu_to_push(this_rq); | ||
1969 | return 0; | ||
1970 | } | ||
1971 | #endif | ||
1972 | |||
1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) |
1798 | continue; | 1975 | continue; |
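The IPI chain above walks the root domain's rto_mask starting just after the source CPU, wrapping around the mask and stopping before it would reach the source CPU again. A small user-space simulation of that wrap-around walk, using a plain array instead of a cpumask (purely illustrative):

    #include <stdio.h>

    #define NR_CPUS 8

    /* Return the next set bit strictly after 'prev', or NR_CPUS if none. */
    static int mask_next(const int *mask, int prev)
    {
        for (int cpu = prev + 1; cpu < NR_CPUS; cpu++)
            if (mask[cpu])
                return cpu;
        return NR_CPUS;
    }

    /*
     * Simulation of rto_next_cpu(): start from push_cpu, wrap around the
     * mask, and report NR_CPUS once the walk would reach src_cpu again.
     */
    static int rto_next_cpu_sim(const int *rto_mask, int src_cpu, int *push_cpu)
    {
        int prev = *push_cpu;
        int cpu = mask_next(rto_mask, prev);

        if (prev < src_cpu) {
            /* Already wrapped: stop once we reach src_cpu again. */
            if (cpu >= src_cpu)
                return NR_CPUS;
        } else if (cpu >= NR_CPUS) {
            /* Hit the end of the mask: wrap to the beginning. */
            cpu = mask_next(rto_mask, -1);
            if (cpu >= src_cpu)
                return NR_CPUS;
        }
        *push_cpu = cpu;
        return cpu;
    }

    int main(void)
    {
        int rto_mask[NR_CPUS] = { 0, 1, 0, 1, 0, 1, 0, 1 }; /* CPUs 1,3,5,7 overloaded */
        int src_cpu = 5, push_cpu = src_cpu, cpu;

        /* Expected visit order: 7, then wrap to 1, then 3, then stop. */
        while ((cpu = rto_next_cpu_sim(rto_mask, src_cpu, &push_cpu)) < NR_CPUS)
            printf("next push cpu: %d\n", cpu);
        return 0;
    }

The real walk additionally skips CPUs that have nothing of high enough priority to push to the requesting rq (find_next_push_cpu()), and the irq_work is chained from one overloaded CPU to the next, so at most one remote CPU at a time contends on the requesting rq's lock.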
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> |
9 | #include <linux/irq_work.h> | ||
9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> |
10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
11 | 12 | ||
@@ -362,8 +363,14 @@ struct cfs_rq { | |||
362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. |
363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in |
364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). |
366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
367 | * sched_entities on the rq. | ||
368 | * blocked_load_avg is similar to runnable_load_avg except that it is | ||
369 | * computed over the blocked sched_entities on the rq. | ||
370 | * utilization_load_avg is the sum of the average running time of the | ||
371 | * sched_entities on the rq. | ||
365 | */ | 372 | */ |
366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; |
367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; |
368 | u64 last_decay; | 375 | u64 last_decay; |
369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; |
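Of the three aggregates, runnable_load_avg and blocked_load_avg are scaled by each entity's load weight, so nice levels and group shares matter, while the new utilization_load_avg only tracks how much the entities actually run; roughly, a single task that runs about half the time contributes on the order of 512 out of 1024 to utilization regardless of its weight. This is the quantity the fair.c hunks above aggregate per group via get_cpu_usage() into sgs->group_usage.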
@@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; |
419 | } | 426 | } |
420 | 427 | ||
428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
429 | #ifdef CONFIG_IRQ_WORK | ||
430 | # define HAVE_RT_PUSH_IPI | ||
431 | #endif | ||
432 | |||
421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ |
422 | struct rt_rq { | 434 | struct rt_rq { |
423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; |
@@ -435,7 +447,13 @@ struct rt_rq { | |||
435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; |
436 | int overloaded; | 448 | int overloaded; |
437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; |
450 | #ifdef HAVE_RT_PUSH_IPI | ||
451 | int push_flags; | ||
452 | int push_cpu; | ||
453 | struct irq_work push_work; | ||
454 | raw_spinlock_t push_lock; | ||
438 | #endif | 455 | #endif |
456 | #endif /* CONFIG_SMP */ | ||
439 | int rt_queued; | 457 | int rt_queued; |
440 | 458 | ||
441 | int rt_throttled; | 459 | int rt_throttled; |
@@ -597,6 +615,7 @@ struct rq { | |||
597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; |
598 | 616 | ||
599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; |
618 | unsigned long cpu_capacity_orig; | ||
600 | 619 | ||
601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; |
602 | /* For active balancing */ | 621 | /* For active balancing */ |
@@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
808 | * for a single CPU. | 827 | * for a single CPU. |
809 | */ | 828 | */ |
810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; |
811 | unsigned long next_update; | 830 | unsigned long next_update; |
812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
813 | /* | 832 | /* |
@@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
1368 | 1387 | ||
1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP |
1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); |
1390 | |||
1391 | #ifndef arch_scale_freq_capacity | ||
1392 | static __always_inline | ||
1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
1394 | { | ||
1395 | return SCHED_CAPACITY_SCALE; | ||
1396 | } | ||
1397 | #endif | ||
1398 | |||
1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1372 | { | 1400 | { |
1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); |
1375 | } | 1403 | } |
1376 | #else | 1404 | #else |
@@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); |
1644 | 1672 | ||
1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); |
1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); |
1648 | 1676 | ||
1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); |
1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); |
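Finally, the sched_rt_avg_update() change above makes rt_avg frequency-invariant: the default arch_scale_freq_capacity() returns SCHED_CAPACITY_SCALE, so nothing changes unless an architecture provides its own hook reporting the current frequency relative to the maximum. A small stand-alone illustration, assuming SCHED_CAPACITY_SCALE is 1024 and using a made-up half-frequency value of 512:

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024ULL   /* assumed, matching the kernel default */

    int main(void)
    {
        unsigned long long rt_delta = 2000000;   /* 2ms of RT/IRQ time, in ns */

        /* Default hook: full capacity, same weighting as before the patch. */
        unsigned long long full = rt_delta * SCHED_CAPACITY_SCALE;

        /* A hypothetical arch hook reporting 50% of max frequency returns 512,
         * so the same wall-clock RT time is charged as half the capacity. */
        unsigned long long half = rt_delta * 512;

        printf("full-frequency contribution: %llu, half-frequency: %llu\n",
               full, half);
        return 0;
    }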