author    Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 21:03:50 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 21:03:50 -0500
commit    53528695ff6d8b77011bc818407c13e30914a946
tree      04acd099c5759bf6f1d728c5415f574d572c6872 /kernel/sched
parent    b831ef2cad979912850e34f82415c0c5d59de8cb
parent    e73e85f0593832aa583b252f9a16cf90ed6d30fa
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The main changes in this cycle were:
- sched/fair load tracking fixes and cleanups (Byungchul Park)
- Make load tracking frequency scale invariant (Dietmar Eggemann)
- sched/deadline updates (Juri Lelli)
- stop machine fixes, cleanups and enhancements for bugs triggered by
CPU hotplug stress testing (Oleg Nesterov)
- scheduler preemption code rework: remove PREEMPT_ACTIVE and related
cleanups (Peter Zijlstra)
- Rework the sched_info::run_delay code to fix races (Peter Zijlstra)
- Optimize per entity utilization tracking (Peter Zijlstra)
- ... misc other fixes, cleanups and smaller updates"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
sched: Start stopper early
stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
sched/x86: Fix typo in __switch_to() comments
sched/core: Remove a parameter in the migrate_task_rq() function
sched/core: Drop unlikely behind BUG_ON()
sched/core: Fix task and run queue sched_info::run_delay inconsistencies
sched/numa: Fix task_tick_fair() from disabling numa_balancing
sched/core: Add preempt_count invariant check
sched/core: More notrace annotations
sched/core: Kill PREEMPT_ACTIVE
sched/core, sched/x86: Kill thread_info::saved_preempt_count
sched/core: Simplify preempt_count tests
sched/core: Robustify preemption leak checks
sched/core: Stop setting PREEMPT_ACTIVE
...
Diffstat (limited to 'kernel/sched')

 kernel/sched/core.c        | 203
 kernel/sched/cpudeadline.c |   5
 kernel/sched/cpudeadline.h |   1
 kernel/sched/fair.c        | 419
 kernel/sched/features.h    |  21
 kernel/sched/rt.c          |  22
 kernel/sched/sched.h       |  55

 7 files changed, 382 insertions(+), 344 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f7402f7eb448..aa5973220ad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
-	if (p->policy == SCHED_IDLE) {
+	if (idle_policy(p->policy)) {
 		load->weight = scale_load(WEIGHT_IDLEPRIO);
 		load->inv_weight = WMULT_IDLEPRIO;
 		return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
 	load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		if (p->sched_class->migrate_task_rq)
-			p->sched_class->migrate_task_rq(p, new_cpu);
+			p->sched_class->migrate_task_rq(p);
 		p->se.nr_migrations++;
 		perf_event_task_migrate(p);
 	}
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
 	struct rq *src_rq, *dst_rq;
 	int ret = -EAGAIN;
 
+	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+		return -EAGAIN;
+
 	src_rq = cpu_rq(arg->src_cpu);
 	dst_rq = cpu_rq(arg->dst_cpu);
 
 	double_raw_lock(&arg->src_task->pi_lock,
 			&arg->dst_task->pi_lock);
 	double_rq_lock(src_rq, dst_rq);
+
 	if (task_cpu(arg->dst_task) != arg->dst_cpu)
 		goto unlock;
 
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		goto out;
 	}
 
+	/* No more Mr. Nice Guy. */
 	switch (state) {
 	case cpuset:
-		/* No more Mr. Nice Guy. */
-		cpuset_cpus_allowed_fallback(p);
-		state = possible;
-		break;
-
+		if (IS_ENABLED(CONFIG_CPUSETS)) {
+			cpuset_cpus_allowed_fallback(p);
+			state = possible;
+			break;
+		}
+		/* fall-through */
 	case possible:
 		do_set_cpus_allowed(p, cpu_possible_mask);
 		state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
 void set_numabalancing_state(bool enabled)
 {
 	if (enabled)
-		sched_feat_set("NUMA");
+		static_branch_enable(&sched_numa_balancing);
 	else
-		sched_feat_set("NO_NUMA");
-}
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
-	numabalancing_enabled = enabled;
+		static_branch_disable(&sched_numa_balancing);
 }
-#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
 	struct ctl_table t;
 	int err;
-	int state = numabalancing_enabled;
+	int state = static_branch_likely(&sched_numa_balancing);
 
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
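Background for the sched_numa_balancing conversion above: a static key compiles
the hot-path test down to a patchable jump instruction, so the common case costs
a nop rather than a load-and-branch, and enabling/disabling rewrites the branch
at runtime. A minimal sketch of the pattern (my_feature and do_rare_extra_work()
are hypothetical names, not kernel symbols):

```c
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(my_feature);	/* starts disabled; branch is a nop */

void my_feature_set(bool enabled)
{
	if (enabled)
		static_branch_enable(&my_feature);	/* live-patch the branch in */
	else
		static_branch_disable(&my_feature);
}

void hot_path(void)
{
	if (static_branch_unlikely(&my_feature))
		do_rare_extra_work();	/* kept out of line while disabled */
}
```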
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
 	struct rq *rq;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/* Initialize new task's runnable average */
+	init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
-	/* Initialize new task's runnable average */
-	init_entity_runnable_average(&p->se);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
-	trace_sched_switch(prev, next);
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
+	/*
+	 * The previous task will have left us with a preempt_count of 2
+	 * because it left us after:
+	 *
+	 *	schedule()
+	 *	  preempt_disable();			// 1
+	 *	  __schedule()
+	 *	    raw_spin_lock_irq(&rq->lock)	// 2
+	 *
+	 * Also, see FORK_PREEMPT_COUNT.
+	 */
+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+		      "corrupted preempt_count: %s/%d/0x%x\n",
+		      current->comm, current->pid, preempt_count()))
+		preempt_count_set(FORK_PREEMPT_COUNT);
+
 	rq->prev_mm = NULL;
 
 	/*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
 	struct rq *rq;
 
-	/* finish_task_switch() drops rq->lock and enables preemtion */
-	preempt_disable();
+	/*
+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
+	 * finish_task_switch() for details.
+	 *
+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
+	 * and the preempt_enable() will end up enabling preemption (on
+	 * PREEMPT_COUNT kernels).
+	 */
+
 	rq = finish_task_switch(prev);
 	balance_callback(rq);
 	preempt_enable();
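Worked arithmetic behind the two hunks above (assuming CONFIG_PREEMPT_COUNT=y,
where PREEMPT_DISABLE_OFFSET == 1):

```c
	/*
	 * schedule()
	 *   preempt_disable();			// preempt_count: 0 -> 1
	 *   __schedule()
	 *     raw_spin_lock_irq(&rq->lock);	// preempt_count: 1 -> 2
	 *     context_switch()
	 *       ... next task resumes in finish_task_switch(),
	 *       which must observe exactly 2 * PREEMPT_DISABLE_OFFSET
	 */
```

On kernels without CONFIG_PREEMPT_COUNT the offset is 0 and the expected value
degenerates to 0, so the WARN_ONCE() still holds. A freshly forked task never
went through schedule(), which is why it starts with FORK_PREEMPT_COUNT instead
and schedule_tail() no longer needs its own preempt_disable().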
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-	BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+	BUG_ON(task_stack_end_corrupted(prev));
 #endif
-	/*
-	 * Test if we are atomic. Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path. Otherwise whine
-	 * if we are scheduling when we should not.
-	 */
-	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+	if (unlikely(in_atomic_preempt_off())) {
 		__schedule_bug(prev);
+		preempt_count_set(PREEMPT_DISABLED);
+	}
 	rcu_sleep_check();
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
 	rcu_note_context_switch();
 	prev = rq->curr;
 
+	/*
+	 * do_exit() calls schedule() with preemption disabled as an exception;
+	 * however we must fix that up, otherwise the next task will see an
+	 * inconsistent (higher) preempt count.
+	 *
+	 * It also avoids the below schedule_debug() test from complaining
+	 * about this.
+	 */
+	if (unlikely(prev->state == TASK_DEAD))
+		preempt_enable_no_resched_notrace();
+
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
 	switch_count = &prev->nivcsw;
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+	if (!preempt && prev->state) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
 		} else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
 		rq->curr = next;
 		++*switch_count;
 
+		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
 	} else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
 	sched_submit_work(tsk);
 	do {
 		preempt_disable();
-		__schedule();
+		__schedule(false);
 		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		preempt_active_enter();
-		__schedule();
-		preempt_active_exit();
+		preempt_disable_notrace();
+		__schedule(true);
+		preempt_enable_no_resched_notrace();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		return;
 
 	do {
-		/*
-		 * Use raw __prempt_count() ops that don't call function.
-		 * We can't call functions before disabling preemption which
-		 * disarm preemption tracing recursions.
-		 */
-		__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-		barrier();
+		preempt_disable_notrace();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
 		 * an infinite recursion.
 		 */
 		prev_ctx = exception_enter();
-		__schedule();
+		__schedule(true);
 		exception_exit(prev_ctx);
 
-		barrier();
-		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+		preempt_enable_no_resched_notrace();
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 
 	do {
-		preempt_active_enter();
+		preempt_disable();
 		local_irq_enable();
-		__schedule();
+		__schedule(true);
 		local_irq_disable();
-		preempt_active_exit();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 
 	exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
 	} else {
 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-		if (policy != SCHED_DEADLINE &&
-				policy != SCHED_FIFO && policy != SCHED_RR &&
-				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-				policy != SCHED_IDLE)
+		if (!valid_policy(policy))
 			return -EINVAL;
 	}
 
@@ -3812,7 +3836,7 @@ recheck:
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+		if (idle_policy(p->policy) && !idle_policy(policy)) {
 			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
@@ -3937,7 +3961,7 @@ change:
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3947,11 +3971,15 @@ change:
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_RESTORE;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, enqueue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	running = task_current(rq, p);
 
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
+	int cpu = (long)hcpu;
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_STARTING:
 		set_cpu_rq_start_time();
 		return NOTIFY_OK;
+
 	case CPU_ONLINE:
 		/*
 		 * At this point a starting CPU has marked itself as online via
 		 * set_cpu_online(). But it might not yet have marked itself
 		 * as active, which is essential from here on.
-		 *
-		 * Thus, fall-through and help the starting CPU along.
 		 */
+		set_cpu_active(cpu, true);
+		stop_machine_unpark(cpu);
+		return NOTIFY_OK;
+
 	case CPU_DOWN_FAILED:
-		set_cpu_active((long)hcpu, true);
+		set_cpu_active(cpu, true);
 		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ NULL, },
 };
 
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+	default_topology;
 
 #define for_each_sd_topology(tl)			\
 	for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7478,7 +7513,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = preempt_count() + rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_SAVE);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk, queued);
+		tsk->sched_class->task_move_group(tsk);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
 	task_rq_unlock(rq, tsk, &flags);
 }
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
 			    struct cgroup_subsys_state *old_css,
 			    struct task_struct *task)
 {
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
-
 	sched_move_task(task);
 }
 
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index c6acb07466bb..5a75b08cfd85 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
 	return (i << 1) + 2;
 }
 
-static inline int dl_time_before(u64 a, u64 b)
-{
-	return (s64)(a - b) < 0;
-}
-
 static void cpudl_exchange(struct cpudl *cp, int a, int b)
 {
 	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
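dl_time_before() leaves this file; the <linux/sched/deadline.h> include added to
cpudeadline.h below presumably supplies the shared copy. The helper itself is
the classic wraparound-safe time comparison: subtract in unsigned arithmetic,
then test the sign, which stays correct across u64 clock overflow as long as
the two stamps are within 2^63 of each other. A standalone sketch:

```c
#include <assert.h>
#include <stdint.h>

static inline int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;	/* modular subtract, then sign test */
}

int main(void)
{
	uint64_t a = UINT64_MAX - 100;	/* just before the clock wraps */
	uint64_t b = 50;		/* just after it wrapped */

	assert(dl_time_before(a, b));	/* a - b == -151 (mod 2^64): a is earlier */
	return 0;
}
```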
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 1a0a6ef2fbe1..fcbdf83fed7e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
 #define _LINUX_CPUDL_H
 
 #include <linux/sched.h>
+#include <linux/sched/deadline.h>
 
 #define IDX_INVALID -1
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60fe721a..824aa9f501a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p);
 
 /*
  * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
+ * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
+ * dependent on this value.
  */
 #define LOAD_AVG_PERIOD 32
 #define LOAD_AVG_MAX 47742	/* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+#define LOAD_AVG_MAX_N 345	/* number of full periods to produce LOAD_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
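The three constants are tied together by the decay factor y, chosen so that
y^32 == 1/2: a contribution halves after 32 accounting periods of 1024 us each.
The geometric series 1024 * (1 + y + y^2 + ...) converges near 1024 / (1 - y),
which is where LOAD_AVG_MAX and LOAD_AVG_MAX_N come from. A userspace sketch of
the computation (floating point, so it lands close to, not exactly on, the
kernel's integer-truncated 47742 / 345):

```c
#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 1/2 */
	double sum = 0.0, prev = -1.0;
	int n = 0;

	while ((long)sum != (long)prev) {	/* run until the integer part settles */
		prev = sum;
		sum = sum * y + 1024.0;		/* decay, then add one fully-run period */
		n++;
	}
	printf("max ~= %ld after ~%d periods\n", (long)sum, n);
	return 0;
}
```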
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 	sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-	sa->util_sum = LOAD_AVG_MAX;
+	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	int local = !!(flags & TNF_FAULT_LOCAL);
 	int priv;
 
-	if (!numabalancing_enabled)
+	if (!static_branch_likely(&sched_numa_balancing))
 		return;
 
 	/* for example, ksmd faulting in a user's mm */
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
 	struct vm_area_struct *vma;
 	unsigned long start, end;
 	unsigned long nr_pte_updates = 0;
-	long pages;
+	long pages, virtpages;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
 
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
 	start = mm->numa_scan_offset;
 	pages = sysctl_numa_balancing_scan_size;
 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+	virtpages = pages * 8;	   /* Scan up to this much virtual space */
 	if (!pages)
 		return;
 
+
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, start);
 	if (!vma) {
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
 		start = max(start, vma->vm_start);
 		end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 		end = min(end, vma->vm_end);
-		nr_pte_updates += change_prot_numa(vma, start, end);
+		nr_pte_updates = change_prot_numa(vma, start, end);
 
 		/*
-		 * Scan sysctl_numa_balancing_scan_size but ensure that
-		 * at least one PTE is updated so that unused virtual
-		 * address space is quickly skipped.
+		 * Try to scan sysctl_numa_balancing_size worth of
+		 * hpages that have at least one present PTE that
+		 * is not already pte-numa. If the VMA contains
+		 * areas that are unused or already full of prot_numa
+		 * PTEs, scan up to virtpages, to skip through those
+		 * areas faster.
 		 */
 		if (nr_pte_updates)
 			pages -= (end - start) >> PAGE_SHIFT;
+		virtpages -= (end - start) >> PAGE_SHIFT;
 
 		start = end;
-		if (pages <= 0)
+		if (pages <= 0 || virtpages <= 0)
 			goto out;
 
 		cond_resched();
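Worked numbers for the new virtpages cap (assuming the default
sysctl_numa_balancing_scan_size of 256 MB and 4 KiB pages, i.e. PAGE_SHIFT 12):

```c
	pages     = 256 << (20 - 12);	/* 65536 pages  == 256 MB of present PTEs */
	virtpages = pages * 8;		/* 524288 pages == 2 GB of virtual space  */
```

So a scan pass still updates at most 256 MB worth of populated memory, but now
also gives up after walking 2 GB of (possibly sparse) virtual address space
instead of looping over huge empty or already-prot_numa mappings.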
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
 	return contrib + runnable_avg_yN_sum[n];
 }
 
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2547,10 +2560,10 @@ static __always_inline int
 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, periods;
+	u64 delta, scaled_delta, periods;
 	u32 contrib;
-	int delta_w, decayed = 0;
-	unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	unsigned int delta_w, scaled_delta_w, decayed = 0;
+	unsigned long scale_freq, scale_cpu;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		return 0;
 	sa->last_update_time = now;
 
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
 	/* delta_w is the amount already accumulated against our next period */
 	delta_w = sa->period_contrib;
 	if (delta + delta_w >= 1024) {
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		 * period and accrue it.
 		 */
 		delta_w = 1024 - delta_w;
+		scaled_delta_w = cap_scale(delta_w, scale_freq);
 		if (weight) {
-			sa->load_sum += weight * delta_w;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * delta_w;
+			sa->load_sum += weight * scaled_delta_w;
+			if (cfs_rq) {
+				cfs_rq->runnable_load_sum +=
+						weight * scaled_delta_w;
+			}
 		}
 		if (running)
-			sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
+			sa->util_sum += scaled_delta_w * scale_cpu;
 
 		delta -= delta_w;
 
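cap_scale(), introduced a few hunks up, is plain fixed-point scaling against a
1024 unit (which is what the #error guard asserts). For instance, on a CPU
currently clocked at half its maximum frequency, arch_scale_freq_capacity()
would report roughly 512, so a fully elapsed 1024 us window accrues as:

```c
	/* cap_scale(1024, 512) == (1024 * 512) >> 10 == 512 */
```

Time, and therefore tracked load and utilization, accumulates at half speed:
this is the "frequency scale invariant" load tracking named in the merge
message.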
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 
 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 		contrib = __compute_runnable_contrib(periods);
+		contrib = cap_scale(contrib, scale_freq);
 		if (weight) {
 			sa->load_sum += weight * contrib;
 			if (cfs_rq)
 				cfs_rq->runnable_load_sum += weight * contrib;
 		}
 		if (running)
-			sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
+			sa->util_sum += contrib * scale_cpu;
 	}
 
 	/* Remainder of delta accrued against u_0` */
+	scaled_delta = cap_scale(delta, scale_freq);
 	if (weight) {
-		sa->load_sum += weight * delta;
+		sa->load_sum += weight * scaled_delta;
 		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * delta;
+			cfs_rq->runnable_load_sum += weight * scaled_delta;
 	}
 	if (running)
-		sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
+		sa->util_sum += scaled_delta * scale_cpu;
 
 	sa->period_contrib += delta;
 
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		cfs_rq->runnable_load_avg =
 			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
-	sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
+	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 }
 
 	return decayed;
@@ -2677,8 +2698,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
 		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum -
-			((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
+		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
 	}
 
 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
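The dropped SCHED_LOAD_SHIFT rescaling is a unit-consistency fix that falls out
of the two hunks above: init_entity_runnable_average() now seeds
util_sum = util_avg * LOAD_AVG_MAX, and __update_load_avg() derives
util_avg = util_sum / LOAD_AVG_MAX. So for a removed contribution r expressed
in util_avg units, the matching util_sum delta is simply:

```c
	/* util_avg == util_sum / LOAD_AVG_MAX         (new invariant)     */
	/* => delta(util_sum) == r * LOAD_AVG_MAX      (no extra shifting) */
```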
@@ -2696,33 +2716,70 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	int cpu = cpu_of(rq_of(cfs_rq));
 	u64 now = cfs_rq_clock_task(cfs_rq);
+	int cpu = cpu_of(rq_of(cfs_rq));
 
 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
 	__update_load_avg(now, cpu, &se->avg,
-		se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+			  se->on_rq * scale_load_down(se->load.weight),
+			  cfs_rq->curr == se, NULL);
 
 	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
 		update_tg_load_avg(cfs_rq, 0);
 }
 
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!sched_feat(ATTACH_AGE_LOAD))
+		goto skip_aging;
+
+	/*
+	 * If we got migrated (either between CPUs or between cgroups) we'll
+	 * have aged the average right before clearing @last_update_time.
+	 */
+	if (se->avg.last_update_time) {
+		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+				  &se->avg, 0, 0, NULL);
+
+		/*
+		 * XXX: we could have just aged the entire load away if we've been
+		 * absent from the fair class for too long.
+		 */
+	}
+
+skip_aging:
+	se->avg.last_update_time = cfs_rq->avg.last_update_time;
+	cfs_rq->avg.load_avg += se->avg.load_avg;
+	cfs_rq->avg.load_sum += se->avg.load_sum;
+	cfs_rq->avg.util_avg += se->avg.util_avg;
+	cfs_rq->avg.util_sum += se->avg.util_sum;
+}
+
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
+			  cfs_rq->curr == se, NULL);
+
+	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+}
+
 /* Add the load generated by se into cfs_rq's load average */
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
 	u64 now = cfs_rq_clock_task(cfs_rq);
-	int migrated = 0, decayed;
+	int migrated, decayed;
 
-	if (sa->last_update_time == 0) {
-		sa->last_update_time = now;
-		migrated = 1;
-	}
-	else {
+	migrated = !sa->last_update_time;
+	if (!migrated) {
 		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
 			se->on_rq * scale_load_down(se->load.weight),
 			cfs_rq->curr == se, NULL);
| 2733 | cfs_rq->runnable_load_avg += sa->load_avg; | 2790 | cfs_rq->runnable_load_avg += sa->load_avg; |
| 2734 | cfs_rq->runnable_load_sum += sa->load_sum; | 2791 | cfs_rq->runnable_load_sum += sa->load_sum; |
| 2735 | 2792 | ||
| 2736 | if (migrated) { | 2793 | if (migrated) |
| 2737 | cfs_rq->avg.load_avg += sa->load_avg; | 2794 | attach_entity_load_avg(cfs_rq, se); |
| 2738 | cfs_rq->avg.load_sum += sa->load_sum; | ||
| 2739 | cfs_rq->avg.util_avg += sa->util_avg; | ||
| 2740 | cfs_rq->avg.util_sum += sa->util_sum; | ||
| 2741 | } | ||
| 2742 | 2795 | ||
| 2743 | if (decayed || migrated) | 2796 | if (decayed || migrated) |
| 2744 | update_tg_load_avg(cfs_rq, 0); | 2797 | update_tg_load_avg(cfs_rq, 0); |
@@ -2753,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->runnable_load_avg =
 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
 	cfs_rq->runnable_load_sum =
-		max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
 /*
@@ -2821,6 +2874,11 @@ static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
 static inline int idle_balance(struct rq *rq)
 {
 	return 0;
| @@ -4817,32 +4875,39 @@ next: | |||
| 4817 | done: | 4875 | done: |
| 4818 | return target; | 4876 | return target; |
| 4819 | } | 4877 | } |
| 4878 | |||
| 4820 | /* | 4879 | /* |
| 4821 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | 4880 | * cpu_util returns the amount of capacity of a CPU that is used by CFS |
| 4822 | * tasks. The unit of the return value must be the one of capacity so we can | 4881 | * tasks. The unit of the return value must be the one of capacity so we can |
| 4823 | * compare the usage with the capacity of the CPU that is available for CFS | 4882 | * compare the utilization with the capacity of the CPU that is available for |
| 4824 | * task (ie cpu_capacity). | 4883 | * CFS task (ie cpu_capacity). |
| 4825 | * cfs.avg.util_avg is the sum of running time of runnable tasks on a | 4884 | * |
| 4826 | * CPU. It represents the amount of utilization of a CPU in the range | 4885 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the |
| 4827 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | 4886 | * recent utilization of currently non-runnable tasks on a CPU. It represents |
| 4828 | * capacity of the CPU because it's about the running time on this CPU. | 4887 | * the amount of utilization of a CPU in the range [0..capacity_orig] where |
| 4829 | * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE | 4888 | * capacity_orig is the cpu_capacity available at the highest frequency |
| 4830 | * because of unfortunate rounding in util_avg or just | 4889 | * (arch_scale_freq_capacity()). |
| 4831 | * after migrating tasks until the average stabilizes with the new running | 4890 | * The utilization of a CPU converges towards a sum equal to or less than the |
| 4832 | * time. So we need to check that the usage stays into the range | 4891 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is |
| 4833 | * [0..cpu_capacity_orig] and cap if necessary. | 4892 | * the running time on this CPU scaled by capacity_curr. |
| 4834 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | 4893 | * |
| 4835 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | 4894 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even |
| 4895 | * higher than capacity_orig because of unfortunate rounding in | ||
| 4896 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until | ||
| 4897 | * the average stabilizes with the new running time. We need to check that the | ||
| 4898 | * utilization stays within the range of [0..capacity_orig] and cap it if | ||
| 4899 | * necessary. Without utilization capping, a group could be seen as overloaded | ||
| 4900 | * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of | ||
| 4901 | * available capacity. We allow utilization to overshoot capacity_curr (but not | ||
| 4902 | * capacity_orig) as it useful for predicting the capacity required after task | ||
| 4903 | * migrations (scheduler-driven DVFS). | ||
| 4836 | */ | 4904 | */ |
| 4837 | static int get_cpu_usage(int cpu) | 4905 | static int cpu_util(int cpu) |
| 4838 | { | 4906 | { |
| 4839 | unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; | 4907 | unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; |
| 4840 | unsigned long capacity = capacity_orig_of(cpu); | 4908 | unsigned long capacity = capacity_orig_of(cpu); |
| 4841 | 4909 | ||
| 4842 | if (usage >= SCHED_LOAD_SCALE) | 4910 | return (util >= capacity) ? capacity : util; |
| 4843 | return capacity; | ||
| 4844 | |||
| 4845 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
| 4846 | } | 4911 | } |
| 4847 | 4912 | ||
| 4848 | /* | 4913 | /* |
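The cpu_util() rewrite above drops the rescaling step: with frequency- and CPU-invariant load tracking, util_avg is already in capacity units, so a simple clamp replaces the old multiply-and-shift. A standalone sketch of the two behaviours, not kernel code, assuming SCHED_LOAD_SCALE == SCHED_CAPACITY_SCALE == 1024 (the default configuration):

```c
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL
#define SCHED_CAPACITY_SHIFT	10

/* Old get_cpu_usage(): util_avg in [0..1024], rescaled to capacity units. */
static unsigned long old_usage(unsigned long util_avg, unsigned long capacity)
{
	if (util_avg >= SCHED_CAPACITY_SCALE)
		return capacity;
	return (util_avg * capacity) >> SCHED_CAPACITY_SHIFT;
}

/* New cpu_util(): util_avg is already scale-invariant, so only clamp it. */
static unsigned long new_util(unsigned long util_avg, unsigned long capacity)
{
	return (util_avg >= capacity) ? capacity : util_avg;
}

int main(void)
{
	unsigned long cap = 430;	/* a little CPU: capacity_orig = 430/1024 */

	printf("old: %lu\n", old_usage(512, cap));	/* 512 * 430 / 1024 = 215 */
	printf("new: %lu\n", new_util(512, cap));	/* clamped to capacity: 430 */
	return 0;
}
```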
| @@ -4945,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4945 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 5010 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no |
| 4946 | * other assumptions, including the state of rq->lock, should be made. | 5011 | * other assumptions, including the state of rq->lock, should be made. |
| 4947 | */ | 5012 | */ |
| 4948 | static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) | 5013 | static void migrate_task_rq_fair(struct task_struct *p) |
| 4949 | { | 5014 | { |
| 4950 | /* | 5015 | /* |
| 4951 | * We are supposed to update the task to "current" time, then it's up to date | 5016 | * We are supposed to update the task to "current" time, then it's up to date |
| @@ -5525,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
| 5525 | unsigned long src_faults, dst_faults; | 5590 | unsigned long src_faults, dst_faults; |
| 5526 | int src_nid, dst_nid; | 5591 | int src_nid, dst_nid; |
| 5527 | 5592 | ||
| 5528 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | 5593 | if (!static_branch_likely(&sched_numa_balancing)) |
| 5529 | return -1; | 5594 | return -1; |
| 5530 | 5595 | ||
| 5531 | if (!sched_feat(NUMA)) | 5596 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
| 5532 | return -1; | 5597 | return -1; |
| 5533 | 5598 | ||
| 5534 | src_nid = cpu_to_node(env->src_cpu); | 5599 | src_nid = cpu_to_node(env->src_cpu); |
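Reordering the checks above puts the static branch first, so the common case (NUMA balancing disabled) bails out before touching p->numa_faults or the domain flags. A minimal sketch of the kernel's static-key pattern as used here (the jump-label API calls are real; the surrounding function is illustrative):

```c
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);

static int migrate_degrades_locality_sketch(void)
{
	/*
	 * Compiles down to a patched NOP/jump, so when the key is off
	 * the load-balance path pays essentially nothing for this test.
	 */
	if (!static_branch_likely(&sched_numa_balancing))
		return -1;

	/* ... the more expensive per-task and per-domain checks follow ... */
	return 0;
}

/* Flipped once at runtime, e.g. when NUMA balancing is enabled:
 *	static_branch_enable(&sched_numa_balancing);
 */
```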
| @@ -5934,7 +5999,7 @@ struct sg_lb_stats { | |||
| 5934 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5999 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 5935 | unsigned long load_per_task; | 6000 | unsigned long load_per_task; |
| 5936 | unsigned long group_capacity; | 6001 | unsigned long group_capacity; |
| 5937 | unsigned long group_usage; /* Total usage of the group */ | 6002 | unsigned long group_util; /* Total utilization of the group */ |
| 5938 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 6003 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
| 5939 | unsigned int idle_cpus; | 6004 | unsigned int idle_cpus; |
| 5940 | unsigned int group_weight; | 6005 | unsigned int group_weight; |
| @@ -6010,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
| 6010 | return load_idx; | 6075 | return load_idx; |
| 6011 | } | 6076 | } |
| 6012 | 6077 | ||
| 6013 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
| 6014 | { | ||
| 6015 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | ||
| 6016 | return sd->smt_gain / sd->span_weight; | ||
| 6017 | |||
| 6018 | return SCHED_CAPACITY_SCALE; | ||
| 6019 | } | ||
| 6020 | |||
| 6021 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
| 6022 | { | ||
| 6023 | return default_scale_cpu_capacity(sd, cpu); | ||
| 6024 | } | ||
| 6025 | |||
| 6026 | static unsigned long scale_rt_capacity(int cpu) | 6078 | static unsigned long scale_rt_capacity(int cpu) |
| 6027 | { | 6079 | { |
| 6028 | struct rq *rq = cpu_rq(cpu); | 6080 | struct rq *rq = cpu_rq(cpu); |
| @@ -6052,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 6052 | 6104 | ||
| 6053 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6105 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
| 6054 | { | 6106 | { |
| 6055 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 6107 | unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); |
| 6056 | struct sched_group *sdg = sd->groups; | 6108 | struct sched_group *sdg = sd->groups; |
| 6057 | 6109 | ||
| 6058 | if (sched_feat(ARCH_CAPACITY)) | ||
| 6059 | capacity *= arch_scale_cpu_capacity(sd, cpu); | ||
| 6060 | else | ||
| 6061 | capacity *= default_scale_cpu_capacity(sd, cpu); | ||
| 6062 | |||
| 6063 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
| 6064 | |||
| 6065 | cpu_rq(cpu)->cpu_capacity_orig = capacity; | 6110 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
| 6066 | 6111 | ||
| 6067 | capacity *= scale_rt_capacity(cpu); | 6112 | capacity *= scale_rt_capacity(cpu); |
| @@ -6187,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
| 6187 | * group_has_capacity returns true if the group has spare capacity that could | 6232 | * group_has_capacity returns true if the group has spare capacity that could |
| 6188 | * be used by some tasks. | 6233 | * be used by some tasks. |
| 6189 | * We consider that a group has spare capacity if the number of tasks is | 6234 | * We consider that a group has spare capacity if the number of tasks is |
| 6190 | * smaller than the number of CPUs or if the usage is lower than the available | 6235 | * smaller than the number of CPUs or if the utilization is lower than the |
| 6191 | * capacity for CFS tasks. | 6236 | * available capacity for CFS tasks. |
| 6192 | * For the latter, we use a threshold to stabilize the state, to take into | 6237 | * For the latter, we use a threshold to stabilize the state, to take into |
| 6193 | * account the variance of the tasks' load and to return true if the available | 6238 | * account the variance of the tasks' load and to return true if the available |
| 6194 | * capacity is meaningful for the load balancer. | 6239 | * capacity is meaningful for the load balancer. |
| @@ -6202,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | |||
| 6202 | return true; | 6247 | return true; |
| 6203 | 6248 | ||
| 6204 | if ((sgs->group_capacity * 100) > | 6249 | if ((sgs->group_capacity * 100) > |
| 6205 | (sgs->group_usage * env->sd->imbalance_pct)) | 6250 | (sgs->group_util * env->sd->imbalance_pct)) |
| 6206 | return true; | 6251 | return true; |
| 6207 | 6252 | ||
| 6208 | return false; | 6253 | return false; |
| @@ -6223,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | |||
| 6223 | return false; | 6268 | return false; |
| 6224 | 6269 | ||
| 6225 | if ((sgs->group_capacity * 100) < | 6270 | if ((sgs->group_capacity * 100) < |
| 6226 | (sgs->group_usage * env->sd->imbalance_pct)) | 6271 | (sgs->group_util * env->sd->imbalance_pct)) |
| 6227 | return true; | 6272 | return true; |
| 6228 | 6273 | ||
| 6229 | return false; | 6274 | return false; |
| 6230 | } | 6275 | } |
| 6231 | 6276 | ||
| 6232 | static enum group_type group_classify(struct lb_env *env, | 6277 | static inline enum |
| 6233 | struct sched_group *group, | 6278 | group_type group_classify(struct sched_group *group, |
| 6234 | struct sg_lb_stats *sgs) | 6279 | struct sg_lb_stats *sgs) |
| 6235 | { | 6280 | { |
| 6236 | if (sgs->group_no_capacity) | 6281 | if (sgs->group_no_capacity) |
| 6237 | return group_overloaded; | 6282 | return group_overloaded; |
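Both threshold checks above compare group_capacity against group_util cross-multiplied with imbalance_pct, staying in integer arithmetic. A worked example with made-up numbers (imbalance_pct is per sched_domain; 125 is a common default, which makes the cutoff roughly 80% of capacity):

```c
#include <stdio.h>
#include <stdbool.h>

/* Same comparison as group_is_overloaded() above. */
static bool group_is_overloaded(unsigned long group_capacity,
				unsigned long group_util,
				unsigned int imbalance_pct)
{
	return (group_capacity * 100) < (group_util * imbalance_pct);
}

int main(void)
{
	unsigned long cap = 2048;	/* two CPUs at full capacity */

	/* 2048*100 = 204800 vs 1600*125 = 200000 -> not overloaded */
	printf("%d\n", group_is_overloaded(cap, 1600, 125));
	/* 2048*100 = 204800 vs 1700*125 = 212500 -> overloaded */
	printf("%d\n", group_is_overloaded(cap, 1700, 125));
	return 0;
}
```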
| @@ -6271,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6271 | load = source_load(i, load_idx); | 6316 | load = source_load(i, load_idx); |
| 6272 | 6317 | ||
| 6273 | sgs->group_load += load; | 6318 | sgs->group_load += load; |
| 6274 | sgs->group_usage += get_cpu_usage(i); | 6319 | sgs->group_util += cpu_util(i); |
| 6275 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6320 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 6276 | 6321 | ||
| 6277 | if (rq->nr_running > 1) | 6322 | if (rq->nr_running > 1) |
| @@ -6296,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6296 | sgs->group_weight = group->group_weight; | 6341 | sgs->group_weight = group->group_weight; |
| 6297 | 6342 | ||
| 6298 | sgs->group_no_capacity = group_is_overloaded(env, sgs); | 6343 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
| 6299 | sgs->group_type = group_classify(env, group, sgs); | 6344 | sgs->group_type = group_classify(group, sgs); |
| 6300 | } | 6345 | } |
| 6301 | 6346 | ||
| 6302 | /** | 6347 | /** |
| @@ -6430,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6430 | group_has_capacity(env, &sds->local_stat) && | 6475 | group_has_capacity(env, &sds->local_stat) && |
| 6431 | (sgs->sum_nr_running > 1)) { | 6476 | (sgs->sum_nr_running > 1)) { |
| 6432 | sgs->group_no_capacity = 1; | 6477 | sgs->group_no_capacity = 1; |
| 6433 | sgs->group_type = group_overloaded; | 6478 | sgs->group_type = group_classify(sg, sgs); |
| 6434 | } | 6479 | } |
| 6435 | 6480 | ||
| 6436 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6481 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| @@ -7610,8 +7655,22 @@ out: | |||
| 7610 | * When the CPU is attached to a null domain, for example, it will not be | 7655 | * When the CPU is attached to a null domain, for example, it will not be |
| 7611 | * updated. | 7656 | * updated. |
| 7612 | */ | 7657 | */ |
| 7613 | if (likely(update_next_balance)) | 7658 | if (likely(update_next_balance)) { |
| 7614 | rq->next_balance = next_balance; | 7659 | rq->next_balance = next_balance; |
| 7660 | |||
| 7661 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 7662 | /* | ||
| 7663 | * If this CPU has been elected to perform the nohz idle | ||
| 7664 | * balance, other idle CPUs have already rebalanced with | ||
| 7665 | * nohz_idle_balance() and nohz.next_balance has been | ||
| 7666 | * updated accordingly. This CPU is now running the idle load | ||
| 7667 | * balance for itself and we need to update the | ||
| 7668 | * nohz.next_balance accordingly. | ||
| 7669 | */ | ||
| 7670 | if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) | ||
| 7671 | nohz.next_balance = rq->next_balance; | ||
| 7672 | #endif | ||
| 7673 | } | ||
| 7615 | } | 7674 | } |
| 7616 | 7675 | ||
| 7617 | #ifdef CONFIG_NO_HZ_COMMON | 7676 | #ifdef CONFIG_NO_HZ_COMMON |
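The nohz.next_balance pull-forward added above relies on time_after() being wraparound-safe. A simplified sketch of the comparison (the real macro lives in include/linux/jiffies.h and adds type checking):

```c
/* Simplified: a is "after" b iff (b - a) is negative in signed arithmetic. */
#define time_after(a, b)	((long)((b) - (a)) < 0)

/*
 * Example near a 32-bit jiffies wrap:
 *	b = 0xfffffff0, a = b + 0x20 = 0x00000010 (wrapped)
 *	(long)(b - a) = (long)0xffffffe0 < 0  ->  time_after(a, b) is true,
 * even though a compares numerically smaller than b.
 */
```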
| @@ -7624,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
| 7624 | int this_cpu = this_rq->cpu; | 7683 | int this_cpu = this_rq->cpu; |
| 7625 | struct rq *rq; | 7684 | struct rq *rq; |
| 7626 | int balance_cpu; | 7685 | int balance_cpu; |
| 7686 | /* Earliest time when we have to do rebalance again */ | ||
| 7687 | unsigned long next_balance = jiffies + 60*HZ; | ||
| 7688 | int update_next_balance = 0; | ||
| 7627 | 7689 | ||
| 7628 | if (idle != CPU_IDLE || | 7690 | if (idle != CPU_IDLE || |
| 7629 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 7691 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
| @@ -7655,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
| 7655 | rebalance_domains(rq, CPU_IDLE); | 7717 | rebalance_domains(rq, CPU_IDLE); |
| 7656 | } | 7718 | } |
| 7657 | 7719 | ||
| 7658 | if (time_after(this_rq->next_balance, rq->next_balance)) | 7720 | if (time_after(next_balance, rq->next_balance)) { |
| 7659 | this_rq->next_balance = rq->next_balance; | 7721 | next_balance = rq->next_balance; |
| 7722 | update_next_balance = 1; | ||
| 7723 | } | ||
| 7660 | } | 7724 | } |
| 7661 | nohz.next_balance = this_rq->next_balance; | 7725 | |
| 7726 | /* | ||
| 7727 | * next_balance will be updated only when there is a need. | ||
| 7728 | * When the CPU is attached to a null domain, for example, it will not be | ||
| 7729 | * updated. | ||
| 7730 | */ | ||
| 7731 | if (likely(update_next_balance)) | ||
| 7732 | nohz.next_balance = next_balance; | ||
| 7662 | end: | 7733 | end: |
| 7663 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | 7734 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); |
| 7664 | } | 7735 | } |
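The nohz_idle_balance() fix is the same "track locally, commit once" idiom now used in rebalance_domains(): start from a far-future sentinel, remember the earliest per-rq deadline seen, and write the shared nohz state only if a real deadline was found. A runnable, standalone sketch of that shape (the rq walk, locking, and array of deadlines are stand-ins):

```c
#include <stdio.h>

#define HZ 250UL
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long jiffies = 100000;
	/* Deadlines we would find while walking the nohz-idle CPUs. */
	unsigned long rq_next_balance[] = { 100250, 100040, 100180 };
	unsigned long nohz_next_balance = 100500;	/* shared state */

	unsigned long next_balance = jiffies + 60 * HZ;	/* sentinel */
	int update_next_balance = 0;

	for (int i = 0; i < 3; i++) {
		if (time_after(next_balance, rq_next_balance[i])) {
			next_balance = rq_next_balance[i];	/* earliest wins */
			update_next_balance = 1;
		}
	}

	/* Commit once, and only if a real deadline was seen. */
	if (update_next_balance)
		nohz_next_balance = next_balance;

	printf("nohz.next_balance = %lu\n", nohz_next_balance);	/* 100040 */
	return 0;
}
```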
| @@ -7811,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
| 7811 | entity_tick(cfs_rq, se, queued); | 7882 | entity_tick(cfs_rq, se, queued); |
| 7812 | } | 7883 | } |
| 7813 | 7884 | ||
| 7814 | if (numabalancing_enabled) | 7885 | if (static_branch_unlikely(&sched_numa_balancing)) |
| 7815 | task_tick_numa(rq, curr); | 7886 | task_tick_numa(rq, curr); |
| 7816 | } | 7887 | } |
| 7817 | 7888 | ||
| @@ -7887,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | |||
| 7887 | check_preempt_curr(rq, p, 0); | 7958 | check_preempt_curr(rq, p, 0); |
| 7888 | } | 7959 | } |
| 7889 | 7960 | ||
| 7890 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | 7961 | static inline bool vruntime_normalized(struct task_struct *p) |
| 7891 | { | 7962 | { |
| 7892 | struct sched_entity *se = &p->se; | 7963 | struct sched_entity *se = &p->se; |
| 7893 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 7894 | 7964 | ||
| 7895 | /* | 7965 | /* |
| 7896 | * Ensure the task's vruntime is normalized, so that when it's | 7966 | * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, |
| 7897 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7967 | * the dequeue_entity(.flags=0) will already have normalized the |
| 7898 | * do the right thing. | 7968 | * vruntime. |
| 7969 | */ | ||
| 7970 | if (p->on_rq) | ||
| 7971 | return true; | ||
| 7972 | |||
| 7973 | /* | ||
| 7974 | * When !on_rq, vruntime of the task has usually NOT been normalized. | ||
| 7975 | * But there are some cases where it has already been normalized: | ||
| 7899 | * | 7976 | * |
| 7900 | * If it's queued, then the dequeue_entity(.flags=0) will already | 7977 | * - A forked child which is waiting to be woken up by |
| 7901 | * have normalized the vruntime, if it's !queued, then only when | 7978 | * wake_up_new_task(). |
| 7902 | * the task is sleeping will it still have non-normalized vruntime. | 7979 | * - A task which has been woken up by try_to_wake_up() and |
| 7980 | * waiting to actually be woken up by sched_ttwu_pending(). | ||
| 7903 | */ | 7981 | */ |
| 7904 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { | 7982 | if (!se->sum_exec_runtime || p->state == TASK_WAKING) |
| 7983 | return true; | ||
| 7984 | |||
| 7985 | return false; | ||
| 7986 | } | ||
| 7987 | |||
| 7988 | static void detach_task_cfs_rq(struct task_struct *p) | ||
| 7989 | { | ||
| 7990 | struct sched_entity *se = &p->se; | ||
| 7991 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 7992 | |||
| 7993 | if (!vruntime_normalized(p)) { | ||
| 7905 | /* | 7994 | /* |
| 7906 | * Fix up our vruntime so that the current sleep doesn't | 7995 | * Fix up our vruntime so that the current sleep doesn't |
| 7907 | * cause 'unlimited' sleep bonus. | 7996 | * cause 'unlimited' sleep bonus. |
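vruntime_normalized() exists because vruntime is only comparable within one cfs_rq: detaching subtracts min_vruntime to keep just the task's relative "lag", and attaching re-bases that lag on the destination queue. A toy example with made-up nanosecond values, not kernel code:

```c
#include <stdio.h>

int main(void)
{
	unsigned long long se_vruntime = 105000000ULL;	/* absolute, old cfs_rq */
	unsigned long long old_min     = 100000000ULL;	/* old queue's min_vruntime */
	unsigned long long new_min     = 900000000ULL;	/* new queue's min_vruntime */

	se_vruntime -= old_min;		/* detach: keep only the 5ms lag */
	se_vruntime += new_min;		/* attach: re-base on the new queue */

	printf("%llu\n", se_vruntime);	/* 905000000: still 5ms past new_min */
	return 0;
}
```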
| @@ -7910,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 7910 | se->vruntime -= cfs_rq->min_vruntime; | 7999 | se->vruntime -= cfs_rq->min_vruntime; |
| 7911 | } | 8000 | } |
| 7912 | 8001 | ||
| 7913 | #ifdef CONFIG_SMP | ||
| 7914 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8002 | /* Catch up with the cfs_rq and remove our load when we leave */ |
| 7915 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, | 8003 | detach_entity_load_avg(cfs_rq, se); |
| 7916 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); | ||
| 7917 | |||
| 7918 | cfs_rq->avg.load_avg = | ||
| 7919 | max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | ||
| 7920 | cfs_rq->avg.load_sum = | ||
| 7921 | max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | ||
| 7922 | cfs_rq->avg.util_avg = | ||
| 7923 | max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | ||
| 7924 | cfs_rq->avg.util_sum = | ||
| 7925 | max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | ||
| 7926 | #endif | ||
| 7927 | } | 8004 | } |
| 7928 | 8005 | ||
| 7929 | /* | 8006 | static void attach_task_cfs_rq(struct task_struct *p) |
| 7930 | * We switched to the sched_fair class. | ||
| 7931 | */ | ||
| 7932 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | ||
| 7933 | { | 8007 | { |
| 7934 | struct sched_entity *se = &p->se; | 8008 | struct sched_entity *se = &p->se; |
| 8009 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
| 7935 | 8010 | ||
| 7936 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8011 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7937 | /* | 8012 | /* |
| @@ -7941,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) | |||
| 7941 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 8016 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
| 7942 | #endif | 8017 | #endif |
| 7943 | 8018 | ||
| 7944 | if (!task_on_rq_queued(p)) { | 8019 | /* Synchronize task with its cfs_rq */ |
| 8020 | attach_entity_load_avg(cfs_rq, se); | ||
| 7945 | 8021 | ||
| 8022 | if (!vruntime_normalized(p)) | ||
| 8023 | se->vruntime += cfs_rq->min_vruntime; | ||
| 8024 | } | ||
| 8025 | |||
| 8026 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
| 8027 | { | ||
| 8028 | detach_task_cfs_rq(p); | ||
| 8029 | } | ||
| 8030 | |||
| 8031 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | ||
| 8032 | { | ||
| 8033 | attach_task_cfs_rq(p); | ||
| 8034 | |||
| 8035 | if (task_on_rq_queued(p)) { | ||
| 7946 | /* | 8036 | /* |
| 7947 | * Ensure the task has a non-normalized vruntime when it is switched | 8037 | * We were most likely switched from sched_rt, so |
| 7948 | * back to the fair class with !queued, so that enqueue_entity() at | 8038 | * kick off the schedule if running, otherwise just see |
| 7949 | * wake-up time will do the right thing. | 8039 | * if we can still preempt the current task. |
| 7950 | * | ||
| 7951 | * If it's queued, then the enqueue_entity(.flags=0) makes the task | ||
| 7952 | * has non-normalized vruntime, if it's !queued, then it still has | ||
| 7953 | * normalized vruntime. | ||
| 7954 | */ | 8040 | */ |
| 7955 | if (p->state != TASK_RUNNING) | 8041 | if (rq->curr == p) |
| 7956 | se->vruntime += cfs_rq_of(se)->min_vruntime; | 8042 | resched_curr(rq); |
| 7957 | return; | 8043 | else |
| 8044 | check_preempt_curr(rq, p, 0); | ||
| 7958 | } | 8045 | } |
| 7959 | |||
| 7960 | /* | ||
| 7961 | * We were most likely switched from sched_rt, so | ||
| 7962 | * kick off the schedule if running, otherwise just see | ||
| 7963 | * if we can still preempt the current task. | ||
| 7964 | */ | ||
| 7965 | if (rq->curr == p) | ||
| 7966 | resched_curr(rq); | ||
| 7967 | else | ||
| 7968 | check_preempt_curr(rq, p, 0); | ||
| 7969 | } | 8046 | } |
| 7970 | 8047 | ||
| 7971 | /* Account for a task changing its policy or group. | 8048 | /* Account for a task changing its policy or group. |
| @@ -8000,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 8000 | } | 8077 | } |
| 8001 | 8078 | ||
| 8002 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8079 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8003 | static void task_move_group_fair(struct task_struct *p, int queued) | 8080 | static void task_move_group_fair(struct task_struct *p) |
| 8004 | { | 8081 | { |
| 8005 | struct sched_entity *se = &p->se; | 8082 | detach_task_cfs_rq(p); |
| 8006 | struct cfs_rq *cfs_rq; | ||
| 8007 | |||
| 8008 | /* | ||
| 8009 | * If the task was not on the rq at the time of this cgroup movement | ||
| 8010 | * it must have been asleep, sleeping tasks keep their ->vruntime | ||
| 8011 | * absolute on their old rq until wakeup (needed for the fair sleeper | ||
| 8012 | * bonus in place_entity()). | ||
| 8013 | * | ||
| 8014 | * If it was on the rq, we've just 'preempted' it, which does convert | ||
| 8015 | * ->vruntime to a relative base. | ||
| 8016 | * | ||
| 8017 | * Make sure both cases convert their relative position when migrating | ||
| 8018 | * to another cgroup's rq. This does somewhat interfere with the | ||
| 8019 | * fair sleeper stuff for the first placement, but who cares. | ||
| 8020 | */ | ||
| 8021 | /* | ||
| 8022 | * When !queued, vruntime of the task has usually NOT been normalized. | ||
| 8023 | * But there are some cases where it has already been normalized: | ||
| 8024 | * | ||
| 8025 | * - Moving a forked child which is waiting for being woken up by | ||
| 8026 | * wake_up_new_task(). | ||
| 8027 | * - Moving a task which has been woken up by try_to_wake_up() and | ||
| 8028 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
| 8029 | * | ||
| 8030 | * To prevent boost or penalty in the new cfs_rq caused by delta | ||
| 8031 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | ||
| 8032 | */ | ||
| 8033 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | ||
| 8034 | queued = 1; | ||
| 8035 | |||
| 8036 | if (!queued) | ||
| 8037 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | ||
| 8038 | set_task_rq(p, task_cpu(p)); | 8083 | set_task_rq(p, task_cpu(p)); |
| 8039 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
| 8040 | if (!queued) { | ||
| 8041 | cfs_rq = cfs_rq_of(se); | ||
| 8042 | se->vruntime += cfs_rq->min_vruntime; | ||
| 8043 | 8084 | ||
| 8044 | #ifdef CONFIG_SMP | 8085 | #ifdef CONFIG_SMP |
| 8045 | /* Virtually synchronize task with its new cfs_rq */ | 8086 | /* Tell se's cfs_rq has been changed -- migrated */ |
| 8046 | p->se.avg.last_update_time = cfs_rq->avg.last_update_time; | 8087 | p->se.avg.last_update_time = 0; |
| 8047 | cfs_rq->avg.load_avg += p->se.avg.load_avg; | ||
| 8048 | cfs_rq->avg.load_sum += p->se.avg.load_sum; | ||
| 8049 | cfs_rq->avg.util_avg += p->se.avg.util_avg; | ||
| 8050 | cfs_rq->avg.util_sum += p->se.avg.util_sum; | ||
| 8051 | #endif | 8088 | #endif |
| 8052 | } | 8089 | attach_task_cfs_rq(p); |
| 8053 | } | 8090 | } |
| 8054 | 8091 | ||
| 8055 | void free_fair_sched_group(struct task_group *tg) | 8092 | void free_fair_sched_group(struct task_group *tg) |
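task_move_group_fair() now collapses to detach / retarget / attach, with p->se.avg.last_update_time = 0 acting as a "freshly migrated" marker. A hedged sketch of how an attach path can consume that marker; the helper and struct names here are hypothetical stand-ins modelled on the description above, not the exact kernel functions:

```c
struct load_avg_sketch {
	unsigned long load_avg, util_avg;
	unsigned long long last_update_time;
};
struct cfs_rq_sketch { struct load_avg_sketch avg; };
struct se_sketch     { struct load_avg_sketch avg; };

static void attach_load_sketch(struct cfs_rq_sketch *cfs_rq, struct se_sketch *se)
{
	/* A zeroed timestamp means "just migrated": adopt the destination
	 * queue's clock rather than decaying across the unmeasured gap. */
	if (!se->avg.last_update_time)
		se->avg.last_update_time = cfs_rq->avg.last_update_time;

	cfs_rq->avg.load_avg += se->avg.load_avg;
	cfs_rq->avg.util_avg += se->avg.util_avg;
}
```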
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 83a50e7ca533..69631fa46c2f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
| 36 | */ | 36 | */ |
| 37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | 37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) |
| 38 | 38 | ||
| 39 | /* | ||
| 40 | * Use arch dependent cpu capacity functions | ||
| 41 | */ | ||
| 42 | SCHED_FEAT(ARCH_CAPACITY, true) | ||
| 43 | |||
| 44 | SCHED_FEAT(HRTICK, false) | 39 | SCHED_FEAT(HRTICK, false) |
| 45 | SCHED_FEAT(DOUBLE_TICK, false) | 40 | SCHED_FEAT(DOUBLE_TICK, false) |
| 46 | SCHED_FEAT(LB_BIAS, true) | 41 | SCHED_FEAT(LB_BIAS, true) |
| @@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) | |||
| 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 67 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
| 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 68 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
| 74 | SCHED_FEAT(LB_MIN, false) | 69 | SCHED_FEAT(LB_MIN, false) |
| 70 | SCHED_FEAT(ATTACH_AGE_LOAD, true) | ||
| 75 | 71 | ||
| 76 | /* | ||
| 77 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
| 78 | * at runtime if running on a NUMA machine. Can be controlled via | ||
| 79 | * numa_balancing= | ||
| 80 | */ | ||
| 81 | #ifdef CONFIG_NUMA_BALANCING | ||
| 82 | |||
| 83 | /* | ||
| 84 | * NUMA will favor moving tasks towards nodes where a higher number of | ||
| 85 | * hinting faults are recorded during active load balancing. It will | ||
| 86 | * resist moving tasks towards nodes where a lower number of hinting | ||
| 87 | * faults have been recorded. | ||
| 88 | */ | ||
| 89 | SCHED_FEAT(NUMA, true) | ||
| 90 | #endif | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d2ea59364a1c..e3cc16312046 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) | |||
| 635 | /* | 635 | /* |
| 636 | * We ran out of runtime, see if we can borrow some from our neighbours. | 636 | * We ran out of runtime, see if we can borrow some from our neighbours. |
| 637 | */ | 637 | */ |
| 638 | static int do_balance_runtime(struct rt_rq *rt_rq) | 638 | static void do_balance_runtime(struct rt_rq *rt_rq) |
| 639 | { | 639 | { |
| 640 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 640 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| 641 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; | 641 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; |
| 642 | int i, weight, more = 0; | 642 | int i, weight; |
| 643 | u64 rt_period; | 643 | u64 rt_period; |
| 644 | 644 | ||
| 645 | weight = cpumask_weight(rd->span); | 645 | weight = cpumask_weight(rd->span); |
| @@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) | |||
| 673 | diff = rt_period - rt_rq->rt_runtime; | 673 | diff = rt_period - rt_rq->rt_runtime; |
| 674 | iter->rt_runtime -= diff; | 674 | iter->rt_runtime -= diff; |
| 675 | rt_rq->rt_runtime += diff; | 675 | rt_rq->rt_runtime += diff; |
| 676 | more = 1; | ||
| 677 | if (rt_rq->rt_runtime == rt_period) { | 676 | if (rt_rq->rt_runtime == rt_period) { |
| 678 | raw_spin_unlock(&iter->rt_runtime_lock); | 677 | raw_spin_unlock(&iter->rt_runtime_lock); |
| 679 | break; | 678 | break; |
| @@ -683,8 +682,6 @@ next: | |||
| 683 | raw_spin_unlock(&iter->rt_runtime_lock); | 682 | raw_spin_unlock(&iter->rt_runtime_lock); |
| 684 | } | 683 | } |
| 685 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 684 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
| 686 | |||
| 687 | return more; | ||
| 688 | } | 685 | } |
| 689 | 686 | ||
| 690 | /* | 687 | /* |
| @@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq) | |||
| 796 | } | 793 | } |
| 797 | } | 794 | } |
| 798 | 795 | ||
| 799 | static int balance_runtime(struct rt_rq *rt_rq) | 796 | static void balance_runtime(struct rt_rq *rt_rq) |
| 800 | { | 797 | { |
| 801 | int more = 0; | ||
| 802 | |||
| 803 | if (!sched_feat(RT_RUNTIME_SHARE)) | 798 | if (!sched_feat(RT_RUNTIME_SHARE)) |
| 804 | return more; | 799 | return; |
| 805 | 800 | ||
| 806 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | 801 | if (rt_rq->rt_time > rt_rq->rt_runtime) { |
| 807 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 802 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
| 808 | more = do_balance_runtime(rt_rq); | 803 | do_balance_runtime(rt_rq); |
| 809 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 804 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
| 810 | } | 805 | } |
| 811 | |||
| 812 | return more; | ||
| 813 | } | 806 | } |
| 814 | #else /* !CONFIG_SMP */ | 807 | #else /* !CONFIG_SMP */ |
| 815 | static inline int balance_runtime(struct rt_rq *rt_rq) | 808 | static inline void balance_runtime(struct rt_rq *rt_rq) {} |
| 816 | { | ||
| 817 | return 0; | ||
| 818 | } | ||
| 819 | #endif /* CONFIG_SMP */ | 809 | #endif /* CONFIG_SMP */ |
| 820 | 810 | ||
| 821 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | 811 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d2a119c7ad9..efd3bfc7e347 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } | |||
| 84 | */ | 84 | */ |
| 85 | #define RUNTIME_INF ((u64)~0ULL) | 85 | #define RUNTIME_INF ((u64)~0ULL) |
| 86 | 86 | ||
| 87 | static inline int idle_policy(int policy) | ||
| 88 | { | ||
| 89 | return policy == SCHED_IDLE; | ||
| 90 | } | ||
| 87 | static inline int fair_policy(int policy) | 91 | static inline int fair_policy(int policy) |
| 88 | { | 92 | { |
| 89 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | 93 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; |
| @@ -98,6 +102,11 @@ static inline int dl_policy(int policy) | |||
| 98 | { | 102 | { |
| 99 | return policy == SCHED_DEADLINE; | 103 | return policy == SCHED_DEADLINE; |
| 100 | } | 104 | } |
| 105 | static inline bool valid_policy(int policy) | ||
| 106 | { | ||
| 107 | return idle_policy(policy) || fair_policy(policy) || | ||
| 108 | rt_policy(policy) || dl_policy(policy); | ||
| 109 | } | ||
| 101 | 110 | ||
| 102 | static inline int task_has_rt_policy(struct task_struct *p) | 111 | static inline int task_has_rt_policy(struct task_struct *p) |
| 103 | { | 112 | { |
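idle_policy() and valid_policy() centralize policy checks that used to be open-coded. A short sketch of the kind of validation valid_policy() enables; the exact call site is outside this hunk, so the surrounding function is illustrative:

```c
/* Illustrative validation helper built on the predicates above. */
static int check_sched_policy(int policy)
{
	if (!valid_policy(policy))
		return -EINVAL;	/* anything outside the known set is rejected */
	return 0;
}
```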
| @@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) | |||
| 109 | return dl_policy(p->policy); | 118 | return dl_policy(p->policy); |
| 110 | } | 119 | } |
| 111 | 120 | ||
| 112 | static inline bool dl_time_before(u64 a, u64 b) | ||
| 113 | { | ||
| 114 | return (s64)(a - b) < 0; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* | 121 | /* |
| 118 | * Tells if entity @a should preempt entity @b. | 122 | * Tells if entity @a should preempt entity @b. |
| 119 | */ | 123 | */ |
| @@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
| 1003 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 1007 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
| 1004 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 1008 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
| 1005 | 1009 | ||
| 1006 | #ifdef CONFIG_NUMA_BALANCING | 1010 | extern struct static_key_false sched_numa_balancing; |
| 1007 | #define sched_feat_numa(x) sched_feat(x) | ||
| 1008 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1009 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
| 1010 | #else | ||
| 1011 | extern bool numabalancing_enabled; | ||
| 1012 | #endif /* CONFIG_SCHED_DEBUG */ | ||
| 1013 | #else | ||
| 1014 | #define sched_feat_numa(x) (0) | ||
| 1015 | #define numabalancing_enabled (0) | ||
| 1016 | #endif /* CONFIG_NUMA_BALANCING */ | ||
| 1017 | 1011 | ||
| 1018 | static inline u64 global_rt_period(void) | 1012 | static inline u64 global_rt_period(void) |
| 1019 | { | 1013 | { |
| @@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { | |||
| 1157 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1151 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
| 1158 | }; | 1152 | }; |
| 1159 | 1153 | ||
| 1160 | #define ENQUEUE_WAKEUP 1 | 1154 | #define ENQUEUE_WAKEUP 0x01 |
| 1161 | #define ENQUEUE_HEAD 2 | 1155 | #define ENQUEUE_HEAD 0x02 |
| 1162 | #ifdef CONFIG_SMP | 1156 | #ifdef CONFIG_SMP |
| 1163 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ | 1157 | #define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ |
| 1164 | #else | 1158 | #else |
| 1165 | #define ENQUEUE_WAKING 0 | 1159 | #define ENQUEUE_WAKING 0x00 |
| 1166 | #endif | 1160 | #endif |
| 1167 | #define ENQUEUE_REPLENISH 8 | 1161 | #define ENQUEUE_REPLENISH 0x08 |
| 1162 | #define ENQUEUE_RESTORE 0x10 | ||
| 1168 | 1163 | ||
| 1169 | #define DEQUEUE_SLEEP 1 | 1164 | #define DEQUEUE_SLEEP 0x01 |
| 1165 | #define DEQUEUE_SAVE 0x02 | ||
| 1170 | 1166 | ||
| 1171 | #define RETRY_TASK ((void *)-1UL) | 1167 | #define RETRY_TASK ((void *)-1UL) |
| 1172 | 1168 | ||
| @@ -1194,7 +1190,7 @@ struct sched_class { | |||
| 1194 | 1190 | ||
| 1195 | #ifdef CONFIG_SMP | 1191 | #ifdef CONFIG_SMP |
| 1196 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1192 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
| 1197 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1193 | void (*migrate_task_rq)(struct task_struct *p); |
| 1198 | 1194 | ||
| 1199 | void (*task_waking) (struct task_struct *task); | 1195 | void (*task_waking) (struct task_struct *task); |
| 1200 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1196 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
| @@ -1227,7 +1223,7 @@ struct sched_class { | |||
| 1227 | void (*update_curr) (struct rq *rq); | 1223 | void (*update_curr) (struct rq *rq); |
| 1228 | 1224 | ||
| 1229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1225 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1230 | void (*task_move_group) (struct task_struct *p, int on_rq); | 1226 | void (*task_move_group) (struct task_struct *p); |
| 1231 | #endif | 1227 | #endif |
| 1232 | }; | 1228 | }; |
| 1233 | 1229 | ||
| @@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
| 1405 | } | 1401 | } |
| 1406 | #endif | 1402 | #endif |
| 1407 | 1403 | ||
| 1404 | #ifndef arch_scale_cpu_capacity | ||
| 1405 | static __always_inline | ||
| 1406 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
| 1407 | { | ||
| 1408 | if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | ||
| 1409 | return sd->smt_gain / sd->span_weight; | ||
| 1410 | |||
| 1411 | return SCHED_CAPACITY_SCALE; | ||
| 1412 | } | ||
| 1413 | #endif | ||
| 1414 | |||
| 1408 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1415 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
| 1409 | { | 1416 | { |
| 1410 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); | 1417 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
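The arch_scale_cpu_capacity() default moved here behaves exactly like the deleted default_scale_cpu_capacity(): SMT siblings split smt_gain, everything else reports full scale. A standalone worked example; the smt_gain value of 1178 (~1.15 * SCHED_CAPACITY_SCALE) is the stock default and is an assumption here:

```c
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Default behaviour mirrored from the sched.h hunk above. */
static unsigned long scale_cpu_capacity(int share_cpucapacity,
					unsigned int span_weight,
					unsigned long smt_gain)
{
	if (share_cpucapacity && span_weight > 1)
		return smt_gain / span_weight;
	return SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* SMT-2 sibling domain: 1178 / 2 = 589 capacity units per thread. */
	printf("%lu\n", scale_cpu_capacity(1, 2, 1178));
	/* Non-SMT (or sd == NULL): full 1024. */
	printf("%lu\n", scale_cpu_capacity(0, 1, 1178));
	return 0;
}
```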
