author     Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 21:03:50 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 21:03:50 -0500
commit     53528695ff6d8b77011bc818407c13e30914a946
tree       04acd099c5759bf6f1d728c5415f574d572c6872
parent     b831ef2cad979912850e34f82415c0c5d59de8cb
parent     e73e85f0593832aa583b252f9a16cf90ed6d30fa
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The main changes in this cycle were:
- sched/fair load tracking fixes and cleanups (Byungchul Park)
- Make load tracking frequency scale invariant (Dietmar Eggemann)
- sched/deadline updates (Juri Lelli)
- stop machine fixes, cleanups and enhancements for bugs triggered by
CPU hotplug stress testing (Oleg Nesterov)
- scheduler preemption code rework: remove PREEMPT_ACTIVE and related
cleanups (Peter Zijlstra)
- Rework the sched_info::run_delay code to fix races (Peter Zijlstra)
- Optimize per entity utilization tracking (Peter Zijlstra)
- ... misc other fixes, cleanups and smaller updates"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
sched: Start stopper early
stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
sched/x86: Fix typo in __switch_to() comments
sched/core: Remove a parameter in the migrate_task_rq() function
sched/core: Drop unlikely behind BUG_ON()
sched/core: Fix task and run queue sched_info::run_delay inconsistencies
sched/numa: Fix task_tick_fair() from disabling numa_balancing
sched/core: Add preempt_count invariant check
sched/core: More notrace annotations
sched/core: Kill PREEMPT_ACTIVE
sched/core, sched/x86: Kill thread_info::saved_preempt_count
sched/core: Simplify preempt_count tests
sched/core: Robustify preemption leak checks
sched/core: Stop setting PREEMPT_ACTIVE
...
 arch/x86/include/asm/preempt.h      |   5
 arch/x86/include/asm/thread_info.h  |   2
 arch/x86/kernel/process_32.c        |   8
 arch/x86/kernel/process_64.c        |  10
 include/asm-generic/preempt.h       |   2
 include/linux/preempt.h             |  20
 include/linux/sched.h               |  36
 include/linux/sched/deadline.h      |   5
 include/linux/smpboot.h             |   4
 include/linux/stop_machine.h        |   2
 include/trace/events/sched.h        |  22
 kernel/cpu.c                        |  10
 kernel/exit.c                       |   4
 kernel/locking/rtmutex.c            |   3
 kernel/sched/core.c                 | 203
 kernel/sched/cpudeadline.c          |   5
 kernel/sched/cpudeadline.h          |   1
 kernel/sched/fair.c                 | 419
 kernel/sched/features.h             |  21
 kernel/sched/rt.c                   |  22
 kernel/sched/sched.h                |  55
 kernel/smpboot.c                    |   5
 kernel/stop_machine.c               |  90
 kernel/trace/ftrace.c               |   2
 kernel/trace/trace_sched_switch.c   |   3
 kernel/trace/trace_sched_wakeup.c   |   2
 26 files changed, 492 insertions, 469 deletions
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index b12f81022a6b..01bcde84d3e4 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h | |||
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc) | |||
30 | /* | 30 | /* |
31 | * must be macros to avoid header recursion hell | 31 | * must be macros to avoid header recursion hell |
32 | */ | 32 | */ |
33 | #define init_task_preempt_count(p) do { \ | 33 | #define init_task_preempt_count(p) do { } while (0) |
34 | task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ | ||
35 | } while (0) | ||
36 | 34 | ||
37 | #define init_idle_preempt_count(p, cpu) do { \ | 35 | #define init_idle_preempt_count(p, cpu) do { \ |
38 | task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ | ||
39 | per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ | 36 | per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ |
40 | } while (0) | 37 | } while (0) |
41 | 38 | ||
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8afdc3e44247..809877e9030b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h | |||
@@ -57,7 +57,6 @@ struct thread_info { | |||
57 | __u32 flags; /* low level flags */ | 57 | __u32 flags; /* low level flags */ |
58 | __u32 status; /* thread synchronous flags */ | 58 | __u32 status; /* thread synchronous flags */ |
59 | __u32 cpu; /* current CPU */ | 59 | __u32 cpu; /* current CPU */ |
60 | int saved_preempt_count; | ||
61 | mm_segment_t addr_limit; | 60 | mm_segment_t addr_limit; |
62 | void __user *sysenter_return; | 61 | void __user *sysenter_return; |
63 | unsigned int sig_on_uaccess_error:1; | 62 | unsigned int sig_on_uaccess_error:1; |
@@ -69,7 +68,6 @@ struct thread_info { | |||
69 | .task = &tsk, \ | 68 | .task = &tsk, \ |
70 | .flags = 0, \ | 69 | .flags = 0, \ |
71 | .cpu = 0, \ | 70 | .cpu = 0, \ |
72 | .saved_preempt_count = INIT_PREEMPT_COUNT, \ | ||
73 | .addr_limit = KERNEL_DS, \ | 71 | .addr_limit = KERNEL_DS, \ |
74 | } | 72 | } |
75 | 73 | ||
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 737527b40e5b..9f950917528b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -280,14 +280,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
280 | set_iopl_mask(next->iopl); | 280 | set_iopl_mask(next->iopl); |
281 | 281 | ||
282 | /* | 282 | /* |
283 | * If it were not for PREEMPT_ACTIVE we could guarantee that the | ||
284 | * preempt_count of all tasks was equal here and this would not be | ||
285 | * needed. | ||
286 | */ | ||
287 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | ||
288 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | ||
289 | |||
290 | /* | ||
291 | * Now maybe handle debug registers and/or IO bitmaps | 283 | * Now maybe handle debug registers and/or IO bitmaps |
292 | */ | 284 | */ |
293 | if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || | 285 | if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || |
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b35921a670b2..e835d263a33b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c | |||
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
332 | /* | 332 | /* |
333 | * Switch FS and GS. | 333 | * Switch FS and GS. |
334 | * | 334 | * |
335 | * These are even more complicated than FS and GS: they have | 335 | * These are even more complicated than DS and ES: they have |
336 | * 64-bit bases are that controlled by arch_prctl. Those bases | 336 | * 64-bit bases are that controlled by arch_prctl. Those bases |
337 | * only differ from the values in the GDT or LDT if the selector | 337 | * only differ from the values in the GDT or LDT if the selector |
338 | * is 0. | 338 | * is 0. |
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |||
401 | */ | 401 | */ |
402 | this_cpu_write(current_task, next_p); | 402 | this_cpu_write(current_task, next_p); |
403 | 403 | ||
404 | /* | ||
405 | * If it were not for PREEMPT_ACTIVE we could guarantee that the | ||
406 | * preempt_count of all tasks was equal here and this would not be | ||
407 | * needed. | ||
408 | */ | ||
409 | task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); | ||
410 | this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); | ||
411 | |||
412 | /* Reload esp0 and ss1. This changes current_thread_info(). */ | 404 | /* Reload esp0 and ss1. This changes current_thread_info(). */ |
413 | load_sp0(tss, next); | 405 | load_sp0(tss, next); |
414 | 406 | ||
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 0bec580a4885..5d8ffa3e6f8c 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h | |||
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc) | |||
24 | * must be macros to avoid header recursion hell | 24 | * must be macros to avoid header recursion hell |
25 | */ | 25 | */ |
26 | #define init_task_preempt_count(p) do { \ | 26 | #define init_task_preempt_count(p) do { \ |
27 | task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ | 27 | task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \ |
28 | } while (0) | 28 | } while (0) |
29 | 29 | ||
30 | #define init_idle_preempt_count(p, cpu) do { \ | 30 | #define init_idle_preempt_count(p, cpu) do { \ |
diff --git a/include/linux/preempt.h b/include/linux/preempt.h index bea8dd8ff5e0..75e4e30677f1 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h | |||
@@ -26,7 +26,6 @@ | |||
26 | * SOFTIRQ_MASK: 0x0000ff00 | 26 | * SOFTIRQ_MASK: 0x0000ff00 |
27 | * HARDIRQ_MASK: 0x000f0000 | 27 | * HARDIRQ_MASK: 0x000f0000 |
28 | * NMI_MASK: 0x00100000 | 28 | * NMI_MASK: 0x00100000 |
29 | * PREEMPT_ACTIVE: 0x00200000 | ||
30 | * PREEMPT_NEED_RESCHED: 0x80000000 | 29 | * PREEMPT_NEED_RESCHED: 0x80000000 |
31 | */ | 30 | */ |
32 | #define PREEMPT_BITS 8 | 31 | #define PREEMPT_BITS 8 |
@@ -53,10 +52,6 @@ | |||
53 | 52 | ||
54 | #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) | 53 | #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) |
55 | 54 | ||
56 | #define PREEMPT_ACTIVE_BITS 1 | ||
57 | #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) | ||
58 | #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) | ||
59 | |||
60 | /* We use the MSB mostly because its available */ | 55 | /* We use the MSB mostly because its available */ |
61 | #define PREEMPT_NEED_RESCHED 0x80000000 | 56 | #define PREEMPT_NEED_RESCHED 0x80000000 |
62 | 57 | ||
@@ -126,8 +121,7 @@ | |||
126 | * Check whether we were atomic before we did preempt_disable(): | 121 | * Check whether we were atomic before we did preempt_disable(): |
127 | * (used by the scheduler) | 122 | * (used by the scheduler) |
128 | */ | 123 | */ |
129 | #define in_atomic_preempt_off() \ | 124 | #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) |
130 | ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) | ||
131 | 125 | ||
132 | #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) | 126 | #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) |
133 | extern void preempt_count_add(int val); | 127 | extern void preempt_count_add(int val); |
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val); | |||
146 | #define preempt_count_inc() preempt_count_add(1) | 140 | #define preempt_count_inc() preempt_count_add(1) |
147 | #define preempt_count_dec() preempt_count_sub(1) | 141 | #define preempt_count_dec() preempt_count_sub(1) |
148 | 142 | ||
149 | #define preempt_active_enter() \ | ||
150 | do { \ | ||
151 | preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ | ||
152 | barrier(); \ | ||
153 | } while (0) | ||
154 | |||
155 | #define preempt_active_exit() \ | ||
156 | do { \ | ||
157 | barrier(); \ | ||
158 | preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ | ||
159 | } while (0) | ||
160 | |||
161 | #ifdef CONFIG_PREEMPT_COUNT | 143 | #ifdef CONFIG_PREEMPT_COUNT |
162 | 144 | ||
163 | #define preempt_disable() \ | 145 | #define preempt_disable() \ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 56667292d1e4..9e1e06c3ce05 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -599,20 +599,26 @@ struct task_cputime_atomic { | |||
599 | .sum_exec_runtime = ATOMIC64_INIT(0), \ | 599 | .sum_exec_runtime = ATOMIC64_INIT(0), \ |
600 | } | 600 | } |
601 | 601 | ||
602 | #ifdef CONFIG_PREEMPT_COUNT | 602 | #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) |
603 | #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) | 603 | |
604 | #else | 604 | /* |
605 | #define PREEMPT_DISABLED PREEMPT_ENABLED | 605 | * Disable preemption until the scheduler is running -- use an unconditional |
606 | #endif | 606 | * value so that it also works on !PREEMPT_COUNT kernels. |
607 | * | ||
608 | * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). | ||
609 | */ | ||
610 | #define INIT_PREEMPT_COUNT PREEMPT_OFFSET | ||
607 | 611 | ||
608 | /* | 612 | /* |
609 | * Disable preemption until the scheduler is running. | 613 | * Initial preempt_count value; reflects the preempt_count schedule invariant |
610 | * Reset by start_kernel()->sched_init()->init_idle(). | 614 | * which states that during context switches: |
611 | * | 615 | * |
612 | * We include PREEMPT_ACTIVE to avoid cond_resched() from working | 616 | * preempt_count() == 2*PREEMPT_DISABLE_OFFSET |
613 | * before the scheduler is active -- see should_resched(). | 617 | * |
618 | * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. | ||
619 | * Note: See finish_task_switch(). | ||
614 | */ | 620 | */ |
615 | #define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) | 621 | #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) |
616 | 622 | ||
617 | /** | 623 | /** |
618 | * struct thread_group_cputimer - thread group interval timer counts | 624 | * struct thread_group_cputimer - thread group interval timer counts |
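The new constants are easier to follow with the offsets written out. A rough worked example, my own illustration rather than part of the patch (exact base values such as PREEMPT_ENABLED are config- and arch-dependent):

    /*
     * Illustration only -- not part of this patch. With CONFIG_PREEMPT_COUNT=y,
     * PREEMPT_DISABLE_OFFSET is one level of preempt_disable(), so:
     *
     *   INIT_PREEMPT_COUNT = PREEMPT_OFFSET                        (always 1)
     *   FORK_PREEMPT_COUNT = 2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED
     *                      = two disable levels above the "enabled" base
     *
     * which is exactly the context-switch invariant checked later in
     * finish_task_switch(): schedule() holds one preempt_disable() and
     * __schedule() holds rq->lock, giving preempt_count() == 2*PREEMPT_DISABLE_OFFSET.
     * With CONFIG_PREEMPT_COUNT=n, PREEMPT_DISABLE_OFFSET is 0, so the same
     * definitions still compile and the invariant degenerates harmlessly.
     */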
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level { | |||
1142 | #endif | 1148 | #endif |
1143 | }; | 1149 | }; |
1144 | 1150 | ||
1145 | extern struct sched_domain_topology_level *sched_domain_topology; | ||
1146 | |||
1147 | extern void set_sched_topology(struct sched_domain_topology_level *tl); | 1151 | extern void set_sched_topology(struct sched_domain_topology_level *tl); |
1148 | extern void wake_up_if_idle(int cpu); | 1152 | extern void wake_up_if_idle(int cpu); |
1149 | 1153 | ||
@@ -1192,10 +1196,10 @@ struct load_weight { | |||
1192 | 1196 | ||
1193 | /* | 1197 | /* |
1194 | * The load_avg/util_avg accumulates an infinite geometric series. | 1198 | * The load_avg/util_avg accumulates an infinite geometric series. |
1195 | * 1) load_avg factors the amount of time that a sched_entity is | 1199 | * 1) load_avg factors frequency scaling into the amount of time that a |
1196 | * runnable on a rq into its weight. For cfs_rq, it is the aggregated | 1200 | * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the |
1197 | * such weights of all runnable and blocked sched_entities. | 1201 | * aggregated such weights of all runnable and blocked sched_entities. |
1198 | * 2) util_avg factors frequency scaling into the amount of time | 1202 | * 2) util_avg factors frequency and cpu scaling into the amount of time |
1199 | * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. | 1203 | * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. |
1200 | * For cfs_rq, it is the aggregated such times of all runnable and | 1204 | * For cfs_rq, it is the aggregated such times of all runnable and |
1201 | * blocked sched_entities. | 1205 | * blocked sched_entities. |
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 9d303b8847df..9089a2ae913d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h | |||
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p) | |||
21 | return dl_prio(p->prio); | 21 | return dl_prio(p->prio); |
22 | } | 22 | } |
23 | 23 | ||
24 | static inline bool dl_time_before(u64 a, u64 b) | ||
25 | { | ||
26 | return (s64)(a - b) < 0; | ||
27 | } | ||
28 | |||
24 | #endif /* _SCHED_DEADLINE_H */ | 29 | #endif /* _SCHED_DEADLINE_H */ |
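dl_time_before() moves here (it is deleted from kernel/sched/cpudeadline.c further down) so that kernel/locking/rtmutex.c can reuse it. The signed-difference form is what makes the deadline comparison safe across u64 wraparound; a stand-alone sketch of the idea in ordinary userspace C (illustration only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as the kernel helper: "a is earlier than b", modulo 2^64. */
    static inline int dl_time_before(uint64_t a, uint64_t b)
    {
        return (int64_t)(a - b) < 0;
    }

    int main(void)
    {
        uint64_t before_wrap = UINT64_MAX - 10; /* deadline just before the counter wraps */
        uint64_t after_wrap  = 5;               /* deadline just after the wrap */

        /* A plain "<" claims the post-wrap deadline is the earlier one ... */
        printf("naive <       : %d\n", after_wrap < before_wrap);                 /* 1: wrong order */
        /* ... while the signed difference keeps the intended ordering. */
        printf("dl_time_before: %d\n", dl_time_before(before_wrap, after_wrap));  /* 1: correct */
        return 0;
    }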
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index e6109a6cd8f6..12910cf19869 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h | |||
@@ -24,9 +24,6 @@ struct smpboot_thread_data; | |||
24 | * parked (cpu offline) | 24 | * parked (cpu offline) |
25 | * @unpark: Optional unpark function, called when the thread is | 25 | * @unpark: Optional unpark function, called when the thread is |
26 | * unparked (cpu online) | 26 | * unparked (cpu online) |
27 | * @pre_unpark: Optional unpark function, called before the thread is | ||
28 | * unparked (cpu online). This is not guaranteed to be | ||
29 | * called on the target cpu of the thread. Careful! | ||
30 | * @cpumask: Internal state. To update which threads are unparked, | 27 | * @cpumask: Internal state. To update which threads are unparked, |
31 | * call smpboot_update_cpumask_percpu_thread(). | 28 | * call smpboot_update_cpumask_percpu_thread(). |
32 | * @selfparking: Thread is not parked by the park function. | 29 | * @selfparking: Thread is not parked by the park function. |
@@ -42,7 +39,6 @@ struct smp_hotplug_thread { | |||
42 | void (*cleanup)(unsigned int cpu, bool online); | 39 | void (*cleanup)(unsigned int cpu, bool online); |
43 | void (*park)(unsigned int cpu); | 40 | void (*park)(unsigned int cpu); |
44 | void (*unpark)(unsigned int cpu); | 41 | void (*unpark)(unsigned int cpu); |
45 | void (*pre_unpark)(unsigned int cpu); | ||
46 | cpumask_var_t cpumask; | 42 | cpumask_var_t cpumask; |
47 | bool selfparking; | 43 | bool selfparking; |
48 | const char *thread_comm; | 44 | const char *thread_comm; |
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 414d924318ce..0adedca24c5b 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h | |||
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
33 | struct cpu_stop_work *work_buf); | 33 | struct cpu_stop_work *work_buf); |
34 | int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | 34 | int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); |
35 | int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); | 35 | int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); |
36 | void stop_machine_park(int cpu); | ||
37 | void stop_machine_unpark(int cpu); | ||
36 | 38 | ||
37 | #else /* CONFIG_SMP */ | 39 | #else /* CONFIG_SMP */ |
38 | 40 | ||
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 539d6bc3216a..9b90c57517a9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, | |||
104 | TP_ARGS(p)); | 104 | TP_ARGS(p)); |
105 | 105 | ||
106 | #ifdef CREATE_TRACE_POINTS | 106 | #ifdef CREATE_TRACE_POINTS |
107 | static inline long __trace_sched_switch_state(struct task_struct *p) | 107 | static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) |
108 | { | 108 | { |
109 | long state = p->state; | ||
110 | |||
111 | #ifdef CONFIG_PREEMPT | ||
112 | #ifdef CONFIG_SCHED_DEBUG | 109 | #ifdef CONFIG_SCHED_DEBUG |
113 | BUG_ON(p != current); | 110 | BUG_ON(p != current); |
114 | #endif /* CONFIG_SCHED_DEBUG */ | 111 | #endif /* CONFIG_SCHED_DEBUG */ |
112 | |||
115 | /* | 113 | /* |
116 | * For all intents and purposes a preempted task is a running task. | 114 | * Preemption ignores task state, therefore preempted tasks are always |
115 | * RUNNING (we will not have dequeued if state != RUNNING). | ||
117 | */ | 116 | */ |
118 | if (preempt_count() & PREEMPT_ACTIVE) | 117 | return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state; |
119 | state = TASK_RUNNING | TASK_STATE_MAX; | ||
120 | #endif /* CONFIG_PREEMPT */ | ||
121 | |||
122 | return state; | ||
123 | } | 118 | } |
124 | #endif /* CREATE_TRACE_POINTS */ | 119 | #endif /* CREATE_TRACE_POINTS */ |
125 | 120 | ||
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p) | |||
128 | */ | 123 | */ |
129 | TRACE_EVENT(sched_switch, | 124 | TRACE_EVENT(sched_switch, |
130 | 125 | ||
131 | TP_PROTO(struct task_struct *prev, | 126 | TP_PROTO(bool preempt, |
127 | struct task_struct *prev, | ||
132 | struct task_struct *next), | 128 | struct task_struct *next), |
133 | 129 | ||
134 | TP_ARGS(prev, next), | 130 | TP_ARGS(preempt, prev, next), |
135 | 131 | ||
136 | TP_STRUCT__entry( | 132 | TP_STRUCT__entry( |
137 | __array( char, prev_comm, TASK_COMM_LEN ) | 133 | __array( char, prev_comm, TASK_COMM_LEN ) |
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch, | |||
147 | memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); | 143 | memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); |
148 | __entry->prev_pid = prev->pid; | 144 | __entry->prev_pid = prev->pid; |
149 | __entry->prev_prio = prev->prio; | 145 | __entry->prev_prio = prev->prio; |
150 | __entry->prev_state = __trace_sched_switch_state(prev); | 146 | __entry->prev_state = __trace_sched_switch_state(preempt, prev); |
151 | memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); | 147 | memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); |
152 | __entry->next_pid = next->pid; | 148 | __entry->next_pid = next->pid; |
153 | __entry->next_prio = next->prio; | 149 | __entry->next_prio = next->prio; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 14a9cdf8abe9..85ff5e26e23b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu) | |||
291 | { | 291 | { |
292 | struct task_struct *g, *p; | 292 | struct task_struct *g, *p; |
293 | 293 | ||
294 | read_lock_irq(&tasklist_lock); | 294 | read_lock(&tasklist_lock); |
295 | do_each_thread(g, p) { | 295 | for_each_process_thread(g, p) { |
296 | if (!p->on_rq) | 296 | if (!p->on_rq) |
297 | continue; | 297 | continue; |
298 | /* | 298 | /* |
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu) | |||
307 | 307 | ||
308 | pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", | 308 | pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", |
309 | p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); | 309 | p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); |
310 | } while_each_thread(g, p); | 310 | } |
311 | read_unlock_irq(&tasklist_lock); | 311 | read_unlock(&tasklist_lock); |
312 | } | 312 | } |
313 | 313 | ||
314 | struct take_cpu_down_param { | 314 | struct take_cpu_down_param { |
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param) | |||
331 | /* Give up timekeeping duties */ | 331 | /* Give up timekeeping duties */ |
332 | tick_handover_do_timer(); | 332 | tick_handover_do_timer(); |
333 | /* Park the stopper thread */ | 333 | /* Park the stopper thread */ |
334 | kthread_park(current); | 334 | stop_machine_park((long)param->hcpu); |
335 | return 0; | 335 | return 0; |
336 | } | 336 | } |
337 | 337 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 0e93b63bbc59..07110c6020a0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -706,10 +706,12 @@ void do_exit(long code) | |||
706 | smp_mb(); | 706 | smp_mb(); |
707 | raw_spin_unlock_wait(&tsk->pi_lock); | 707 | raw_spin_unlock_wait(&tsk->pi_lock); |
708 | 708 | ||
709 | if (unlikely(in_atomic())) | 709 | if (unlikely(in_atomic())) { |
710 | pr_info("note: %s[%d] exited with preempt_count %d\n", | 710 | pr_info("note: %s[%d] exited with preempt_count %d\n", |
711 | current->comm, task_pid_nr(current), | 711 | current->comm, task_pid_nr(current), |
712 | preempt_count()); | 712 | preempt_count()); |
713 | preempt_count_set(PREEMPT_ENABLED); | ||
714 | } | ||
713 | 715 | ||
714 | /* sync mm's RSS info before statistics gathering */ | 716 | /* sync mm's RSS info before statistics gathering */ |
715 | if (tsk->mm) | 717 | if (tsk->mm) |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index bbb72b4f64a1..8251e75dd9c0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, | |||
170 | * then right waiter has a dl_prio() too. | 170 | * then right waiter has a dl_prio() too. |
171 | */ | 171 | */ |
172 | if (dl_prio(left->prio)) | 172 | if (dl_prio(left->prio)) |
173 | return (left->task->dl.deadline < right->task->dl.deadline); | 173 | return dl_time_before(left->task->dl.deadline, |
174 | right->task->dl.deadline); | ||
174 | 175 | ||
175 | return 0; | 176 | return 0; |
176 | } | 177 | } |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f7402f7eb448..aa5973220ad2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p) | |||
817 | /* | 817 | /* |
818 | * SCHED_IDLE tasks get minimal weight: | 818 | * SCHED_IDLE tasks get minimal weight: |
819 | */ | 819 | */ |
820 | if (p->policy == SCHED_IDLE) { | 820 | if (idle_policy(p->policy)) { |
821 | load->weight = scale_load(WEIGHT_IDLEPRIO); | 821 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
822 | load->inv_weight = WMULT_IDLEPRIO; | 822 | load->inv_weight = WMULT_IDLEPRIO; |
823 | return; | 823 | return; |
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p) | |||
827 | load->inv_weight = prio_to_wmult[prio]; | 827 | load->inv_weight = prio_to_wmult[prio]; |
828 | } | 828 | } |
829 | 829 | ||
830 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 830 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
831 | { | 831 | { |
832 | update_rq_clock(rq); | 832 | update_rq_clock(rq); |
833 | sched_info_queued(rq, p); | 833 | if (!(flags & ENQUEUE_RESTORE)) |
834 | sched_info_queued(rq, p); | ||
834 | p->sched_class->enqueue_task(rq, p, flags); | 835 | p->sched_class->enqueue_task(rq, p, flags); |
835 | } | 836 | } |
836 | 837 | ||
837 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 838 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
838 | { | 839 | { |
839 | update_rq_clock(rq); | 840 | update_rq_clock(rq); |
840 | sched_info_dequeued(rq, p); | 841 | if (!(flags & DEQUEUE_SAVE)) |
842 | sched_info_dequeued(rq, p); | ||
841 | p->sched_class->dequeue_task(rq, p, flags); | 843 | p->sched_class->dequeue_task(rq, p, flags); |
842 | } | 844 | } |
843 | 845 | ||
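The new ENQUEUE_RESTORE/DEQUEUE_SAVE flags let callers that only dequeue a task temporarily (to change its weight, priority, cpumask or group) skip the sched_info accounting on both legs, so run_delay is not charged for what is really one continuous stretch of runqueue time. A condensed sketch of the calling pattern used by the hunks below (illustrative, not a verbatim copy of any one call site):

    static void change_task_attribute(struct rq *rq, struct task_struct *p)
    {
        bool queued = task_on_rq_queued(p);

        if (queued)
            /* temporary removal: sched_info_dequeued() is skipped */
            dequeue_task(rq, p, DEQUEUE_SAVE);

        /* ... modify p's weight / priority / allowed CPUs ... */

        if (queued)
            /* matching re-add: sched_info_queued() is skipped too */
            enqueue_task(rq, p, ENQUEUE_RESTORE);
    }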
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1178 | * holding rq->lock. | 1180 | * holding rq->lock. |
1179 | */ | 1181 | */ |
1180 | lockdep_assert_held(&rq->lock); | 1182 | lockdep_assert_held(&rq->lock); |
1181 | dequeue_task(rq, p, 0); | 1183 | dequeue_task(rq, p, DEQUEUE_SAVE); |
1182 | } | 1184 | } |
1183 | if (running) | 1185 | if (running) |
1184 | put_prev_task(rq, p); | 1186 | put_prev_task(rq, p); |
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1188 | if (running) | 1190 | if (running) |
1189 | p->sched_class->set_curr_task(rq); | 1191 | p->sched_class->set_curr_task(rq); |
1190 | if (queued) | 1192 | if (queued) |
1191 | enqueue_task(rq, p, 0); | 1193 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
1192 | } | 1194 | } |
1193 | 1195 | ||
1194 | /* | 1196 | /* |
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1292 | 1294 | ||
1293 | if (task_cpu(p) != new_cpu) { | 1295 | if (task_cpu(p) != new_cpu) { |
1294 | if (p->sched_class->migrate_task_rq) | 1296 | if (p->sched_class->migrate_task_rq) |
1295 | p->sched_class->migrate_task_rq(p, new_cpu); | 1297 | p->sched_class->migrate_task_rq(p); |
1296 | p->se.nr_migrations++; | 1298 | p->se.nr_migrations++; |
1297 | perf_event_task_migrate(p); | 1299 | perf_event_task_migrate(p); |
1298 | } | 1300 | } |
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data) | |||
1333 | struct rq *src_rq, *dst_rq; | 1335 | struct rq *src_rq, *dst_rq; |
1334 | int ret = -EAGAIN; | 1336 | int ret = -EAGAIN; |
1335 | 1337 | ||
1338 | if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) | ||
1339 | return -EAGAIN; | ||
1340 | |||
1336 | src_rq = cpu_rq(arg->src_cpu); | 1341 | src_rq = cpu_rq(arg->src_cpu); |
1337 | dst_rq = cpu_rq(arg->dst_cpu); | 1342 | dst_rq = cpu_rq(arg->dst_cpu); |
1338 | 1343 | ||
1339 | double_raw_lock(&arg->src_task->pi_lock, | 1344 | double_raw_lock(&arg->src_task->pi_lock, |
1340 | &arg->dst_task->pi_lock); | 1345 | &arg->dst_task->pi_lock); |
1341 | double_rq_lock(src_rq, dst_rq); | 1346 | double_rq_lock(src_rq, dst_rq); |
1347 | |||
1342 | if (task_cpu(arg->dst_task) != arg->dst_cpu) | 1348 | if (task_cpu(arg->dst_task) != arg->dst_cpu) |
1343 | goto unlock; | 1349 | goto unlock; |
1344 | 1350 | ||
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1574 | goto out; | 1580 | goto out; |
1575 | } | 1581 | } |
1576 | 1582 | ||
1583 | /* No more Mr. Nice Guy. */ | ||
1577 | switch (state) { | 1584 | switch (state) { |
1578 | case cpuset: | 1585 | case cpuset: |
1579 | /* No more Mr. Nice Guy. */ | 1586 | if (IS_ENABLED(CONFIG_CPUSETS)) { |
1580 | cpuset_cpus_allowed_fallback(p); | 1587 | cpuset_cpus_allowed_fallback(p); |
1581 | state = possible; | 1588 | state = possible; |
1582 | break; | 1589 | break; |
1583 | 1590 | } | |
1591 | /* fall-through */ | ||
1584 | case possible: | 1592 | case possible: |
1585 | do_set_cpus_allowed(p, cpu_possible_mask); | 1593 | do_set_cpus_allowed(p, cpu_possible_mask); |
1586 | state = fail; | 1594 | state = fail; |
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
1692 | #endif /* CONFIG_SCHEDSTATS */ | 1700 | #endif /* CONFIG_SCHEDSTATS */ |
1693 | } | 1701 | } |
1694 | 1702 | ||
1695 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1703 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
1696 | { | 1704 | { |
1697 | activate_task(rq, p, en_flags); | 1705 | activate_task(rq, p, en_flags); |
1698 | p->on_rq = TASK_ON_RQ_QUEUED; | 1706 | p->on_rq = TASK_ON_RQ_QUEUED; |
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2114 | #endif /* CONFIG_NUMA_BALANCING */ | 2122 | #endif /* CONFIG_NUMA_BALANCING */ |
2115 | } | 2123 | } |
2116 | 2124 | ||
2125 | DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); | ||
2126 | |||
2117 | #ifdef CONFIG_NUMA_BALANCING | 2127 | #ifdef CONFIG_NUMA_BALANCING |
2118 | #ifdef CONFIG_SCHED_DEBUG | 2128 | |
2119 | void set_numabalancing_state(bool enabled) | 2129 | void set_numabalancing_state(bool enabled) |
2120 | { | 2130 | { |
2121 | if (enabled) | 2131 | if (enabled) |
2122 | sched_feat_set("NUMA"); | 2132 | static_branch_enable(&sched_numa_balancing); |
2123 | else | 2133 | else |
2124 | sched_feat_set("NO_NUMA"); | 2134 | static_branch_disable(&sched_numa_balancing); |
2125 | } | 2135 | } |
2126 | #else | ||
2127 | __read_mostly bool numabalancing_enabled; | ||
2128 | |||
2129 | void set_numabalancing_state(bool enabled) | ||
2130 | { | ||
2131 | numabalancing_enabled = enabled; | ||
2132 | } | ||
2133 | #endif /* CONFIG_SCHED_DEBUG */ | ||
2134 | 2136 | ||
2135 | #ifdef CONFIG_PROC_SYSCTL | 2137 | #ifdef CONFIG_PROC_SYSCTL |
2136 | int sysctl_numa_balancing(struct ctl_table *table, int write, | 2138 | int sysctl_numa_balancing(struct ctl_table *table, int write, |
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
2138 | { | 2140 | { |
2139 | struct ctl_table t; | 2141 | struct ctl_table t; |
2140 | int err; | 2142 | int err; |
2141 | int state = numabalancing_enabled; | 2143 | int state = static_branch_likely(&sched_numa_balancing); |
2142 | 2144 | ||
2143 | if (write && !capable(CAP_SYS_ADMIN)) | 2145 | if (write && !capable(CAP_SYS_ADMIN)) |
2144 | return -EPERM; | 2146 | return -EPERM; |
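sched_numa_balancing becomes a static key, so the NUMA-balancing checks on hot paths compile down to a run-time-patched branch instead of a load of numabalancing_enabled. The general pattern, sketched with made-up identifiers (the jump-label calls themselves are the real kernel interface, used here out of context):

    #include <linux/jump_label.h>

    DEFINE_STATIC_KEY_FALSE(example_key);          /* starts disabled */

    static void hot_path(void)
    {
        /* Patched to a straight-line jump while the key is disabled. */
        if (!static_branch_likely(&example_key))
            return;
        /* ... feature-specific work ... */
    }

    static void set_feature_state(bool enabled)    /* e.g. from a sysctl handler */
    {
        if (enabled)
            static_branch_enable(&example_key);
        else
            static_branch_disable(&example_key);
    }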
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p) | |||
2349 | struct rq *rq; | 2351 | struct rq *rq; |
2350 | 2352 | ||
2351 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2353 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2354 | /* Initialize new task's runnable average */ | ||
2355 | init_entity_runnable_average(&p->se); | ||
2352 | #ifdef CONFIG_SMP | 2356 | #ifdef CONFIG_SMP |
2353 | /* | 2357 | /* |
2354 | * Fork balancing, do it here and not earlier because: | 2358 | * Fork balancing, do it here and not earlier because: |
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p) | |||
2358 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2362 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2359 | #endif | 2363 | #endif |
2360 | 2364 | ||
2361 | /* Initialize new task's runnable average */ | ||
2362 | init_entity_runnable_average(&p->se); | ||
2363 | rq = __task_rq_lock(p); | 2365 | rq = __task_rq_lock(p); |
2364 | activate_task(rq, p, 0); | 2366 | activate_task(rq, p, 0); |
2365 | p->on_rq = TASK_ON_RQ_QUEUED; | 2367 | p->on_rq = TASK_ON_RQ_QUEUED; |
@@ -2483,7 +2485,6 @@ static inline void | |||
2483 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2485 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2484 | struct task_struct *next) | 2486 | struct task_struct *next) |
2485 | { | 2487 | { |
2486 | trace_sched_switch(prev, next); | ||
2487 | sched_info_switch(rq, prev, next); | 2488 | sched_info_switch(rq, prev, next); |
2488 | perf_event_task_sched_out(prev, next); | 2489 | perf_event_task_sched_out(prev, next); |
2489 | fire_sched_out_preempt_notifiers(prev, next); | 2490 | fire_sched_out_preempt_notifiers(prev, next); |
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2517 | struct mm_struct *mm = rq->prev_mm; | 2518 | struct mm_struct *mm = rq->prev_mm; |
2518 | long prev_state; | 2519 | long prev_state; |
2519 | 2520 | ||
2521 | /* | ||
2522 | * The previous task will have left us with a preempt_count of 2 | ||
2523 | * because it left us after: | ||
2524 | * | ||
2525 | * schedule() | ||
2526 | * preempt_disable(); // 1 | ||
2527 | * __schedule() | ||
2528 | * raw_spin_lock_irq(&rq->lock) // 2 | ||
2529 | * | ||
2530 | * Also, see FORK_PREEMPT_COUNT. | ||
2531 | */ | ||
2532 | if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, | ||
2533 | "corrupted preempt_count: %s/%d/0x%x\n", | ||
2534 | current->comm, current->pid, preempt_count())) | ||
2535 | preempt_count_set(FORK_PREEMPT_COUNT); | ||
2536 | |||
2520 | rq->prev_mm = NULL; | 2537 | rq->prev_mm = NULL; |
2521 | 2538 | ||
2522 | /* | 2539 | /* |
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
2601 | { | 2618 | { |
2602 | struct rq *rq; | 2619 | struct rq *rq; |
2603 | 2620 | ||
2604 | /* finish_task_switch() drops rq->lock and enables preemtion */ | 2621 | /* |
2605 | preempt_disable(); | 2622 | * New tasks start with FORK_PREEMPT_COUNT, see there and |
2623 | * finish_task_switch() for details. | ||
2624 | * | ||
2625 | * finish_task_switch() will drop rq->lock() and lower preempt_count | ||
2626 | * and the preempt_enable() will end up enabling preemption (on | ||
2627 | * PREEMPT_COUNT kernels). | ||
2628 | */ | ||
2629 | |||
2606 | rq = finish_task_switch(prev); | 2630 | rq = finish_task_switch(prev); |
2607 | balance_callback(rq); | 2631 | balance_callback(rq); |
2608 | preempt_enable(); | 2632 | preempt_enable(); |
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
2960 | static inline void schedule_debug(struct task_struct *prev) | 2984 | static inline void schedule_debug(struct task_struct *prev) |
2961 | { | 2985 | { |
2962 | #ifdef CONFIG_SCHED_STACK_END_CHECK | 2986 | #ifdef CONFIG_SCHED_STACK_END_CHECK |
2963 | BUG_ON(unlikely(task_stack_end_corrupted(prev))); | 2987 | BUG_ON(task_stack_end_corrupted(prev)); |
2964 | #endif | 2988 | #endif |
2965 | /* | 2989 | |
2966 | * Test if we are atomic. Since do_exit() needs to call into | 2990 | if (unlikely(in_atomic_preempt_off())) { |
2967 | * schedule() atomically, we ignore that path. Otherwise whine | ||
2968 | * if we are scheduling when we should not. | ||
2969 | */ | ||
2970 | if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) | ||
2971 | __schedule_bug(prev); | 2991 | __schedule_bug(prev); |
2992 | preempt_count_set(PREEMPT_DISABLED); | ||
2993 | } | ||
2972 | rcu_sleep_check(); | 2994 | rcu_sleep_check(); |
2973 | 2995 | ||
2974 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 2996 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
@@ -3054,7 +3076,7 @@ again: | |||
3054 | * | 3076 | * |
3055 | * WARNING: must be called with preemption disabled! | 3077 | * WARNING: must be called with preemption disabled! |
3056 | */ | 3078 | */ |
3057 | static void __sched __schedule(void) | 3079 | static void __sched notrace __schedule(bool preempt) |
3058 | { | 3080 | { |
3059 | struct task_struct *prev, *next; | 3081 | struct task_struct *prev, *next; |
3060 | unsigned long *switch_count; | 3082 | unsigned long *switch_count; |
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void) | |||
3066 | rcu_note_context_switch(); | 3088 | rcu_note_context_switch(); |
3067 | prev = rq->curr; | 3089 | prev = rq->curr; |
3068 | 3090 | ||
3091 | /* | ||
3092 | * do_exit() calls schedule() with preemption disabled as an exception; | ||
3093 | * however we must fix that up, otherwise the next task will see an | ||
3094 | * inconsistent (higher) preempt count. | ||
3095 | * | ||
3096 | * It also avoids the below schedule_debug() test from complaining | ||
3097 | * about this. | ||
3098 | */ | ||
3099 | if (unlikely(prev->state == TASK_DEAD)) | ||
3100 | preempt_enable_no_resched_notrace(); | ||
3101 | |||
3069 | schedule_debug(prev); | 3102 | schedule_debug(prev); |
3070 | 3103 | ||
3071 | if (sched_feat(HRTICK)) | 3104 | if (sched_feat(HRTICK)) |
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void) | |||
3083 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | 3116 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ |
3084 | 3117 | ||
3085 | switch_count = &prev->nivcsw; | 3118 | switch_count = &prev->nivcsw; |
3086 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3119 | if (!preempt && prev->state) { |
3087 | if (unlikely(signal_pending_state(prev->state, prev))) { | 3120 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3088 | prev->state = TASK_RUNNING; | 3121 | prev->state = TASK_RUNNING; |
3089 | } else { | 3122 | } else { |
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void) | |||
3119 | rq->curr = next; | 3152 | rq->curr = next; |
3120 | ++*switch_count; | 3153 | ++*switch_count; |
3121 | 3154 | ||
3155 | trace_sched_switch(preempt, prev, next); | ||
3122 | rq = context_switch(rq, prev, next); /* unlocks the rq */ | 3156 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
3123 | cpu = cpu_of(rq); | 3157 | cpu = cpu_of(rq); |
3124 | } else { | 3158 | } else { |
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void) | |||
3148 | sched_submit_work(tsk); | 3182 | sched_submit_work(tsk); |
3149 | do { | 3183 | do { |
3150 | preempt_disable(); | 3184 | preempt_disable(); |
3151 | __schedule(); | 3185 | __schedule(false); |
3152 | sched_preempt_enable_no_resched(); | 3186 | sched_preempt_enable_no_resched(); |
3153 | } while (need_resched()); | 3187 | } while (need_resched()); |
3154 | } | 3188 | } |
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void) | |||
3188 | static void __sched notrace preempt_schedule_common(void) | 3222 | static void __sched notrace preempt_schedule_common(void) |
3189 | { | 3223 | { |
3190 | do { | 3224 | do { |
3191 | preempt_active_enter(); | 3225 | preempt_disable_notrace(); |
3192 | __schedule(); | 3226 | __schedule(true); |
3193 | preempt_active_exit(); | 3227 | preempt_enable_no_resched_notrace(); |
3194 | 3228 | ||
3195 | /* | 3229 | /* |
3196 | * Check again in case we missed a preemption opportunity | 3230 | * Check again in case we missed a preemption opportunity |
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) | |||
3241 | return; | 3275 | return; |
3242 | 3276 | ||
3243 | do { | 3277 | do { |
3244 | /* | 3278 | preempt_disable_notrace(); |
3245 | * Use raw __prempt_count() ops that don't call function. | ||
3246 | * We can't call functions before disabling preemption which | ||
3247 | * disarm preemption tracing recursions. | ||
3248 | */ | ||
3249 | __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); | ||
3250 | barrier(); | ||
3251 | /* | 3279 | /* |
3252 | * Needs preempt disabled in case user_exit() is traced | 3280 | * Needs preempt disabled in case user_exit() is traced |
3253 | * and the tracer calls preempt_enable_notrace() causing | 3281 | * and the tracer calls preempt_enable_notrace() causing |
3254 | * an infinite recursion. | 3282 | * an infinite recursion. |
3255 | */ | 3283 | */ |
3256 | prev_ctx = exception_enter(); | 3284 | prev_ctx = exception_enter(); |
3257 | __schedule(); | 3285 | __schedule(true); |
3258 | exception_exit(prev_ctx); | 3286 | exception_exit(prev_ctx); |
3259 | 3287 | ||
3260 | barrier(); | 3288 | preempt_enable_no_resched_notrace(); |
3261 | __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); | ||
3262 | } while (need_resched()); | 3289 | } while (need_resched()); |
3263 | } | 3290 | } |
3264 | EXPORT_SYMBOL_GPL(preempt_schedule_notrace); | 3291 | EXPORT_SYMBOL_GPL(preempt_schedule_notrace); |
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) | |||
3281 | prev_state = exception_enter(); | 3308 | prev_state = exception_enter(); |
3282 | 3309 | ||
3283 | do { | 3310 | do { |
3284 | preempt_active_enter(); | 3311 | preempt_disable(); |
3285 | local_irq_enable(); | 3312 | local_irq_enable(); |
3286 | __schedule(); | 3313 | __schedule(true); |
3287 | local_irq_disable(); | 3314 | local_irq_disable(); |
3288 | preempt_active_exit(); | 3315 | sched_preempt_enable_no_resched(); |
3289 | } while (need_resched()); | 3316 | } while (need_resched()); |
3290 | 3317 | ||
3291 | exception_exit(prev_state); | 3318 | exception_exit(prev_state); |
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
3313 | */ | 3340 | */ |
3314 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3341 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3315 | { | 3342 | { |
3316 | int oldprio, queued, running, enqueue_flag = 0; | 3343 | int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; |
3317 | struct rq *rq; | 3344 | struct rq *rq; |
3318 | const struct sched_class *prev_class; | 3345 | const struct sched_class *prev_class; |
3319 | 3346 | ||
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3345 | queued = task_on_rq_queued(p); | 3372 | queued = task_on_rq_queued(p); |
3346 | running = task_current(rq, p); | 3373 | running = task_current(rq, p); |
3347 | if (queued) | 3374 | if (queued) |
3348 | dequeue_task(rq, p, 0); | 3375 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3349 | if (running) | 3376 | if (running) |
3350 | put_prev_task(rq, p); | 3377 | put_prev_task(rq, p); |
3351 | 3378 | ||
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3363 | if (!dl_prio(p->normal_prio) || | 3390 | if (!dl_prio(p->normal_prio) || |
3364 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | 3391 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { |
3365 | p->dl.dl_boosted = 1; | 3392 | p->dl.dl_boosted = 1; |
3366 | enqueue_flag = ENQUEUE_REPLENISH; | 3393 | enqueue_flag |= ENQUEUE_REPLENISH; |
3367 | } else | 3394 | } else |
3368 | p->dl.dl_boosted = 0; | 3395 | p->dl.dl_boosted = 0; |
3369 | p->sched_class = &dl_sched_class; | 3396 | p->sched_class = &dl_sched_class; |
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3371 | if (dl_prio(oldprio)) | 3398 | if (dl_prio(oldprio)) |
3372 | p->dl.dl_boosted = 0; | 3399 | p->dl.dl_boosted = 0; |
3373 | if (oldprio < prio) | 3400 | if (oldprio < prio) |
3374 | enqueue_flag = ENQUEUE_HEAD; | 3401 | enqueue_flag |= ENQUEUE_HEAD; |
3375 | p->sched_class = &rt_sched_class; | 3402 | p->sched_class = &rt_sched_class; |
3376 | } else { | 3403 | } else { |
3377 | if (dl_prio(oldprio)) | 3404 | if (dl_prio(oldprio)) |
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3423 | } | 3450 | } |
3424 | queued = task_on_rq_queued(p); | 3451 | queued = task_on_rq_queued(p); |
3425 | if (queued) | 3452 | if (queued) |
3426 | dequeue_task(rq, p, 0); | 3453 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3427 | 3454 | ||
3428 | p->static_prio = NICE_TO_PRIO(nice); | 3455 | p->static_prio = NICE_TO_PRIO(nice); |
3429 | set_load_weight(p); | 3456 | set_load_weight(p); |
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3432 | delta = p->prio - old_prio; | 3459 | delta = p->prio - old_prio; |
3433 | 3460 | ||
3434 | if (queued) { | 3461 | if (queued) { |
3435 | enqueue_task(rq, p, 0); | 3462 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
3436 | /* | 3463 | /* |
3437 | * If the task increased its priority or is running and | 3464 | * If the task increased its priority or is running and |
3438 | * lowered its priority, then reschedule its CPU: | 3465 | * lowered its priority, then reschedule its CPU: |
@@ -3753,10 +3780,7 @@ recheck: | |||
3753 | } else { | 3780 | } else { |
3754 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); | 3781 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); |
3755 | 3782 | ||
3756 | if (policy != SCHED_DEADLINE && | 3783 | if (!valid_policy(policy)) |
3757 | policy != SCHED_FIFO && policy != SCHED_RR && | ||
3758 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
3759 | policy != SCHED_IDLE) | ||
3760 | return -EINVAL; | 3784 | return -EINVAL; |
3761 | } | 3785 | } |
3762 | 3786 | ||
@@ -3812,7 +3836,7 @@ recheck: | |||
3812 | * Treat SCHED_IDLE as nice 20. Only allow a switch to | 3836 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
3813 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. | 3837 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
3814 | */ | 3838 | */ |
3815 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { | 3839 | if (idle_policy(p->policy) && !idle_policy(policy)) { |
3816 | if (!can_nice(p, task_nice(p))) | 3840 | if (!can_nice(p, task_nice(p))) |
3817 | return -EPERM; | 3841 | return -EPERM; |
3818 | } | 3842 | } |
@@ -3937,7 +3961,7 @@ change: | |||
3937 | queued = task_on_rq_queued(p); | 3961 | queued = task_on_rq_queued(p); |
3938 | running = task_current(rq, p); | 3962 | running = task_current(rq, p); |
3939 | if (queued) | 3963 | if (queued) |
3940 | dequeue_task(rq, p, 0); | 3964 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3941 | if (running) | 3965 | if (running) |
3942 | put_prev_task(rq, p); | 3966 | put_prev_task(rq, p); |
3943 | 3967 | ||
@@ -3947,11 +3971,15 @@ change: | |||
3947 | if (running) | 3971 | if (running) |
3948 | p->sched_class->set_curr_task(rq); | 3972 | p->sched_class->set_curr_task(rq); |
3949 | if (queued) { | 3973 | if (queued) { |
3974 | int enqueue_flags = ENQUEUE_RESTORE; | ||
3950 | /* | 3975 | /* |
3951 | * We enqueue to tail when the priority of a task is | 3976 | * We enqueue to tail when the priority of a task is |
3952 | * increased (user space view). | 3977 | * increased (user space view). |
3953 | */ | 3978 | */ |
3954 | enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); | 3979 | if (oldprio <= p->prio) |
3980 | enqueue_flags |= ENQUEUE_HEAD; | ||
3981 | |||
3982 | enqueue_task(rq, p, enqueue_flags); | ||
3955 | } | 3983 | } |
3956 | 3984 | ||
3957 | check_class_changed(rq, p, prev_class, oldprio); | 3985 | check_class_changed(rq, p, prev_class, oldprio); |
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5101 | running = task_current(rq, p); | 5129 | running = task_current(rq, p); |
5102 | 5130 | ||
5103 | if (queued) | 5131 | if (queued) |
5104 | dequeue_task(rq, p, 0); | 5132 | dequeue_task(rq, p, DEQUEUE_SAVE); |
5105 | if (running) | 5133 | if (running) |
5106 | put_prev_task(rq, p); | 5134 | put_prev_task(rq, p); |
5107 | 5135 | ||
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5110 | if (running) | 5138 | if (running) |
5111 | p->sched_class->set_curr_task(rq); | 5139 | p->sched_class->set_curr_task(rq); |
5112 | if (queued) | 5140 | if (queued) |
5113 | enqueue_task(rq, p, 0); | 5141 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
5114 | task_rq_unlock(rq, p, &flags); | 5142 | task_rq_unlock(rq, p, &flags); |
5115 | } | 5143 | } |
5116 | #endif /* CONFIG_NUMA_BALANCING */ | 5144 | #endif /* CONFIG_NUMA_BALANCING */ |
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void) | |||
5531 | static int sched_cpu_active(struct notifier_block *nfb, | 5559 | static int sched_cpu_active(struct notifier_block *nfb, |
5532 | unsigned long action, void *hcpu) | 5560 | unsigned long action, void *hcpu) |
5533 | { | 5561 | { |
5562 | int cpu = (long)hcpu; | ||
5563 | |||
5534 | switch (action & ~CPU_TASKS_FROZEN) { | 5564 | switch (action & ~CPU_TASKS_FROZEN) { |
5535 | case CPU_STARTING: | 5565 | case CPU_STARTING: |
5536 | set_cpu_rq_start_time(); | 5566 | set_cpu_rq_start_time(); |
5537 | return NOTIFY_OK; | 5567 | return NOTIFY_OK; |
5568 | |||
5538 | case CPU_ONLINE: | 5569 | case CPU_ONLINE: |
5539 | /* | 5570 | /* |
5540 | * At this point a starting CPU has marked itself as online via | 5571 | * At this point a starting CPU has marked itself as online via |
5541 | * set_cpu_online(). But it might not yet have marked itself | 5572 | * set_cpu_online(). But it might not yet have marked itself |
5542 | * as active, which is essential from here on. | 5573 | * as active, which is essential from here on. |
5543 | * | ||
5544 | * Thus, fall-through and help the starting CPU along. | ||
5545 | */ | 5574 | */ |
5575 | set_cpu_active(cpu, true); | ||
5576 | stop_machine_unpark(cpu); | ||
5577 | return NOTIFY_OK; | ||
5578 | |||
5546 | case CPU_DOWN_FAILED: | 5579 | case CPU_DOWN_FAILED: |
5547 | set_cpu_active((long)hcpu, true); | 5580 | set_cpu_active(cpu, true); |
5548 | return NOTIFY_OK; | 5581 | return NOTIFY_OK; |
5582 | |||
5549 | default: | 5583 | default: |
5550 | return NOTIFY_DONE; | 5584 | return NOTIFY_DONE; |
5551 | } | 5585 | } |
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6477 | { NULL, }, | 6511 | { NULL, }, |
6478 | }; | 6512 | }; |
6479 | 6513 | ||
6480 | struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6514 | static struct sched_domain_topology_level *sched_domain_topology = |
6515 | default_topology; | ||
6481 | 6516 | ||
6482 | #define for_each_sd_topology(tl) \ | 6517 | #define for_each_sd_topology(tl) \ |
6483 | for (tl = sched_domain_topology; tl->mask; tl++) | 6518 | for (tl = sched_domain_topology; tl->mask; tl++) |
@@ -7478,7 +7513,7 @@ void __init sched_init(void) | |||
7478 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP | 7513 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP |
7479 | static inline int preempt_count_equals(int preempt_offset) | 7514 | static inline int preempt_count_equals(int preempt_offset) |
7480 | { | 7515 | { |
7481 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 7516 | int nested = preempt_count() + rcu_preempt_depth(); |
7482 | 7517 | ||
7483 | return (nested == preempt_offset); | 7518 | return (nested == preempt_offset); |
7484 | } | 7519 | } |
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7725 | queued = task_on_rq_queued(tsk); | 7760 | queued = task_on_rq_queued(tsk); |
7726 | 7761 | ||
7727 | if (queued) | 7762 | if (queued) |
7728 | dequeue_task(rq, tsk, 0); | 7763 | dequeue_task(rq, tsk, DEQUEUE_SAVE); |
7729 | if (unlikely(running)) | 7764 | if (unlikely(running)) |
7730 | put_prev_task(rq, tsk); | 7765 | put_prev_task(rq, tsk); |
7731 | 7766 | ||
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7741 | 7776 | ||
7742 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7777 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7743 | if (tsk->sched_class->task_move_group) | 7778 | if (tsk->sched_class->task_move_group) |
7744 | tsk->sched_class->task_move_group(tsk, queued); | 7779 | tsk->sched_class->task_move_group(tsk); |
7745 | else | 7780 | else |
7746 | #endif | 7781 | #endif |
7747 | set_task_rq(tsk, task_cpu(tsk)); | 7782 | set_task_rq(tsk, task_cpu(tsk)); |
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7749 | if (unlikely(running)) | 7784 | if (unlikely(running)) |
7750 | tsk->sched_class->set_curr_task(rq); | 7785 | tsk->sched_class->set_curr_task(rq); |
7751 | if (queued) | 7786 | if (queued) |
7752 | enqueue_task(rq, tsk, 0); | 7787 | enqueue_task(rq, tsk, ENQUEUE_RESTORE); |
7753 | 7788 | ||
7754 | task_rq_unlock(rq, tsk, &flags); | 7789 | task_rq_unlock(rq, tsk, &flags); |
7755 | } | 7790 | } |
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, | |||
8213 | struct cgroup_subsys_state *old_css, | 8248 | struct cgroup_subsys_state *old_css, |
8214 | struct task_struct *task) | 8249 | struct task_struct *task) |
8215 | { | 8250 | { |
8216 | /* | ||
8217 | * cgroup_exit() is called in the copy_process() failure path. | ||
8218 | * Ignore this case since the task hasn't ran yet, this avoids | ||
8219 | * trying to poke a half freed task state from generic code. | ||
8220 | */ | ||
8221 | if (!(task->flags & PF_EXITING)) | ||
8222 | return; | ||
8223 | |||
8224 | sched_move_task(task); | 8251 | sched_move_task(task); |
8225 | } | 8252 | } |
8226 | 8253 | ||
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index c6acb07466bb..5a75b08cfd85 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -31,11 +31,6 @@ static inline int right_child(int i) | |||
31 | return (i << 1) + 2; | 31 | return (i << 1) + 2; |
32 | } | 32 | } |
33 | 33 | ||
34 | static inline int dl_time_before(u64 a, u64 b) | ||
35 | { | ||
36 | return (s64)(a - b) < 0; | ||
37 | } | ||
38 | |||
39 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | 34 | static void cpudl_exchange(struct cpudl *cp, int a, int b) |
40 | { | 35 | { |
41 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 36 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 1a0a6ef2fbe1..fcbdf83fed7e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _LINUX_CPUDL_H | 2 | #define _LINUX_CPUDL_H |
3 | 3 | ||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/sched/deadline.h> | ||
5 | 6 | ||
6 | #define IDX_INVALID -1 | 7 | #define IDX_INVALID -1 |
7 | 8 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a5e60fe721a..824aa9f501a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p); | |||
661 | 661 | ||
662 | /* | 662 | /* |
663 | * We choose a half-life close to 1 scheduling period. | 663 | * We choose a half-life close to 1 scheduling period. |
664 | * Note: The tables below are dependent on this value. | 664 | * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are |
665 | * dependent on this value. | ||
665 | */ | 666 | */ |
666 | #define LOAD_AVG_PERIOD 32 | 667 | #define LOAD_AVG_PERIOD 32 |
667 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | 668 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ |
668 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ | 669 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ |
669 | 670 | ||
670 | /* Give new sched_entity start runnable values to heavy its load in infant time */ | 671 | /* Give new sched_entity start runnable values to heavy its load in infant time */ |
671 | void init_entity_runnable_average(struct sched_entity *se) | 672 | void init_entity_runnable_average(struct sched_entity *se) |
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
682 | sa->load_avg = scale_load_down(se->load.weight); | 683 | sa->load_avg = scale_load_down(se->load.weight); |
683 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 684 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; |
684 | sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); | 685 | sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); |
685 | sa->util_sum = LOAD_AVG_MAX; | 686 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; |
686 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 687 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
687 | } | 688 | } |
688 | 689 | ||
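
Editor's note: the util_sum change above keeps the documented relation util_avg == util_sum / LOAD_AVG_MAX true from task creation onward, instead of only after the first decay. A minimal numeric sketch of that invariant (standalone; only the 1024 unit and the LOAD_AVG_MAX constant come from the diff, the rest is illustrative):

    #include <stdio.h>

    #define LOAD_AVG_MAX     47742  /* maximum attainable *_sum, from the diff */
    #define SCHED_LOAD_SCALE 1024   /* load/util unit assumed by the tracking  */

    int main(void)
    {
        unsigned long util_avg = SCHED_LOAD_SCALE;        /* new task: "fully busy" */
        unsigned long util_sum = util_avg * LOAD_AVG_MAX; /* new initialization     */

        /* Averages are always derived as sum / LOAD_AVG_MAX ... */
        printf("derived util_avg = %lu\n", util_sum / LOAD_AVG_MAX);       /* 1024 */

        /* ... whereas the old 'util_sum = LOAD_AVG_MAX' start would derive
         * to 1 instead of 1024 under the new util_avg formula. */
        printf("old-style derived util_avg = %lu\n",
               (unsigned long)LOAD_AVG_MAX / LOAD_AVG_MAX);                /* 1 */
        return 0;
    }
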
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2069 | int local = !!(flags & TNF_FAULT_LOCAL); | 2070 | int local = !!(flags & TNF_FAULT_LOCAL); |
2070 | int priv; | 2071 | int priv; |
2071 | 2072 | ||
2072 | if (!numabalancing_enabled) | 2073 | if (!static_branch_likely(&sched_numa_balancing)) |
2073 | return; | 2074 | return; |
2074 | 2075 | ||
2075 | /* for example, ksmd faulting in a user's mm */ | 2076 | /* for example, ksmd faulting in a user's mm */ |
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work) | |||
2157 | struct vm_area_struct *vma; | 2158 | struct vm_area_struct *vma; |
2158 | unsigned long start, end; | 2159 | unsigned long start, end; |
2159 | unsigned long nr_pte_updates = 0; | 2160 | unsigned long nr_pte_updates = 0; |
2160 | long pages; | 2161 | long pages, virtpages; |
2161 | 2162 | ||
2162 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2163 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
2163 | 2164 | ||
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work) | |||
2203 | start = mm->numa_scan_offset; | 2204 | start = mm->numa_scan_offset; |
2204 | pages = sysctl_numa_balancing_scan_size; | 2205 | pages = sysctl_numa_balancing_scan_size; |
2205 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | 2206 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ |
2207 | virtpages = pages * 8; /* Scan up to this much virtual space */ | ||
2206 | if (!pages) | 2208 | if (!pages) |
2207 | return; | 2209 | return; |
2208 | 2210 | ||
2211 | |||
2209 | down_read(&mm->mmap_sem); | 2212 | down_read(&mm->mmap_sem); |
2210 | vma = find_vma(mm, start); | 2213 | vma = find_vma(mm, start); |
2211 | if (!vma) { | 2214 | if (!vma) { |
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work) | |||
2240 | start = max(start, vma->vm_start); | 2243 | start = max(start, vma->vm_start); |
2241 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 2244 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
2242 | end = min(end, vma->vm_end); | 2245 | end = min(end, vma->vm_end); |
2243 | nr_pte_updates += change_prot_numa(vma, start, end); | 2246 | nr_pte_updates = change_prot_numa(vma, start, end); |
2244 | 2247 | ||
2245 | /* | 2248 | /* |
2246 | * Scan sysctl_numa_balancing_scan_size but ensure that | 2249 | * Try to scan sysctl_numa_balancing_size worth of |
2247 | * at least one PTE is updated so that unused virtual | 2250 | * hpages that have at least one present PTE that |
2248 | * address space is quickly skipped. | 2251 | * is not already pte-numa. If the VMA contains |
2252 | * areas that are unused or already full of prot_numa | ||
2253 | * PTEs, scan up to virtpages, to skip through those | ||
2254 | * areas faster. | ||
2249 | */ | 2255 | */ |
2250 | if (nr_pte_updates) | 2256 | if (nr_pte_updates) |
2251 | pages -= (end - start) >> PAGE_SHIFT; | 2257 | pages -= (end - start) >> PAGE_SHIFT; |
2258 | virtpages -= (end - start) >> PAGE_SHIFT; | ||
2252 | 2259 | ||
2253 | start = end; | 2260 | start = end; |
2254 | if (pages <= 0) | 2261 | if (pages <= 0 || virtpages <= 0) |
2255 | goto out; | 2262 | goto out; |
2256 | 2263 | ||
2257 | cond_resched(); | 2264 | cond_resched(); |
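
Editor's note: the hunk above introduces a second scan budget. `pages` still bounds how much mapped memory receives NUMA hinting updates per pass, while the new `virtpages` (8x larger) bounds how much virtual address space may be walked, so sparse or already-marked VMAs are skipped quickly instead of consuming the whole pass. A toy model of the two budgets (standalone sketch; region sizes and the "useful" flags are invented):

    #include <stdio.h>

    int main(void)
    {
        long pages = 64;             /* budget of PTEs actually updated */
        long virtpages = pages * 8;  /* budget of virtual pages walked  */

        /* Each entry: region size in pages, and whether it contains present,
         * not-yet-marked PTEs (1) or is empty/already marked (0). */
        long region_size[]   = { 100, 100, 100, 100, 100, 100 };
        int  region_useful[] = {   0,   0,   1,   0,   0,   1 };

        for (int i = 0; i < 6 && pages > 0 && virtpages > 0; i++) {
            if (region_useful[i])
                pages -= region_size[i];   /* real hinting work was done */
            virtpages -= region_size[i];   /* always charge the walk     */
            printf("region %d: pages=%ld virtpages=%ld\n", i, pages, virtpages);
        }
        return 0;
    }
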
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2515 | return contrib + runnable_avg_yN_sum[n]; | 2522 | return contrib + runnable_avg_yN_sum[n]; |
2516 | } | 2523 | } |
2517 | 2524 | ||
2525 | #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 | ||
2526 | #error "load tracking assumes 2^10 as unit" | ||
2527 | #endif | ||
2528 | |||
2529 | #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) | ||
2530 | |||
2518 | /* | 2531 | /* |
2519 | * We can represent the historical contribution to runnable average as the | 2532 | * We can represent the historical contribution to runnable average as the |
2520 | * coefficients of a geometric series. To do this we sub-divide our runnable | 2533 | * coefficients of a geometric series. To do this we sub-divide our runnable |
@@ -2547,10 +2560,10 @@ static __always_inline int | |||
2547 | __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | 2560 | __update_load_avg(u64 now, int cpu, struct sched_avg *sa, |
2548 | unsigned long weight, int running, struct cfs_rq *cfs_rq) | 2561 | unsigned long weight, int running, struct cfs_rq *cfs_rq) |
2549 | { | 2562 | { |
2550 | u64 delta, periods; | 2563 | u64 delta, scaled_delta, periods; |
2551 | u32 contrib; | 2564 | u32 contrib; |
2552 | int delta_w, decayed = 0; | 2565 | unsigned int delta_w, scaled_delta_w, decayed = 0; |
2553 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | 2566 | unsigned long scale_freq, scale_cpu; |
2554 | 2567 | ||
2555 | delta = now - sa->last_update_time; | 2568 | delta = now - sa->last_update_time; |
2556 | /* | 2569 | /* |
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2571 | return 0; | 2584 | return 0; |
2572 | sa->last_update_time = now; | 2585 | sa->last_update_time = now; |
2573 | 2586 | ||
2587 | scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2588 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
2589 | |||
2574 | /* delta_w is the amount already accumulated against our next period */ | 2590 | /* delta_w is the amount already accumulated against our next period */ |
2575 | delta_w = sa->period_contrib; | 2591 | delta_w = sa->period_contrib; |
2576 | if (delta + delta_w >= 1024) { | 2592 | if (delta + delta_w >= 1024) { |
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2585 | * period and accrue it. | 2601 | * period and accrue it. |
2586 | */ | 2602 | */ |
2587 | delta_w = 1024 - delta_w; | 2603 | delta_w = 1024 - delta_w; |
2604 | scaled_delta_w = cap_scale(delta_w, scale_freq); | ||
2588 | if (weight) { | 2605 | if (weight) { |
2589 | sa->load_sum += weight * delta_w; | 2606 | sa->load_sum += weight * scaled_delta_w; |
2590 | if (cfs_rq) | 2607 | if (cfs_rq) { |
2591 | cfs_rq->runnable_load_sum += weight * delta_w; | 2608 | cfs_rq->runnable_load_sum += |
2609 | weight * scaled_delta_w; | ||
2610 | } | ||
2592 | } | 2611 | } |
2593 | if (running) | 2612 | if (running) |
2594 | sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; | 2613 | sa->util_sum += scaled_delta_w * scale_cpu; |
2595 | 2614 | ||
2596 | delta -= delta_w; | 2615 | delta -= delta_w; |
2597 | 2616 | ||
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2608 | 2627 | ||
2609 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2628 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2610 | contrib = __compute_runnable_contrib(periods); | 2629 | contrib = __compute_runnable_contrib(periods); |
2630 | contrib = cap_scale(contrib, scale_freq); | ||
2611 | if (weight) { | 2631 | if (weight) { |
2612 | sa->load_sum += weight * contrib; | 2632 | sa->load_sum += weight * contrib; |
2613 | if (cfs_rq) | 2633 | if (cfs_rq) |
2614 | cfs_rq->runnable_load_sum += weight * contrib; | 2634 | cfs_rq->runnable_load_sum += weight * contrib; |
2615 | } | 2635 | } |
2616 | if (running) | 2636 | if (running) |
2617 | sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; | 2637 | sa->util_sum += contrib * scale_cpu; |
2618 | } | 2638 | } |
2619 | 2639 | ||
2620 | /* Remainder of delta accrued against u_0` */ | 2640 | /* Remainder of delta accrued against u_0` */ |
2641 | scaled_delta = cap_scale(delta, scale_freq); | ||
2621 | if (weight) { | 2642 | if (weight) { |
2622 | sa->load_sum += weight * delta; | 2643 | sa->load_sum += weight * scaled_delta; |
2623 | if (cfs_rq) | 2644 | if (cfs_rq) |
2624 | cfs_rq->runnable_load_sum += weight * delta; | 2645 | cfs_rq->runnable_load_sum += weight * scaled_delta; |
2625 | } | 2646 | } |
2626 | if (running) | 2647 | if (running) |
2627 | sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; | 2648 | sa->util_sum += scaled_delta * scale_cpu; |
2628 | 2649 | ||
2629 | sa->period_contrib += delta; | 2650 | sa->period_contrib += delta; |
2630 | 2651 | ||
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2634 | cfs_rq->runnable_load_avg = | 2655 | cfs_rq->runnable_load_avg = |
2635 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); | 2656 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); |
2636 | } | 2657 | } |
2637 | sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; | 2658 | sa->util_avg = sa->util_sum / LOAD_AVG_MAX; |
2638 | } | 2659 | } |
2639 | 2660 | ||
2640 | return decayed; | 2661 | return decayed; |
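
Editor's note: with this hunk every accrued slice is first made frequency-invariant via cap_scale(delta, scale_freq) and, for utilization, additionally scaled by the CPU's microarchitectural capacity (scale_cpu), so the running sums become comparable across CPUs and frequencies; util_avg is then kept directly in the [0..1024] capacity range (util_sum / LOAD_AVG_MAX) rather than shifted by SCHED_LOAD_SHIFT. A standalone arithmetic sketch (the capacity numbers below are invented, only the macro mirrors the diff):

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10
    #define SCHED_CAPACITY_SCALE (1 << SCHED_CAPACITY_SHIFT)     /* 1024 */
    #define cap_scale(v, s) ((v) * (s) >> SCHED_CAPACITY_SHIFT)  /* from the diff */

    int main(void)
    {
        unsigned long delta      = 1000; /* time slice being accrued       */
        unsigned long scale_freq = 512;  /* running at 50% of max freq     */
        unsigned long scale_cpu  = 768;  /* a "little" CPU at 75% capacity */

        /* Frequency-invariant contribution, used for load and utilization. */
        unsigned long scaled_delta = cap_scale(delta, scale_freq);      /* 500 */

        /* Utilization is additionally CPU-invariant. */
        unsigned long util_contrib = scaled_delta * scale_cpu;

        printf("scaled_delta = %lu\n", scaled_delta);
        printf("util contribution = %lu (vs %lu at full freq on a big CPU)\n",
               util_contrib, delta * SCHED_CAPACITY_SCALE);
        return 0;
    }
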
@@ -2677,8 +2698,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
2677 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { | 2698 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { |
2678 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); | 2699 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); |
2679 | sa->util_avg = max_t(long, sa->util_avg - r, 0); | 2700 | sa->util_avg = max_t(long, sa->util_avg - r, 0); |
2680 | sa->util_sum = max_t(s32, sa->util_sum - | 2701 | sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); |
2681 | ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); | ||
2682 | } | 2702 | } |
2683 | 2703 | ||
2684 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | 2704 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, |
@@ -2696,33 +2716,70 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
2696 | static inline void update_load_avg(struct sched_entity *se, int update_tg) | 2716 | static inline void update_load_avg(struct sched_entity *se, int update_tg) |
2697 | { | 2717 | { |
2698 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2718 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2699 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2700 | u64 now = cfs_rq_clock_task(cfs_rq); | 2719 | u64 now = cfs_rq_clock_task(cfs_rq); |
2720 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2701 | 2721 | ||
2702 | /* | 2722 | /* |
2703 | * Track task load average for carrying it to new CPU after migrated, and | 2723 | * Track task load average for carrying it to new CPU after migrated, and |
2704 | * track group sched_entity load average for task_h_load calc in migration | 2724 | * track group sched_entity load average for task_h_load calc in migration |
2705 | */ | 2725 | */ |
2706 | __update_load_avg(now, cpu, &se->avg, | 2726 | __update_load_avg(now, cpu, &se->avg, |
2707 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); | 2727 | se->on_rq * scale_load_down(se->load.weight), |
2728 | cfs_rq->curr == se, NULL); | ||
2708 | 2729 | ||
2709 | if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) | 2730 | if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) |
2710 | update_tg_load_avg(cfs_rq, 0); | 2731 | update_tg_load_avg(cfs_rq, 0); |
2711 | } | 2732 | } |
2712 | 2733 | ||
2734 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2735 | { | ||
2736 | if (!sched_feat(ATTACH_AGE_LOAD)) | ||
2737 | goto skip_aging; | ||
2738 | |||
2739 | /* | ||
2740 | * If we got migrated (either between CPUs or between cgroups) we'll | ||
2741 | * have aged the average right before clearing @last_update_time. | ||
2742 | */ | ||
2743 | if (se->avg.last_update_time) { | ||
2744 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | ||
2745 | &se->avg, 0, 0, NULL); | ||
2746 | |||
2747 | /* | ||
2748 | * XXX: we could have just aged the entire load away if we've been | ||
2749 | * absent from the fair class for too long. | ||
2750 | */ | ||
2751 | } | ||
2752 | |||
2753 | skip_aging: | ||
2754 | se->avg.last_update_time = cfs_rq->avg.last_update_time; | ||
2755 | cfs_rq->avg.load_avg += se->avg.load_avg; | ||
2756 | cfs_rq->avg.load_sum += se->avg.load_sum; | ||
2757 | cfs_rq->avg.util_avg += se->avg.util_avg; | ||
2758 | cfs_rq->avg.util_sum += se->avg.util_sum; | ||
2759 | } | ||
2760 | |||
2761 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2762 | { | ||
2763 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | ||
2764 | &se->avg, se->on_rq * scale_load_down(se->load.weight), | ||
2765 | cfs_rq->curr == se, NULL); | ||
2766 | |||
2767 | cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | ||
2768 | cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | ||
2769 | cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | ||
2770 | cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | ||
2771 | } | ||
2772 | |||
2713 | /* Add the load generated by se into cfs_rq's load average */ | 2773 | /* Add the load generated by se into cfs_rq's load average */ |
2714 | static inline void | 2774 | static inline void |
2715 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2775 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2716 | { | 2776 | { |
2717 | struct sched_avg *sa = &se->avg; | 2777 | struct sched_avg *sa = &se->avg; |
2718 | u64 now = cfs_rq_clock_task(cfs_rq); | 2778 | u64 now = cfs_rq_clock_task(cfs_rq); |
2719 | int migrated = 0, decayed; | 2779 | int migrated, decayed; |
2720 | 2780 | ||
2721 | if (sa->last_update_time == 0) { | 2781 | migrated = !sa->last_update_time; |
2722 | sa->last_update_time = now; | 2782 | if (!migrated) { |
2723 | migrated = 1; | ||
2724 | } | ||
2725 | else { | ||
2726 | __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | 2783 | __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, |
2727 | se->on_rq * scale_load_down(se->load.weight), | 2784 | se->on_rq * scale_load_down(se->load.weight), |
2728 | cfs_rq->curr == se, NULL); | 2785 | cfs_rq->curr == se, NULL); |
@@ -2733,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2733 | cfs_rq->runnable_load_avg += sa->load_avg; | 2790 | cfs_rq->runnable_load_avg += sa->load_avg; |
2734 | cfs_rq->runnable_load_sum += sa->load_sum; | 2791 | cfs_rq->runnable_load_sum += sa->load_sum; |
2735 | 2792 | ||
2736 | if (migrated) { | 2793 | if (migrated) |
2737 | cfs_rq->avg.load_avg += sa->load_avg; | 2794 | attach_entity_load_avg(cfs_rq, se); |
2738 | cfs_rq->avg.load_sum += sa->load_sum; | ||
2739 | cfs_rq->avg.util_avg += sa->util_avg; | ||
2740 | cfs_rq->avg.util_sum += sa->util_sum; | ||
2741 | } | ||
2742 | 2795 | ||
2743 | if (decayed || migrated) | 2796 | if (decayed || migrated) |
2744 | update_tg_load_avg(cfs_rq, 0); | 2797 | update_tg_load_avg(cfs_rq, 0); |
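
Editor's note: attach_entity_load_avg()/detach_entity_load_avg() factor out the "fold a task's averages into / out of its cfs_rq" steps so that wakeup migration, class switches and cgroup moves all share one code path; detach clamps at zero because decay can make the sums drift apart slightly. A toy model of the pairing (standalone; the struct is a stand-in, not the kernel's sched_avg):

    #include <stdio.h>

    struct avg { long load_avg; long util_avg; };

    /* Fold an entity's contribution into the runqueue aggregate. */
    static void attach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg += se->load_avg;
        rq->util_avg += se->util_avg;
    }

    /* Remove it again; clamp at zero since the numbers may have drifted. */
    static long sub_clamp(long a, long b) { return a - b > 0 ? a - b : 0; }

    static void detach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg = sub_clamp(rq->load_avg, se->load_avg);
        rq->util_avg = sub_clamp(rq->util_avg, se->util_avg);
    }

    int main(void)
    {
        struct avg rq = { 2048, 900 };
        struct avg se = {  512, 300 };

        detach(&rq, &se);   /* task leaves this cfs_rq (migration, cgroup move) */
        printf("after detach: load=%ld util=%ld\n", rq.load_avg, rq.util_avg);
        attach(&rq, &se);   /* and is folded into the destination again         */
        printf("after attach: load=%ld util=%ld\n", rq.load_avg, rq.util_avg);
        return 0;
    }
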
@@ -2753,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2753 | cfs_rq->runnable_load_avg = | 2806 | cfs_rq->runnable_load_avg = |
2754 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); | 2807 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); |
2755 | cfs_rq->runnable_load_sum = | 2808 | cfs_rq->runnable_load_sum = |
2756 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); | 2809 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); |
2757 | } | 2810 | } |
2758 | 2811 | ||
2759 | /* | 2812 | /* |
@@ -2821,6 +2874,11 @@ static inline void | |||
2821 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 2874 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
2822 | static inline void remove_entity_load_avg(struct sched_entity *se) {} | 2875 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
2823 | 2876 | ||
2877 | static inline void | ||
2878 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | ||
2879 | static inline void | ||
2880 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | ||
2881 | |||
2824 | static inline int idle_balance(struct rq *rq) | 2882 | static inline int idle_balance(struct rq *rq) |
2825 | { | 2883 | { |
2826 | return 0; | 2884 | return 0; |
@@ -4817,32 +4875,39 @@ next: | |||
4817 | done: | 4875 | done: |
4818 | return target; | 4876 | return target; |
4819 | } | 4877 | } |
4878 | |||
4820 | /* | 4879 | /* |
4821 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | 4880 | * cpu_util returns the amount of capacity of a CPU that is used by CFS |
4822 | * tasks. The unit of the return value must be the one of capacity so we can | 4881 | * tasks. The unit of the return value must be the one of capacity so we can |
4823 | * compare the usage with the capacity of the CPU that is available for CFS | 4882 | * compare the utilization with the capacity of the CPU that is available for |
4824 | * task (ie cpu_capacity). | 4883 | * CFS task (ie cpu_capacity). |
4825 | * cfs.avg.util_avg is the sum of running time of runnable tasks on a | 4884 | * |
4826 | * CPU. It represents the amount of utilization of a CPU in the range | 4885 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the |
4827 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | 4886 | * recent utilization of currently non-runnable tasks on a CPU. It represents |
4828 | * capacity of the CPU because it's about the running time on this CPU. | 4887 | * the amount of utilization of a CPU in the range [0..capacity_orig] where |
4829 | * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE | 4888 | * capacity_orig is the cpu_capacity available at the highest frequency |
4830 | * because of unfortunate rounding in util_avg or just | 4889 | * (arch_scale_freq_capacity()). |
4831 | * after migrating tasks until the average stabilizes with the new running | 4890 | * The utilization of a CPU converges towards a sum equal to or less than the |
4832 | * time. So we need to check that the usage stays into the range | 4891 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is |
4833 | * [0..cpu_capacity_orig] and cap if necessary. | 4892 | * the running time on this CPU scaled by capacity_curr. |
4834 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | 4893 | * |
4835 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | 4894 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even |
4895 | * higher than capacity_orig because of unfortunate rounding in | ||
4896 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until | ||
4897 | * the average stabilizes with the new running time. We need to check that the | ||
4898 | * utilization stays within the range of [0..capacity_orig] and cap it if | ||
4899 | * necessary. Without utilization capping, a group could be seen as overloaded | ||
4900 | * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of | ||
4901 | * available capacity. We allow utilization to overshoot capacity_curr (but not | ||
4902 | * capacity_orig) as it useful for predicting the capacity required after task | ||
4903 | * migrations (scheduler-driven DVFS). | ||
4836 | */ | 4904 | */ |
4837 | static int get_cpu_usage(int cpu) | 4905 | static int cpu_util(int cpu) |
4838 | { | 4906 | { |
4839 | unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; | 4907 | unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; |
4840 | unsigned long capacity = capacity_orig_of(cpu); | 4908 | unsigned long capacity = capacity_orig_of(cpu); |
4841 | 4909 | ||
4842 | if (usage >= SCHED_LOAD_SCALE) | 4910 | return (util >= capacity) ? capacity : util; |
4843 | return capacity; | ||
4844 | |||
4845 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
4846 | } | 4911 | } |
4847 | 4912 | ||
4848 | /* | 4913 | /* |
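
Editor's note: the rename from get_cpu_usage() to cpu_util() also simplifies the math: since util_avg is already in capacity units after the __update_load_avg changes above, the value only has to be clamped to capacity_orig instead of being rescaled through SCHED_LOAD_SHIFT. A standalone sketch with invented numbers:

    #include <stdio.h>

    /* Clamp a CPU's tracked utilization to its original capacity. */
    static unsigned long cpu_util(unsigned long util_avg, unsigned long capacity_orig)
    {
        return util_avg >= capacity_orig ? capacity_orig : util_avg;
    }

    int main(void)
    {
        /* Utilization may transiently overshoot (rounding, fresh migrations). */
        printf("%lu\n", cpu_util(1100, 1024));   /* capped to 1024 */
        printf("%lu\n", cpu_util(300, 1024));    /* reported as-is */
        return 0;
    }
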
@@ -4945,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4945 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 5010 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no |
4946 | * other assumptions, including the state of rq->lock, should be made. | 5011 | * other assumptions, including the state of rq->lock, should be made. |
4947 | */ | 5012 | */ |
4948 | static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) | 5013 | static void migrate_task_rq_fair(struct task_struct *p) |
4949 | { | 5014 | { |
4950 | /* | 5015 | /* |
4951 | * We are supposed to update the task to "current" time, then its up to date | 5016 | * We are supposed to update the task to "current" time, then its up to date |
@@ -5525,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
5525 | unsigned long src_faults, dst_faults; | 5590 | unsigned long src_faults, dst_faults; |
5526 | int src_nid, dst_nid; | 5591 | int src_nid, dst_nid; |
5527 | 5592 | ||
5528 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | 5593 | if (!static_branch_likely(&sched_numa_balancing)) |
5529 | return -1; | 5594 | return -1; |
5530 | 5595 | ||
5531 | if (!sched_feat(NUMA)) | 5596 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
5532 | return -1; | 5597 | return -1; |
5533 | 5598 | ||
5534 | src_nid = cpu_to_node(env->src_cpu); | 5599 | src_nid = cpu_to_node(env->src_cpu); |
@@ -5934,7 +5999,7 @@ struct sg_lb_stats { | |||
5934 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5999 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5935 | unsigned long load_per_task; | 6000 | unsigned long load_per_task; |
5936 | unsigned long group_capacity; | 6001 | unsigned long group_capacity; |
5937 | unsigned long group_usage; /* Total usage of the group */ | 6002 | unsigned long group_util; /* Total utilization of the group */ |
5938 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 6003 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5939 | unsigned int idle_cpus; | 6004 | unsigned int idle_cpus; |
5940 | unsigned int group_weight; | 6005 | unsigned int group_weight; |
@@ -6010,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
6010 | return load_idx; | 6075 | return load_idx; |
6011 | } | 6076 | } |
6012 | 6077 | ||
6013 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
6014 | { | ||
6015 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | ||
6016 | return sd->smt_gain / sd->span_weight; | ||
6017 | |||
6018 | return SCHED_CAPACITY_SCALE; | ||
6019 | } | ||
6020 | |||
6021 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
6022 | { | ||
6023 | return default_scale_cpu_capacity(sd, cpu); | ||
6024 | } | ||
6025 | |||
6026 | static unsigned long scale_rt_capacity(int cpu) | 6078 | static unsigned long scale_rt_capacity(int cpu) |
6027 | { | 6079 | { |
6028 | struct rq *rq = cpu_rq(cpu); | 6080 | struct rq *rq = cpu_rq(cpu); |
@@ -6052,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu) | |||
6052 | 6104 | ||
6053 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6105 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
6054 | { | 6106 | { |
6055 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 6107 | unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); |
6056 | struct sched_group *sdg = sd->groups; | 6108 | struct sched_group *sdg = sd->groups; |
6057 | 6109 | ||
6058 | if (sched_feat(ARCH_CAPACITY)) | ||
6059 | capacity *= arch_scale_cpu_capacity(sd, cpu); | ||
6060 | else | ||
6061 | capacity *= default_scale_cpu_capacity(sd, cpu); | ||
6062 | |||
6063 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
6064 | |||
6065 | cpu_rq(cpu)->cpu_capacity_orig = capacity; | 6110 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
6066 | 6111 | ||
6067 | capacity *= scale_rt_capacity(cpu); | 6112 | capacity *= scale_rt_capacity(cpu); |
@@ -6187,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
6187 | * group_has_capacity returns true if the group has spare capacity that could | 6232 | * group_has_capacity returns true if the group has spare capacity that could |
6188 | * be used by some tasks. | 6233 | * be used by some tasks. |
6189 | * We consider that a group has spare capacity if the * number of task is | 6234 | * We consider that a group has spare capacity if the * number of task is |
6190 | * smaller than the number of CPUs or if the usage is lower than the available | 6235 | * smaller than the number of CPUs or if the utilization is lower than the |
6191 | * capacity for CFS tasks. | 6236 | * available capacity for CFS tasks. |
6192 | * For the latter, we use a threshold to stabilize the state, to take into | 6237 | * For the latter, we use a threshold to stabilize the state, to take into |
6193 | * account the variance of the tasks' load and to return true if the available | 6238 | * account the variance of the tasks' load and to return true if the available |
6194 | * capacity in meaningful for the load balancer. | 6239 | * capacity in meaningful for the load balancer. |
@@ -6202,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | |||
6202 | return true; | 6247 | return true; |
6203 | 6248 | ||
6204 | if ((sgs->group_capacity * 100) > | 6249 | if ((sgs->group_capacity * 100) > |
6205 | (sgs->group_usage * env->sd->imbalance_pct)) | 6250 | (sgs->group_util * env->sd->imbalance_pct)) |
6206 | return true; | 6251 | return true; |
6207 | 6252 | ||
6208 | return false; | 6253 | return false; |
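
Editor's note: group_has_capacity()/group_is_overloaded() keep comparing capacity against the (renamed) utilization through sd->imbalance_pct, which acts as a hysteresis margin: with the common imbalance_pct of 117, a group counts as having spare capacity only while utilization stays below roughly 100/117 ≈ 85% of its capacity. A standalone sketch of just the capacity/utilization test (the nr_running checks of the real predicates are omitted, and 117 is a typical default, not taken from this diff):

    #include <stdio.h>

    static int group_has_capacity(unsigned long capacity, unsigned long util,
                                  unsigned int imbalance_pct)
    {
        return capacity * 100 > util * imbalance_pct;
    }

    static int group_is_overloaded(unsigned long capacity, unsigned long util,
                                   unsigned int imbalance_pct)
    {
        return capacity * 100 < util * imbalance_pct;
    }

    int main(void)
    {
        unsigned long capacity = 2048;  /* two full CPUs worth of capacity */
        unsigned int  pct = 117;        /* typical sd->imbalance_pct       */

        printf("util=1600: has_capacity=%d overloaded=%d\n",
               group_has_capacity(capacity, 1600, pct),
               group_is_overloaded(capacity, 1600, pct));
        printf("util=1900: has_capacity=%d overloaded=%d\n",
               group_has_capacity(capacity, 1900, pct),
               group_is_overloaded(capacity, 1900, pct));
        return 0;
    }
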
@@ -6223,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | |||
6223 | return false; | 6268 | return false; |
6224 | 6269 | ||
6225 | if ((sgs->group_capacity * 100) < | 6270 | if ((sgs->group_capacity * 100) < |
6226 | (sgs->group_usage * env->sd->imbalance_pct)) | 6271 | (sgs->group_util * env->sd->imbalance_pct)) |
6227 | return true; | 6272 | return true; |
6228 | 6273 | ||
6229 | return false; | 6274 | return false; |
6230 | } | 6275 | } |
6231 | 6276 | ||
6232 | static enum group_type group_classify(struct lb_env *env, | 6277 | static inline enum |
6233 | struct sched_group *group, | 6278 | group_type group_classify(struct sched_group *group, |
6234 | struct sg_lb_stats *sgs) | 6279 | struct sg_lb_stats *sgs) |
6235 | { | 6280 | { |
6236 | if (sgs->group_no_capacity) | 6281 | if (sgs->group_no_capacity) |
6237 | return group_overloaded; | 6282 | return group_overloaded; |
@@ -6271,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6271 | load = source_load(i, load_idx); | 6316 | load = source_load(i, load_idx); |
6272 | 6317 | ||
6273 | sgs->group_load += load; | 6318 | sgs->group_load += load; |
6274 | sgs->group_usage += get_cpu_usage(i); | 6319 | sgs->group_util += cpu_util(i); |
6275 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6320 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6276 | 6321 | ||
6277 | if (rq->nr_running > 1) | 6322 | if (rq->nr_running > 1) |
@@ -6296,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6296 | sgs->group_weight = group->group_weight; | 6341 | sgs->group_weight = group->group_weight; |
6297 | 6342 | ||
6298 | sgs->group_no_capacity = group_is_overloaded(env, sgs); | 6343 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
6299 | sgs->group_type = group_classify(env, group, sgs); | 6344 | sgs->group_type = group_classify(group, sgs); |
6300 | } | 6345 | } |
6301 | 6346 | ||
6302 | /** | 6347 | /** |
@@ -6430,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6430 | group_has_capacity(env, &sds->local_stat) && | 6475 | group_has_capacity(env, &sds->local_stat) && |
6431 | (sgs->sum_nr_running > 1)) { | 6476 | (sgs->sum_nr_running > 1)) { |
6432 | sgs->group_no_capacity = 1; | 6477 | sgs->group_no_capacity = 1; |
6433 | sgs->group_type = group_overloaded; | 6478 | sgs->group_type = group_classify(sg, sgs); |
6434 | } | 6479 | } |
6435 | 6480 | ||
6436 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6481 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
@@ -7610,8 +7655,22 @@ out: | |||
7610 | * When the cpu is attached to null domain for ex, it will not be | 7655 | * When the cpu is attached to null domain for ex, it will not be |
7611 | * updated. | 7656 | * updated. |
7612 | */ | 7657 | */ |
7613 | if (likely(update_next_balance)) | 7658 | if (likely(update_next_balance)) { |
7614 | rq->next_balance = next_balance; | 7659 | rq->next_balance = next_balance; |
7660 | |||
7661 | #ifdef CONFIG_NO_HZ_COMMON | ||
7662 | /* | ||
7663 | * If this CPU has been elected to perform the nohz idle | ||
7664 | * balance. Other idle CPUs have already rebalanced with | ||
7665 | * nohz_idle_balance() and nohz.next_balance has been | ||
7666 | * updated accordingly. This CPU is now running the idle load | ||
7667 | * balance for itself and we need to update the | ||
7668 | * nohz.next_balance accordingly. | ||
7669 | */ | ||
7670 | if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) | ||
7671 | nohz.next_balance = rq->next_balance; | ||
7672 | #endif | ||
7673 | } | ||
7615 | } | 7674 | } |
7616 | 7675 | ||
7617 | #ifdef CONFIG_NO_HZ_COMMON | 7676 | #ifdef CONFIG_NO_HZ_COMMON |
@@ -7624,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7624 | int this_cpu = this_rq->cpu; | 7683 | int this_cpu = this_rq->cpu; |
7625 | struct rq *rq; | 7684 | struct rq *rq; |
7626 | int balance_cpu; | 7685 | int balance_cpu; |
7686 | /* Earliest time when we have to do rebalance again */ | ||
7687 | unsigned long next_balance = jiffies + 60*HZ; | ||
7688 | int update_next_balance = 0; | ||
7627 | 7689 | ||
7628 | if (idle != CPU_IDLE || | 7690 | if (idle != CPU_IDLE || |
7629 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 7691 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
@@ -7655,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7655 | rebalance_domains(rq, CPU_IDLE); | 7717 | rebalance_domains(rq, CPU_IDLE); |
7656 | } | 7718 | } |
7657 | 7719 | ||
7658 | if (time_after(this_rq->next_balance, rq->next_balance)) | 7720 | if (time_after(next_balance, rq->next_balance)) { |
7659 | this_rq->next_balance = rq->next_balance; | 7721 | next_balance = rq->next_balance; |
7722 | update_next_balance = 1; | ||
7723 | } | ||
7660 | } | 7724 | } |
7661 | nohz.next_balance = this_rq->next_balance; | 7725 | |
7726 | /* | ||
7727 | * next_balance will be updated only when there is a need. | ||
7728 | * When the CPU is attached to null domain for ex, it will not be | ||
7729 | * updated. | ||
7730 | */ | ||
7731 | if (likely(update_next_balance)) | ||
7732 | nohz.next_balance = next_balance; | ||
7662 | end: | 7733 | end: |
7663 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | 7734 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); |
7664 | } | 7735 | } |
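
Editor's note: both hunks above replace direct writes of this_rq->next_balance / nohz.next_balance with an explicit "earliest deadline seen so far" accumulation that is only published when a domain actually produced a value, mirroring what rebalance_domains() already did. A standalone sketch of the pattern (jiffies replaced by a plain counter, values invented):

    #include <stdio.h>

    /* time_after(a, b): true if a is later than b, wraparound-safe. */
    #define time_after(a, b) ((long)((b) - (a)) < 0)

    int main(void)
    {
        unsigned long jiffies = 1000;
        unsigned long next_balance = jiffies + 60 * 250;  /* "far future" default */
        int update_next_balance = 0;

        /* next_balance values collected from the idle CPUs we balanced for. */
        unsigned long rq_next[] = { 1300, 1120, 1450 };

        for (int i = 0; i < 3; i++) {
            if (time_after(next_balance, rq_next[i])) {
                next_balance = rq_next[i];   /* keep the earliest one */
                update_next_balance = 1;
            }
        }

        if (update_next_balance)   /* only publish if something was found */
            printf("nohz.next_balance = %lu\n", next_balance);   /* 1120 */
        return 0;
    }
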
@@ -7811,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
7811 | entity_tick(cfs_rq, se, queued); | 7882 | entity_tick(cfs_rq, se, queued); |
7812 | } | 7883 | } |
7813 | 7884 | ||
7814 | if (numabalancing_enabled) | 7885 | if (static_branch_unlikely(&sched_numa_balancing)) |
7815 | task_tick_numa(rq, curr); | 7886 | task_tick_numa(rq, curr); |
7816 | } | 7887 | } |
7817 | 7888 | ||
@@ -7887,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | |||
7887 | check_preempt_curr(rq, p, 0); | 7958 | check_preempt_curr(rq, p, 0); |
7888 | } | 7959 | } |
7889 | 7960 | ||
7890 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | 7961 | static inline bool vruntime_normalized(struct task_struct *p) |
7891 | { | 7962 | { |
7892 | struct sched_entity *se = &p->se; | 7963 | struct sched_entity *se = &p->se; |
7893 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
7894 | 7964 | ||
7895 | /* | 7965 | /* |
7896 | * Ensure the task's vruntime is normalized, so that when it's | 7966 | * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, |
7897 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7967 | * the dequeue_entity(.flags=0) will already have normalized the |
7898 | * do the right thing. | 7968 | * vruntime. |
7969 | */ | ||
7970 | if (p->on_rq) | ||
7971 | return true; | ||
7972 | |||
7973 | /* | ||
7974 | * When !on_rq, vruntime of the task has usually NOT been normalized. | ||
7975 | * But there are some cases where it has already been normalized: | ||
7899 | * | 7976 | * |
7900 | * If it's queued, then the dequeue_entity(.flags=0) will already | 7977 | * - A forked child which is waiting for being woken up by |
7901 | * have normalized the vruntime, if it's !queued, then only when | 7978 | * wake_up_new_task(). |
7902 | * the task is sleeping will it still have non-normalized vruntime. | 7979 | * - A task which has been woken up by try_to_wake_up() and |
7980 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
7903 | */ | 7981 | */ |
7904 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { | 7982 | if (!se->sum_exec_runtime || p->state == TASK_WAKING) |
7983 | return true; | ||
7984 | |||
7985 | return false; | ||
7986 | } | ||
7987 | |||
7988 | static void detach_task_cfs_rq(struct task_struct *p) | ||
7989 | { | ||
7990 | struct sched_entity *se = &p->se; | ||
7991 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
7992 | |||
7993 | if (!vruntime_normalized(p)) { | ||
7905 | /* | 7994 | /* |
7906 | * Fix up our vruntime so that the current sleep doesn't | 7995 | * Fix up our vruntime so that the current sleep doesn't |
7907 | * cause 'unlimited' sleep bonus. | 7996 | * cause 'unlimited' sleep bonus. |
@@ -7910,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
7910 | se->vruntime -= cfs_rq->min_vruntime; | 7999 | se->vruntime -= cfs_rq->min_vruntime; |
7911 | } | 8000 | } |
7912 | 8001 | ||
7913 | #ifdef CONFIG_SMP | ||
7914 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8002 | /* Catch up with the cfs_rq and remove our load when we leave */ |
7915 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, | 8003 | detach_entity_load_avg(cfs_rq, se); |
7916 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); | ||
7917 | |||
7918 | cfs_rq->avg.load_avg = | ||
7919 | max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | ||
7920 | cfs_rq->avg.load_sum = | ||
7921 | max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | ||
7922 | cfs_rq->avg.util_avg = | ||
7923 | max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | ||
7924 | cfs_rq->avg.util_sum = | ||
7925 | max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | ||
7926 | #endif | ||
7927 | } | 8004 | } |
7928 | 8005 | ||
7929 | /* | 8006 | static void attach_task_cfs_rq(struct task_struct *p) |
7930 | * We switched to the sched_fair class. | ||
7931 | */ | ||
7932 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | ||
7933 | { | 8007 | { |
7934 | struct sched_entity *se = &p->se; | 8008 | struct sched_entity *se = &p->se; |
8009 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
7935 | 8010 | ||
7936 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8011 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7937 | /* | 8012 | /* |
@@ -7941,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) | |||
7941 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 8016 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
7942 | #endif | 8017 | #endif |
7943 | 8018 | ||
7944 | if (!task_on_rq_queued(p)) { | 8019 | /* Synchronize task with its cfs_rq */ |
8020 | attach_entity_load_avg(cfs_rq, se); | ||
7945 | 8021 | ||
8022 | if (!vruntime_normalized(p)) | ||
8023 | se->vruntime += cfs_rq->min_vruntime; | ||
8024 | } | ||
8025 | |||
8026 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
8027 | { | ||
8028 | detach_task_cfs_rq(p); | ||
8029 | } | ||
8030 | |||
8031 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | ||
8032 | { | ||
8033 | attach_task_cfs_rq(p); | ||
8034 | |||
8035 | if (task_on_rq_queued(p)) { | ||
7946 | /* | 8036 | /* |
7947 | * Ensure the task has a non-normalized vruntime when it is switched | 8037 | * We were most likely switched from sched_rt, so |
7948 | * back to the fair class with !queued, so that enqueue_entity() at | 8038 | * kick off the schedule if running, otherwise just see |
7949 | * wake-up time will do the right thing. | 8039 | * if we can still preempt the current task. |
7950 | * | ||
7951 | * If it's queued, then the enqueue_entity(.flags=0) makes the task | ||
7952 | * has non-normalized vruntime, if it's !queued, then it still has | ||
7953 | * normalized vruntime. | ||
7954 | */ | 8040 | */ |
7955 | if (p->state != TASK_RUNNING) | 8041 | if (rq->curr == p) |
7956 | se->vruntime += cfs_rq_of(se)->min_vruntime; | 8042 | resched_curr(rq); |
7957 | return; | 8043 | else |
8044 | check_preempt_curr(rq, p, 0); | ||
7958 | } | 8045 | } |
7959 | |||
7960 | /* | ||
7961 | * We were most likely switched from sched_rt, so | ||
7962 | * kick off the schedule if running, otherwise just see | ||
7963 | * if we can still preempt the current task. | ||
7964 | */ | ||
7965 | if (rq->curr == p) | ||
7966 | resched_curr(rq); | ||
7967 | else | ||
7968 | check_preempt_curr(rq, p, 0); | ||
7969 | } | 8046 | } |
7970 | 8047 | ||
7971 | /* Account for a task changing its policy or group. | 8048 | /* Account for a task changing its policy or group. |
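
Editor's note: switched_from_fair(), switched_to_fair() and (below) task_move_group_fair() now funnel through detach_task_cfs_rq()/attach_task_cfs_rq(), with vruntime_normalized() capturing in one place when a task's vruntime is already in relative form (queued, freshly forked, or mid-wakeup). A toy model of the subtract-on-detach / add-on-attach dance (standalone; the min_vruntime values are invented):

    #include <stdio.h>

    /* Leaving a cfs_rq: turn the absolute vruntime into an offset from the old
     * queue's min_vruntime, unless it is already in relative (normalized) form. */
    static unsigned long long detach_vruntime(unsigned long long vruntime,
                                              unsigned long long min_vruntime,
                                              int normalized)
    {
        return normalized ? vruntime : vruntime - min_vruntime;
    }

    /* Joining a cfs_rq: rebase that offset onto the new queue's min_vruntime. */
    static unsigned long long attach_vruntime(unsigned long long vruntime,
                                              unsigned long long min_vruntime,
                                              int normalized)
    {
        return normalized ? vruntime : vruntime + min_vruntime;
    }

    int main(void)
    {
        unsigned long long v = 5001000;       /* sleeping task, absolute vruntime */
        v = detach_vruntime(v, 5000000, 0);   /* leave old cfs_rq: offset 1000    */
        v = attach_vruntime(v, 9000000, 0);   /* join new cfs_rq                  */
        printf("vruntime on new cfs_rq = %llu\n", v);   /* 9001000 */
        return 0;
    }
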
@@ -8000,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
8000 | } | 8077 | } |
8001 | 8078 | ||
8002 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8079 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8003 | static void task_move_group_fair(struct task_struct *p, int queued) | 8080 | static void task_move_group_fair(struct task_struct *p) |
8004 | { | 8081 | { |
8005 | struct sched_entity *se = &p->se; | 8082 | detach_task_cfs_rq(p); |
8006 | struct cfs_rq *cfs_rq; | ||
8007 | |||
8008 | /* | ||
8009 | * If the task was not on the rq at the time of this cgroup movement | ||
8010 | * it must have been asleep, sleeping tasks keep their ->vruntime | ||
8011 | * absolute on their old rq until wakeup (needed for the fair sleeper | ||
8012 | * bonus in place_entity()). | ||
8013 | * | ||
8014 | * If it was on the rq, we've just 'preempted' it, which does convert | ||
8015 | * ->vruntime to a relative base. | ||
8016 | * | ||
8017 | * Make sure both cases convert their relative position when migrating | ||
8018 | * to another cgroup's rq. This does somewhat interfere with the | ||
8019 | * fair sleeper stuff for the first placement, but who cares. | ||
8020 | */ | ||
8021 | /* | ||
8022 | * When !queued, vruntime of the task has usually NOT been normalized. | ||
8023 | * But there are some cases where it has already been normalized: | ||
8024 | * | ||
8025 | * - Moving a forked child which is waiting for being woken up by | ||
8026 | * wake_up_new_task(). | ||
8027 | * - Moving a task which has been woken up by try_to_wake_up() and | ||
8028 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
8029 | * | ||
8030 | * To prevent boost or penalty in the new cfs_rq caused by delta | ||
8031 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | ||
8032 | */ | ||
8033 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | ||
8034 | queued = 1; | ||
8035 | |||
8036 | if (!queued) | ||
8037 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | ||
8038 | set_task_rq(p, task_cpu(p)); | 8083 | set_task_rq(p, task_cpu(p)); |
8039 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
8040 | if (!queued) { | ||
8041 | cfs_rq = cfs_rq_of(se); | ||
8042 | se->vruntime += cfs_rq->min_vruntime; | ||
8043 | 8084 | ||
8044 | #ifdef CONFIG_SMP | 8085 | #ifdef CONFIG_SMP |
8045 | /* Virtually synchronize task with its new cfs_rq */ | 8086 | /* Tell se's cfs_rq has been changed -- migrated */ |
8046 | p->se.avg.last_update_time = cfs_rq->avg.last_update_time; | 8087 | p->se.avg.last_update_time = 0; |
8047 | cfs_rq->avg.load_avg += p->se.avg.load_avg; | ||
8048 | cfs_rq->avg.load_sum += p->se.avg.load_sum; | ||
8049 | cfs_rq->avg.util_avg += p->se.avg.util_avg; | ||
8050 | cfs_rq->avg.util_sum += p->se.avg.util_sum; | ||
8051 | #endif | 8088 | #endif |
8052 | } | 8089 | attach_task_cfs_rq(p); |
8053 | } | 8090 | } |
8054 | 8091 | ||
8055 | void free_fair_sched_group(struct task_group *tg) | 8092 | void free_fair_sched_group(struct task_group *tg) |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 83a50e7ca533..69631fa46c2f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
36 | */ | 36 | */ |
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | 37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) |
38 | 38 | ||
39 | /* | ||
40 | * Use arch dependent cpu capacity functions | ||
41 | */ | ||
42 | SCHED_FEAT(ARCH_CAPACITY, true) | ||
43 | |||
44 | SCHED_FEAT(HRTICK, false) | 39 | SCHED_FEAT(HRTICK, false) |
45 | SCHED_FEAT(DOUBLE_TICK, false) | 40 | SCHED_FEAT(DOUBLE_TICK, false) |
46 | SCHED_FEAT(LB_BIAS, true) | 41 | SCHED_FEAT(LB_BIAS, true) |
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) | |||
72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 67 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 68 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
74 | SCHED_FEAT(LB_MIN, false) | 69 | SCHED_FEAT(LB_MIN, false) |
70 | SCHED_FEAT(ATTACH_AGE_LOAD, true) | ||
75 | 71 | ||
76 | /* | ||
77 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
78 | * at runtime if running on a NUMA machine. Can be controlled via | ||
79 | * numa_balancing= | ||
80 | */ | ||
81 | #ifdef CONFIG_NUMA_BALANCING | ||
82 | |||
83 | /* | ||
84 | * NUMA will favor moving tasks towards nodes where a higher number of | ||
85 | * hinting faults are recorded during active load balancing. It will | ||
86 | * resist moving tasks towards nodes where a lower number of hinting | ||
87 | * faults have been recorded. | ||
88 | */ | ||
89 | SCHED_FEAT(NUMA, true) | ||
90 | #endif | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d2ea59364a1c..e3cc16312046 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) | |||
635 | /* | 635 | /* |
636 | * We ran out of runtime, see if we can borrow some from our neighbours. | 636 | * We ran out of runtime, see if we can borrow some from our neighbours. |
637 | */ | 637 | */ |
638 | static int do_balance_runtime(struct rt_rq *rt_rq) | 638 | static void do_balance_runtime(struct rt_rq *rt_rq) |
639 | { | 639 | { |
640 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 640 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
641 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; | 641 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; |
642 | int i, weight, more = 0; | 642 | int i, weight; |
643 | u64 rt_period; | 643 | u64 rt_period; |
644 | 644 | ||
645 | weight = cpumask_weight(rd->span); | 645 | weight = cpumask_weight(rd->span); |
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) | |||
673 | diff = rt_period - rt_rq->rt_runtime; | 673 | diff = rt_period - rt_rq->rt_runtime; |
674 | iter->rt_runtime -= diff; | 674 | iter->rt_runtime -= diff; |
675 | rt_rq->rt_runtime += diff; | 675 | rt_rq->rt_runtime += diff; |
676 | more = 1; | ||
677 | if (rt_rq->rt_runtime == rt_period) { | 676 | if (rt_rq->rt_runtime == rt_period) { |
678 | raw_spin_unlock(&iter->rt_runtime_lock); | 677 | raw_spin_unlock(&iter->rt_runtime_lock); |
679 | break; | 678 | break; |
@@ -683,8 +682,6 @@ next: | |||
683 | raw_spin_unlock(&iter->rt_runtime_lock); | 682 | raw_spin_unlock(&iter->rt_runtime_lock); |
684 | } | 683 | } |
685 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 684 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
686 | |||
687 | return more; | ||
688 | } | 685 | } |
689 | 686 | ||
690 | /* | 687 | /* |
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq) | |||
796 | } | 793 | } |
797 | } | 794 | } |
798 | 795 | ||
799 | static int balance_runtime(struct rt_rq *rt_rq) | 796 | static void balance_runtime(struct rt_rq *rt_rq) |
800 | { | 797 | { |
801 | int more = 0; | ||
802 | |||
803 | if (!sched_feat(RT_RUNTIME_SHARE)) | 798 | if (!sched_feat(RT_RUNTIME_SHARE)) |
804 | return more; | 799 | return; |
805 | 800 | ||
806 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | 801 | if (rt_rq->rt_time > rt_rq->rt_runtime) { |
807 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 802 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
808 | more = do_balance_runtime(rt_rq); | 803 | do_balance_runtime(rt_rq); |
809 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 804 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
810 | } | 805 | } |
811 | |||
812 | return more; | ||
813 | } | 806 | } |
814 | #else /* !CONFIG_SMP */ | 807 | #else /* !CONFIG_SMP */ |
815 | static inline int balance_runtime(struct rt_rq *rt_rq) | 808 | static inline void balance_runtime(struct rt_rq *rt_rq) {} |
816 | { | ||
817 | return 0; | ||
818 | } | ||
819 | #endif /* CONFIG_SMP */ | 809 | #endif /* CONFIG_SMP */ |
820 | 810 | ||
821 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | 811 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119c7ad9..efd3bfc7e347 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } | |||
84 | */ | 84 | */ |
85 | #define RUNTIME_INF ((u64)~0ULL) | 85 | #define RUNTIME_INF ((u64)~0ULL) |
86 | 86 | ||
87 | static inline int idle_policy(int policy) | ||
88 | { | ||
89 | return policy == SCHED_IDLE; | ||
90 | } | ||
87 | static inline int fair_policy(int policy) | 91 | static inline int fair_policy(int policy) |
88 | { | 92 | { |
89 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | 93 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; |
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy) | |||
98 | { | 102 | { |
99 | return policy == SCHED_DEADLINE; | 103 | return policy == SCHED_DEADLINE; |
100 | } | 104 | } |
105 | static inline bool valid_policy(int policy) | ||
106 | { | ||
107 | return idle_policy(policy) || fair_policy(policy) || | ||
108 | rt_policy(policy) || dl_policy(policy); | ||
109 | } | ||
101 | 110 | ||
102 | static inline int task_has_rt_policy(struct task_struct *p) | 111 | static inline int task_has_rt_policy(struct task_struct *p) |
103 | { | 112 | { |
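
Editor's note: idle_policy() joins the existing per-class predicates, and valid_policy() composes them so the core can reject unknown policies with a single test. A standalone sketch of the same predicates over the standard uapi policy numbers:

    #include <stdio.h>

    #define SCHED_NORMAL   0
    #define SCHED_FIFO     1
    #define SCHED_RR       2
    #define SCHED_BATCH    3
    #define SCHED_IDLE     5
    #define SCHED_DEADLINE 6

    static int idle_policy(int p) { return p == SCHED_IDLE; }
    static int fair_policy(int p) { return p == SCHED_NORMAL || p == SCHED_BATCH; }
    static int rt_policy(int p)   { return p == SCHED_FIFO || p == SCHED_RR; }
    static int dl_policy(int p)   { return p == SCHED_DEADLINE; }

    /* A policy is valid iff one of the scheduling classes claims it. */
    static int valid_policy(int p)
    {
        return idle_policy(p) || fair_policy(p) || rt_policy(p) || dl_policy(p);
    }

    int main(void)
    {
        for (int p = 0; p <= 7; p++)
            printf("policy %d valid=%d\n", p, valid_policy(p));   /* 4 and 7 are not */
        return 0;
    }
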
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) | |||
109 | return dl_policy(p->policy); | 118 | return dl_policy(p->policy); |
110 | } | 119 | } |
111 | 120 | ||
112 | static inline bool dl_time_before(u64 a, u64 b) | ||
113 | { | ||
114 | return (s64)(a - b) < 0; | ||
115 | } | ||
116 | |||
117 | /* | 121 | /* |
118 | * Tells if entity @a should preempt entity @b. | 122 | * Tells if entity @a should preempt entity @b. |
119 | */ | 123 | */ |
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
1003 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 1007 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
1004 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 1008 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
1005 | 1009 | ||
1006 | #ifdef CONFIG_NUMA_BALANCING | 1010 | extern struct static_key_false sched_numa_balancing; |
1007 | #define sched_feat_numa(x) sched_feat(x) | ||
1008 | #ifdef CONFIG_SCHED_DEBUG | ||
1009 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
1010 | #else | ||
1011 | extern bool numabalancing_enabled; | ||
1012 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1013 | #else | ||
1014 | #define sched_feat_numa(x) (0) | ||
1015 | #define numabalancing_enabled (0) | ||
1016 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1017 | 1011 | ||
1018 | static inline u64 global_rt_period(void) | 1012 | static inline u64 global_rt_period(void) |
1019 | { | 1013 | { |
@@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { | |||
1157 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1151 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1158 | }; | 1152 | }; |
1159 | 1153 | ||
1160 | #define ENQUEUE_WAKEUP 1 | 1154 | #define ENQUEUE_WAKEUP 0x01 |
1161 | #define ENQUEUE_HEAD 2 | 1155 | #define ENQUEUE_HEAD 0x02 |
1162 | #ifdef CONFIG_SMP | 1156 | #ifdef CONFIG_SMP |
1163 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ | 1157 | #define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ |
1164 | #else | 1158 | #else |
1165 | #define ENQUEUE_WAKING 0 | 1159 | #define ENQUEUE_WAKING 0x00 |
1166 | #endif | 1160 | #endif |
1167 | #define ENQUEUE_REPLENISH 8 | 1161 | #define ENQUEUE_REPLENISH 0x08 |
1162 | #define ENQUEUE_RESTORE 0x10 | ||
1168 | 1163 | ||
1169 | #define DEQUEUE_SLEEP 1 | 1164 | #define DEQUEUE_SLEEP 0x01 |
1165 | #define DEQUEUE_SAVE 0x02 | ||
1170 | 1166 | ||
1171 | #define RETRY_TASK ((void *)-1UL) | 1167 | #define RETRY_TASK ((void *)-1UL) |
1172 | 1168 | ||
@@ -1194,7 +1190,7 @@ struct sched_class { | |||
1194 | 1190 | ||
1195 | #ifdef CONFIG_SMP | 1191 | #ifdef CONFIG_SMP |
1196 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1192 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
1197 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1193 | void (*migrate_task_rq)(struct task_struct *p); |
1198 | 1194 | ||
1199 | void (*task_waking) (struct task_struct *task); | 1195 | void (*task_waking) (struct task_struct *task); |
1200 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1196 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
@@ -1227,7 +1223,7 @@ struct sched_class { | |||
1227 | void (*update_curr) (struct rq *rq); | 1223 | void (*update_curr) (struct rq *rq); |
1228 | 1224 | ||
1229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1225 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1230 | void (*task_move_group) (struct task_struct *p, int on_rq); | 1226 | void (*task_move_group) (struct task_struct *p); |
1231 | #endif | 1227 | #endif |
1232 | }; | 1228 | }; |
1233 | 1229 | ||
@@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
1405 | } | 1401 | } |
1406 | #endif | 1402 | #endif |
1407 | 1403 | ||
1404 | #ifndef arch_scale_cpu_capacity | ||
1405 | static __always_inline | ||
1406 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
1407 | { | ||
1408 | if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | ||
1409 | return sd->smt_gain / sd->span_weight; | ||
1410 | |||
1411 | return SCHED_CAPACITY_SCALE; | ||
1412 | } | ||
1413 | #endif | ||
1414 | |||
1408 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1415 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1409 | { | 1416 | { |
1410 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); | 1417 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
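
Editor's note: the weak arch hook plus the ARCH_CAPACITY feature bit are replaced by an arch-overridable static inline; the default still derates SMT siblings by smt_gain / span_weight. A standalone sketch of that default, using the scheduler's conventional smt_gain tuning of 1178 (an assumption here, not something stated in this diff):

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024

    struct sched_domain_sketch {
        int shares_cpucapacity;    /* stands in for the SD_SHARE_CPUCAPACITY flag */
        unsigned int span_weight;  /* number of hardware threads in the domain    */
        unsigned int smt_gain;     /* combined throughput of all siblings         */
    };

    static unsigned long arch_scale_cpu_capacity(const struct sched_domain_sketch *sd)
    {
        if (sd && sd->shares_cpucapacity && sd->span_weight > 1)
            return sd->smt_gain / sd->span_weight;   /* per-sibling share */

        return SCHED_CAPACITY_SCALE;                 /* full-capacity CPU */
    }

    int main(void)
    {
        struct sched_domain_sketch smt2 = { 1, 2, 1178 };

        printf("SMT sibling capacity: %lu\n", arch_scale_cpu_capacity(&smt2)); /* 589  */
        printf("non-SMT capacity:     %lu\n", arch_scale_cpu_capacity(NULL));  /* 1024 */
        return 0;
    }
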
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index a818cbc73e14..d264f59bff56 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp | |||
222 | { | 222 | { |
223 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 223 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
224 | 224 | ||
225 | if (ht->pre_unpark) | 225 | if (!ht->selfparking) |
226 | ht->pre_unpark(cpu); | 226 | kthread_unpark(tsk); |
227 | kthread_unpark(tsk); | ||
228 | } | 227 | } |
229 | 228 | ||
230 | void smpboot_unpark_threads(unsigned int cpu) | 229 | void smpboot_unpark_threads(unsigned int cpu) |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12484e5d5c88..867bc20e1ef1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) | |||
73 | } | 73 | } |
74 | } | 74 | } |
75 | 75 | ||
76 | static void __cpu_stop_queue_work(struct cpu_stopper *stopper, | ||
77 | struct cpu_stop_work *work) | ||
78 | { | ||
79 | list_add_tail(&work->list, &stopper->works); | ||
80 | wake_up_process(stopper->thread); | ||
81 | } | ||
82 | |||
76 | /* queue @work to @stopper. if offline, @work is completed immediately */ | 83 | /* queue @work to @stopper. if offline, @work is completed immediately */ |
77 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) | 84 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) |
78 | { | 85 | { |
79 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 86 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
80 | |||
81 | unsigned long flags; | 87 | unsigned long flags; |
82 | 88 | ||
83 | spin_lock_irqsave(&stopper->lock, flags); | 89 | spin_lock_irqsave(&stopper->lock, flags); |
84 | 90 | if (stopper->enabled) | |
85 | if (stopper->enabled) { | 91 | __cpu_stop_queue_work(stopper, work); |
86 | list_add_tail(&work->list, &stopper->works); | 92 | else |
87 | wake_up_process(stopper->thread); | ||
88 | } else | ||
89 | cpu_stop_signal_done(work->done, false); | 93 | cpu_stop_signal_done(work->done, false); |
90 | |||
91 | spin_unlock_irqrestore(&stopper->lock, flags); | 94 | spin_unlock_irqrestore(&stopper->lock, flags); |
92 | } | 95 | } |
93 | 96 | ||
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data) | |||
213 | return err; | 216 | return err; |
214 | } | 217 | } |
215 | 218 | ||
219 | static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | ||
220 | int cpu2, struct cpu_stop_work *work2) | ||
221 | { | ||
222 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); | ||
223 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); | ||
224 | int err; | ||
225 | |||
226 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
227 | spin_lock_irq(&stopper1->lock); | ||
228 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); | ||
229 | |||
230 | err = -ENOENT; | ||
231 | if (!stopper1->enabled || !stopper2->enabled) | ||
232 | goto unlock; | ||
233 | |||
234 | err = 0; | ||
235 | __cpu_stop_queue_work(stopper1, work1); | ||
236 | __cpu_stop_queue_work(stopper2, work2); | ||
237 | unlock: | ||
238 | spin_unlock(&stopper2->lock); | ||
239 | spin_unlock_irq(&stopper1->lock); | ||
240 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
241 | |||
242 | return err; | ||
243 | } | ||
216 | /** | 244 | /** |
217 | * stop_two_cpus - stops two cpus | 245 | * stop_two_cpus - stops two cpus |
218 | * @cpu1: the cpu to stop | 246 | * @cpu1: the cpu to stop |
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
247 | cpu_stop_init_done(&done, 2); | 275 | cpu_stop_init_done(&done, 2); |
248 | set_state(&msdata, MULTI_STOP_PREPARE); | 276 | set_state(&msdata, MULTI_STOP_PREPARE); |
249 | 277 | ||
250 | /* | 278 | if (cpu1 > cpu2) |
251 | * If we observe both CPUs active we know _cpu_down() cannot yet have | 279 | swap(cpu1, cpu2); |
252 | * queued its stop_machine works and therefore ours will get executed | 280 | if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { |
253 | * first. Or its not either one of our CPUs that's getting unplugged, | ||
254 | * in which case we don't care. | ||
255 | * | ||
256 | * This relies on the stopper workqueues to be FIFO. | ||
257 | */ | ||
258 | if (!cpu_active(cpu1) || !cpu_active(cpu2)) { | ||
259 | preempt_enable(); | 281 | preempt_enable(); |
260 | return -ENOENT; | 282 | return -ENOENT; |
261 | } | 283 | } |
262 | 284 | ||
263 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
264 | cpu_stop_queue_work(cpu1, &work1); | ||
265 | cpu_stop_queue_work(cpu2, &work2); | ||
266 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
267 | |||
268 | preempt_enable(); | 285 | preempt_enable(); |
269 | 286 | ||
270 | wait_for_completion(&done.completion); | 287 | wait_for_completion(&done.completion); |
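With the queueing moved into cpu_stop_queue_two_works(), stop_two_cpus() no longer checks cpu_active(): it orders the two CPUs (for the lock ordering above) and treats a disabled stopper, reported as -ENOENT, as "that CPU is on its way down". A rough caller-side sketch, assuming built-in kernel code; my_pair_fn() and run_on_cpu_pair() are made-up names, not part of this series:

        /* Hypothetical caller sketch (built-in code); names are made up. */
        #include <linux/kernel.h>
        #include <linux/stop_machine.h>

        static int my_pair_fn(void *arg)
        {
                /* runs in stopper context while both CPUs are held, IRQs off */
                return 0;
        }

        static int run_on_cpu_pair(unsigned int cpu1, unsigned int cpu2)
        {
                int ret;

                /*
                 * stop_two_cpus() orders the CPUs itself; -ENOENT means one
                 * of the two stoppers is already disabled because that CPU
                 * is being unplugged, which callers typically treat as a
                 * benign race.
                 */
                ret = stop_two_cpus(cpu1, cpu2, my_pair_fn, NULL);
                if (ret == -ENOENT)
                        pr_debug("cpu %u or %u is going down\n", cpu1, cpu2);
                return ret;
        }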
@@ -452,6 +469,18 @@ repeat: | |||
452 | } | 469 | } |
453 | } | 470 | } |
454 | 471 | ||
472 | void stop_machine_park(int cpu) | ||
473 | { | ||
474 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | ||
475 | /* | ||
476 | * Lockless. cpu_stopper_thread() will take stopper->lock and flush | ||
477 | * the pending works before it parks, until then it is fine to queue | ||
478 | * the new works. | ||
479 | */ | ||
480 | stopper->enabled = false; | ||
481 | kthread_park(stopper->thread); | ||
482 | } | ||
483 | |||
455 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); | 484 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); |
456 | 485 | ||
457 | static void cpu_stop_create(unsigned int cpu) | 486 | static void cpu_stop_create(unsigned int cpu) |
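stop_machine_park() pairs with stop_machine_unpark() further down: disable the stopper first so nothing new is queued, then park the stopper thread via the generic kthread_park()/kthread_unpark() machinery. For readers unfamiliar with that machinery, here is a hypothetical sketch of the parking pattern in isolation (the my_worker_* names are made up; this is not the stopper code itself):

        /* Hypothetical sketch of the kthread park/unpark pattern. */
        #include <linux/delay.h>
        #include <linux/err.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        static struct task_struct *my_worker;

        static int my_worker_fn(void *unused)
        {
                while (!kthread_should_stop()) {
                        if (kthread_should_park()) {
                                /* flush pending state here, then go quiescent */
                                kthread_parkme();
                                continue;
                        }
                        /* ... one unit of work ... */
                        msleep(100);
                }
                return 0;
        }

        static int my_worker_start(void)
        {
                my_worker = kthread_create(my_worker_fn, NULL, "my_worker");
                if (IS_ERR(my_worker))
                        return PTR_ERR(my_worker);
                wake_up_process(my_worker);
                return 0;
        }

        static void my_worker_pause(void)
        {
                kthread_park(my_worker);        /* returns once the thread is parked */
        }

        static void my_worker_resume(void)
        {
                kthread_unpark(my_worker);
        }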
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu) | |||
462 | static void cpu_stop_park(unsigned int cpu) | 491 | static void cpu_stop_park(unsigned int cpu) |
463 | { | 492 | { |
464 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 493 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
465 | struct cpu_stop_work *work, *tmp; | ||
466 | unsigned long flags; | ||
467 | 494 | ||
468 | /* drain remaining works */ | 495 | WARN_ON(!list_empty(&stopper->works)); |
469 | spin_lock_irqsave(&stopper->lock, flags); | ||
470 | list_for_each_entry_safe(work, tmp, &stopper->works, list) { | ||
471 | list_del_init(&work->list); | ||
472 | cpu_stop_signal_done(work->done, false); | ||
473 | } | ||
474 | stopper->enabled = false; | ||
475 | spin_unlock_irqrestore(&stopper->lock, flags); | ||
476 | } | 496 | } |
477 | 497 | ||
478 | static void cpu_stop_unpark(unsigned int cpu) | 498 | void stop_machine_unpark(int cpu) |
479 | { | 499 | { |
480 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 500 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
481 | 501 | ||
482 | spin_lock_irq(&stopper->lock); | ||
483 | stopper->enabled = true; | 502 | stopper->enabled = true; |
484 | spin_unlock_irq(&stopper->lock); | 503 | kthread_unpark(stopper->thread); |
485 | } | 504 | } |
486 | 505 | ||
487 | static struct smp_hotplug_thread cpu_stop_threads = { | 506 | static struct smp_hotplug_thread cpu_stop_threads = { |
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = { | |||
490 | .thread_fn = cpu_stopper_thread, | 509 | .thread_fn = cpu_stopper_thread, |
491 | .thread_comm = "migration/%u", | 510 | .thread_comm = "migration/%u", |
492 | .create = cpu_stop_create, | 511 | .create = cpu_stop_create, |
493 | .setup = cpu_stop_unpark, | ||
494 | .park = cpu_stop_park, | 512 | .park = cpu_stop_park, |
495 | .pre_unpark = cpu_stop_unpark, | ||
496 | .selfparking = true, | 513 | .selfparking = true, |
497 | }; | 514 | }; |
498 | 515 | ||
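With its .setup and .pre_unpark callbacks dropped, cpu_stop_threads keeps only .create, .park and .selfparking, and enabling/unparking is now done explicitly through stop_machine_unpark(). For contrast, a sketch of an ordinary, non-selfparking smpboot client, with made-up my_* names; the smpboot core parks and unparks such threads across hotplug on its own:

        /* Hypothetical non-selfparking smpboot client; all my_* names are made up. */
        #include <linux/init.h>
        #include <linux/percpu.h>
        #include <linux/sched.h>
        #include <linux/smpboot.h>

        static DEFINE_PER_CPU(struct task_struct *, my_task);
        static DEFINE_PER_CPU(unsigned int, my_pending);

        static int my_thread_should_run(unsigned int cpu)
        {
                return per_cpu(my_pending, cpu) != 0;
        }

        static void my_thread_fn(unsigned int cpu)
        {
                /* invoked with preemption enabled, only after should_run said yes */
                per_cpu(my_pending, cpu) = 0;
        }

        static struct smp_hotplug_thread my_threads = {
                .store                  = &my_task,
                .thread_should_run      = my_thread_should_run,
                .thread_fn              = my_thread_fn,
                .thread_comm            = "mythread/%u",
                /* no .selfparking: smpboot parks/unparks this thread across hotplug */
        };

        static int __init my_threads_init(void)
        {
                return smpboot_register_percpu_thread(&my_threads);
        }
        early_initcall(my_threads_init);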
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void) | |||
508 | } | 525 | } |
509 | 526 | ||
510 | BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); | 527 | BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); |
528 | stop_machine_unpark(raw_smp_processor_id()); | ||
511 | stop_machine_initialized = true; | 529 | stop_machine_initialized = true; |
512 | return 0; | 530 | return 0; |
513 | } | 531 | } |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b0623ac785a2..00611e95a8ee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5697,7 +5697,7 @@ free: | |||
5697 | } | 5697 | } |
5698 | 5698 | ||
5699 | static void | 5699 | static void |
5700 | ftrace_graph_probe_sched_switch(void *ignore, | 5700 | ftrace_graph_probe_sched_switch(void *ignore, bool preempt, |
5701 | struct task_struct *prev, struct task_struct *next) | 5701 | struct task_struct *prev, struct task_struct *next) |
5702 | { | 5702 | { |
5703 | unsigned long long timestamp; | 5703 | unsigned long long timestamp; |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index f270088e9929..4c896a0101bd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int sched_ref; | |||
16 | static DEFINE_MUTEX(sched_register_mutex); | 16 | static DEFINE_MUTEX(sched_register_mutex); |
17 | 17 | ||
18 | static void | 18 | static void |
19 | probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) | 19 | probe_sched_switch(void *ignore, bool preempt, |
20 | struct task_struct *prev, struct task_struct *next) | ||
20 | { | 21 | { |
21 | if (unlikely(!sched_ref)) | 22 | if (unlikely(!sched_ref)) |
22 | return; | 23 | return; |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 12cbe77b4136..4bcfbac289ff 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
420 | } | 420 | } |
421 | 421 | ||
422 | static void notrace | 422 | static void notrace |
423 | probe_wakeup_sched_switch(void *ignore, | 423 | probe_wakeup_sched_switch(void *ignore, bool preempt, |
424 | struct task_struct *prev, struct task_struct *next) | 424 | struct task_struct *prev, struct task_struct *next) |
425 | { | 425 | { |
426 | struct trace_array_cpu *data; | 426 | struct trace_array_cpu *data; |
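These three probes change together because the sched_switch tracepoint now carries an explicit bool preempt, instead of tracers inferring preemption from prev->state and the PREEMPT_ACTIVE bit that this series removes. A sketch of a probe written against the new prototype, assuming built-in code and made-up my_* names:

        /* Hypothetical built-in probe against the new sched_switch prototype. */
        #include <linux/percpu.h>
        #include <linux/sched.h>
        #include <trace/events/sched.h>

        static DEFINE_PER_CPU(unsigned long, my_involuntary_switches);

        static void my_probe_sched_switch(void *ignore, bool preempt,
                                          struct task_struct *prev,
                                          struct task_struct *next)
        {
                /* @preempt: @prev was preempted rather than having blocked */
                if (preempt)
                        this_cpu_inc(my_involuntary_switches);
        }

        static int my_probe_register(void)
        {
                return register_trace_sched_switch(my_probe_sched_switch, NULL);
        }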