author     Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 21:03:50 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 21:03:50 -0500
commit     53528695ff6d8b77011bc818407c13e30914a946 (patch)
tree       04acd099c5759bf6f1d728c5415f574d572c6872
parent     b831ef2cad979912850e34f82415c0c5d59de8cb (diff)
parent     e73e85f0593832aa583b252f9a16cf90ed6d30fa (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle were:

   - sched/fair load tracking fixes and cleanups (Byungchul Park)
   - Make load tracking frequency scale invariant (Dietmar Eggemann)
   - sched/deadline updates (Juri Lelli)
   - stop machine fixes, cleanups and enhancements for bugs triggered by
     CPU hotplug stress testing (Oleg Nesterov)
   - scheduler preemption code rework: remove PREEMPT_ACTIVE and related
     cleanups (Peter Zijlstra)
   - Rework the sched_info::run_delay code to fix races (Peter Zijlstra)
   - Optimize per entity utilization tracking (Peter Zijlstra)
   - ... misc other fixes, cleanups and smaller updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
  sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
  sched: Start stopper early
  stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
  stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
  stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
  stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
  stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
  sched/x86: Fix typo in __switch_to() comments
  sched/core: Remove a parameter in the migrate_task_rq() function
  sched/core: Drop unlikely behind BUG_ON()
  sched/core: Fix task and run queue sched_info::run_delay inconsistencies
  sched/numa: Fix task_tick_fair() from disabling numa_balancing
  sched/core: Add preempt_count invariant check
  sched/core: More notrace annotations
  sched/core: Kill PREEMPT_ACTIVE
  sched/core, sched/x86: Kill thread_info::saved_preempt_count
  sched/core: Simplify preempt_count tests
  sched/core: Robustify preemption leak checks
  sched/core: Stop setting PREEMPT_ACTIVE
  ...
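For the "frequency scale invariant" load tracking item above, the kernel/sched/fair.c hunks further down add a cap_scale() helper and scale every accumulated PELT delta by the current frequency (and CPU) capacity before it is summed. A minimal standalone sketch of that arithmetic follows; the shift value and the macro body come from the diff, while the surrounding program and the sample capacity numbers are illustrative assumptions, not kernel code.

#include <stdio.h>
#include <stdint.h>

/* Shift and macro as in the kernel/sched/fair.c hunk below; everything else is illustrative. */
#define SCHED_CAPACITY_SHIFT    10
#define cap_scale(v, s)         ((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
        uint64_t delta = 1024;          /* one full PELT period worth of time */
        unsigned long full_cap = 1024;  /* CPU running at maximum frequency */
        unsigned long half_cap = 512;   /* CPU running at half of maximum frequency */

        /* At full capacity the period is accrued unchanged (prints 1024)... */
        printf("full: %llu\n", (unsigned long long)cap_scale(delta, full_cap));

        /* ...at half capacity it only contributes half as much load (prints 512). */
        printf("half: %llu\n", (unsigned long long)cap_scale(delta, half_cap));
        return 0;
}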
-rw-r--r--  arch/x86/include/asm/preempt.h      |   5
-rw-r--r--  arch/x86/include/asm/thread_info.h  |   2
-rw-r--r--  arch/x86/kernel/process_32.c        |   8
-rw-r--r--  arch/x86/kernel/process_64.c        |  10
-rw-r--r--  include/asm-generic/preempt.h       |   2
-rw-r--r--  include/linux/preempt.h             |  20
-rw-r--r--  include/linux/sched.h               |  36
-rw-r--r--  include/linux/sched/deadline.h      |   5
-rw-r--r--  include/linux/smpboot.h             |   4
-rw-r--r--  include/linux/stop_machine.h        |   2
-rw-r--r--  include/trace/events/sched.h        |  22
-rw-r--r--  kernel/cpu.c                        |  10
-rw-r--r--  kernel/exit.c                       |   4
-rw-r--r--  kernel/locking/rtmutex.c            |   3
-rw-r--r--  kernel/sched/core.c                 | 203
-rw-r--r--  kernel/sched/cpudeadline.c          |   5
-rw-r--r--  kernel/sched/cpudeadline.h          |   1
-rw-r--r--  kernel/sched/fair.c                 | 419
-rw-r--r--  kernel/sched/features.h             |  21
-rw-r--r--  kernel/sched/rt.c                   |  22
-rw-r--r--  kernel/sched/sched.h                |  55
-rw-r--r--  kernel/smpboot.c                    |   5
-rw-r--r--  kernel/stop_machine.c               |  90
-rw-r--r--  kernel/trace/ftrace.c               |   2
-rw-r--r--  kernel/trace/trace_sched_switch.c   |   3
-rw-r--r--  kernel/trace/trace_sched_wakeup.c   |   2
26 files changed, 492 insertions, 469 deletions
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index b12f81022a6b..01bcde84d3e4 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
30/* 30/*
31 * must be macros to avoid header recursion hell 31 * must be macros to avoid header recursion hell
32 */ 32 */
33#define init_task_preempt_count(p) do { \ 33#define init_task_preempt_count(p) do { } while (0)
34 task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
35} while (0)
36 34
37#define init_idle_preempt_count(p, cpu) do { \ 35#define init_idle_preempt_count(p, cpu) do { \
38 task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
39 per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ 36 per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
40} while (0) 37} while (0)
41 38
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..809877e9030b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
57 __u32 flags; /* low level flags */ 57 __u32 flags; /* low level flags */
58 __u32 status; /* thread synchronous flags */ 58 __u32 status; /* thread synchronous flags */
59 __u32 cpu; /* current CPU */ 59 __u32 cpu; /* current CPU */
60 int saved_preempt_count;
61 mm_segment_t addr_limit; 60 mm_segment_t addr_limit;
62 void __user *sysenter_return; 61 void __user *sysenter_return;
63 unsigned int sig_on_uaccess_error:1; 62 unsigned int sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
69 .task = &tsk, \ 68 .task = &tsk, \
70 .flags = 0, \ 69 .flags = 0, \
71 .cpu = 0, \ 70 .cpu = 0, \
72 .saved_preempt_count = INIT_PREEMPT_COUNT, \
73 .addr_limit = KERNEL_DS, \ 71 .addr_limit = KERNEL_DS, \
74} 72}
75 73
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 737527b40e5b..9f950917528b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -280,14 +280,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
280 set_iopl_mask(next->iopl); 280 set_iopl_mask(next->iopl);
281 281
282 /* 282 /*
283 * If it were not for PREEMPT_ACTIVE we could guarantee that the
284 * preempt_count of all tasks was equal here and this would not be
285 * needed.
286 */
287 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
288 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
289
290 /*
291 * Now maybe handle debug registers and/or IO bitmaps 283 * Now maybe handle debug registers and/or IO bitmaps
292 */ 284 */
293 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || 285 if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b35921a670b2..e835d263a33b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
332 /* 332 /*
333 * Switch FS and GS. 333 * Switch FS and GS.
334 * 334 *
335 * These are even more complicated than FS and GS: they have 335 * These are even more complicated than DS and ES: they have
336 * 64-bit bases are that controlled by arch_prctl. Those bases 336 * 64-bit bases are that controlled by arch_prctl. Those bases
337 * only differ from the values in the GDT or LDT if the selector 337 * only differ from the values in the GDT or LDT if the selector
338 * is 0. 338 * is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
401 */ 401 */
402 this_cpu_write(current_task, next_p); 402 this_cpu_write(current_task, next_p);
403 403
404 /*
405 * If it were not for PREEMPT_ACTIVE we could guarantee that the
406 * preempt_count of all tasks was equal here and this would not be
407 * needed.
408 */
409 task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
410 this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
411
412 /* Reload esp0 and ss1. This changes current_thread_info(). */ 404 /* Reload esp0 and ss1. This changes current_thread_info(). */
413 load_sp0(tss, next); 405 load_sp0(tss, next);
414 406
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 0bec580a4885..5d8ffa3e6f8c 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
24 * must be macros to avoid header recursion hell 24 * must be macros to avoid header recursion hell
25 */ 25 */
26#define init_task_preempt_count(p) do { \ 26#define init_task_preempt_count(p) do { \
27 task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ 27 task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
28} while (0) 28} while (0)
29 29
30#define init_idle_preempt_count(p, cpu) do { \ 30#define init_idle_preempt_count(p, cpu) do { \
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index bea8dd8ff5e0..75e4e30677f1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -26,7 +26,6 @@
26 * SOFTIRQ_MASK: 0x0000ff00 26 * SOFTIRQ_MASK: 0x0000ff00
27 * HARDIRQ_MASK: 0x000f0000 27 * HARDIRQ_MASK: 0x000f0000
28 * NMI_MASK: 0x00100000 28 * NMI_MASK: 0x00100000
29 * PREEMPT_ACTIVE: 0x00200000
30 * PREEMPT_NEED_RESCHED: 0x80000000 29 * PREEMPT_NEED_RESCHED: 0x80000000
31 */ 30 */
32#define PREEMPT_BITS 8 31#define PREEMPT_BITS 8
@@ -53,10 +52,6 @@
53 52
54#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) 53#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
55 54
56#define PREEMPT_ACTIVE_BITS 1
57#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
58#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
59
60/* We use the MSB mostly because its available */ 55/* We use the MSB mostly because its available */
61#define PREEMPT_NEED_RESCHED 0x80000000 56#define PREEMPT_NEED_RESCHED 0x80000000
62 57
@@ -126,8 +121,7 @@
126 * Check whether we were atomic before we did preempt_disable(): 121 * Check whether we were atomic before we did preempt_disable():
127 * (used by the scheduler) 122 * (used by the scheduler)
128 */ 123 */
129#define in_atomic_preempt_off() \ 124#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
130 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
131 125
132#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 126#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
133extern void preempt_count_add(int val); 127extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
146#define preempt_count_inc() preempt_count_add(1) 140#define preempt_count_inc() preempt_count_add(1)
147#define preempt_count_dec() preempt_count_sub(1) 141#define preempt_count_dec() preempt_count_sub(1)
148 142
149#define preempt_active_enter() \
150do { \
151 preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
152 barrier(); \
153} while (0)
154
155#define preempt_active_exit() \
156do { \
157 barrier(); \
158 preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
159} while (0)
160
161#ifdef CONFIG_PREEMPT_COUNT 143#ifdef CONFIG_PREEMPT_COUNT
162 144
163#define preempt_disable() \ 145#define preempt_disable() \
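With PREEMPT_ACTIVE gone, preempt_count() is left with just the preemption, softirq, hardirq and NMI fields plus the PREEMPT_NEED_RESCHED bit, which is what lets in_atomic_preempt_off() above collapse to a plain equality test. The following standalone sketch walks the remaining layout for one preempt_disable() plus one local_bh_disable(); the field sizes match the header comment above, but the scenario and the program itself are illustrative, and the x86-only folding of PREEMPT_NEED_RESCHED into the count is ignored.

#include <stdio.h>

/* Field layout per the include/linux/preempt.h comment above (PREEMPT_ACTIVE removed). */
#define PREEMPT_SHIFT           0
#define PREEMPT_BITS            8
#define SOFTIRQ_SHIFT           (PREEMPT_SHIFT + PREEMPT_BITS)

#define PREEMPT_OFFSET          (1UL << PREEMPT_SHIFT)  /* 0x001 */
#define SOFTIRQ_OFFSET          (1UL << SOFTIRQ_SHIFT)  /* 0x100 */
#define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)    /* 0x200 */

int main(void)
{
        unsigned long count = 0;

        count += PREEMPT_OFFSET;                /* preempt_disable()  */
        count += SOFTIRQ_DISABLE_OFFSET;        /* local_bh_disable() */

        /* Prints 0x201; the former PREEMPT_ACTIVE bit (0x00200000) is simply never set. */
        printf("preempt_count = %#lx\n", count);
        return 0;
}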
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56667292d1e4..9e1e06c3ce05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
599 .sum_exec_runtime = ATOMIC64_INIT(0), \ 599 .sum_exec_runtime = ATOMIC64_INIT(0), \
600 } 600 }
601 601
602#ifdef CONFIG_PREEMPT_COUNT 602#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
603#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) 603
604#else 604/*
605#define PREEMPT_DISABLED PREEMPT_ENABLED 605 * Disable preemption until the scheduler is running -- use an unconditional
606#endif 606 * value so that it also works on !PREEMPT_COUNT kernels.
607 *
608 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
609 */
610#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
607 611
608/* 612/*
609 * Disable preemption until the scheduler is running. 613 * Initial preempt_count value; reflects the preempt_count schedule invariant
610 * Reset by start_kernel()->sched_init()->init_idle(). 614 * which states that during context switches:
611 * 615 *
612 * We include PREEMPT_ACTIVE to avoid cond_resched() from working 616 * preempt_count() == 2*PREEMPT_DISABLE_OFFSET
613 * before the scheduler is active -- see should_resched(). 617 *
618 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
619 * Note: See finish_task_switch().
614 */ 620 */
615#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) 621#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
616 622
617/** 623/**
618 * struct thread_group_cputimer - thread group interval timer counts 624 * struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
1142#endif 1148#endif
1143}; 1149};
1144 1150
1145extern struct sched_domain_topology_level *sched_domain_topology;
1146
1147extern void set_sched_topology(struct sched_domain_topology_level *tl); 1151extern void set_sched_topology(struct sched_domain_topology_level *tl);
1148extern void wake_up_if_idle(int cpu); 1152extern void wake_up_if_idle(int cpu);
1149 1153
@@ -1192,10 +1196,10 @@ struct load_weight {
1192 1196
1193/* 1197/*
1194 * The load_avg/util_avg accumulates an infinite geometric series. 1198 * The load_avg/util_avg accumulates an infinite geometric series.
1195 * 1) load_avg factors the amount of time that a sched_entity is 1199 * 1) load_avg factors frequency scaling into the amount of time that a
1196 * runnable on a rq into its weight. For cfs_rq, it is the aggregated 1200 * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
1197 * such weights of all runnable and blocked sched_entities. 1201 * aggregated such weights of all runnable and blocked sched_entities.
1198 * 2) util_avg factors frequency scaling into the amount of time 1202 * 2) util_avg factors frequency and cpu scaling into the amount of time
1199 * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. 1203 * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
1200 * For cfs_rq, it is the aggregated such times of all runnable and 1204 * For cfs_rq, it is the aggregated such times of all runnable and
1201 * blocked sched_entities. 1205 * blocked sched_entities.
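The new FORK_PREEMPT_COUNT above encodes the context-switch invariant spelled out in the kernel/sched/core.c hunks below, where finish_task_switch() now warns if preempt_count() != 2*PREEMPT_DISABLE_OFFSET. Here is a small worked sketch of the arithmetic, assuming a CONFIG_PREEMPT_COUNT kernel (PREEMPT_DISABLE_OFFSET == 1) and the generic PREEMPT_ENABLED == 0; x86's folding of PREEMPT_NEED_RESCHED is ignored, and the program is illustrative rather than kernel code.

#include <assert.h>
#include <stdio.h>

/* Assumed values for the sketch: CONFIG_PREEMPT_COUNT, generic PREEMPT_ENABLED. */
#define PREEMPT_ENABLED         0
#define PREEMPT_DISABLE_OFFSET  1
#define FORK_PREEMPT_COUNT      (2 * PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

int main(void)
{
        int count = PREEMPT_ENABLED;

        count += PREEMPT_DISABLE_OFFSET;  /* schedule():   preempt_disable()            -> 1 */
        count += PREEMPT_DISABLE_OFFSET;  /* __schedule(): raw_spin_lock_irq(&rq->lock) -> 2 */

        /* This is exactly what finish_task_switch() now checks for. */
        assert(count == 2 * PREEMPT_DISABLE_OFFSET);

        /* A freshly forked task starts at the same count, so the check also holds for it. */
        assert(count == FORK_PREEMPT_COUNT);

        printf("preempt_count across a context switch = %d\n", count);
        return 0;
}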
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9d303b8847df..9089a2ae913d 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
21 return dl_prio(p->prio); 21 return dl_prio(p->prio);
22} 22}
23 23
24static inline bool dl_time_before(u64 a, u64 b)
25{
26 return (s64)(a - b) < 0;
27}
28
24#endif /* _SCHED_DEADLINE_H */ 29#endif /* _SCHED_DEADLINE_H */
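The dl_time_before() helper above is only moved here (out of kernel/sched/cpudeadline.c, see below) so that rtmutex can reuse it, but it is worth seeing why the signed-difference form is used instead of a plain '<': it keeps ordering deadlines correctly even when the 64-bit clock wraps. A standalone sketch with made-up values near the wrap point:

#include <assert.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t s64;

/* Same body as the helper added to include/linux/sched/deadline.h above. */
static inline int dl_time_before(u64 a, u64 b)
{
        return (s64)(a - b) < 0;
}

int main(void)
{
        u64 before_wrap = UINT64_MAX - 10;  /* deadline just before the clock wraps */
        u64 after_wrap  = 5;                /* deadline just after the wrap */

        /* A plain '<' claims the later (wrapped) deadline comes first... */
        assert(after_wrap < before_wrap);

        /* ...while the signed difference still orders the two correctly. */
        assert(dl_time_before(before_wrap, after_wrap));
        assert(!dl_time_before(after_wrap, before_wrap));
        return 0;
}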
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index e6109a6cd8f6..12910cf19869 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
24 * parked (cpu offline) 24 * parked (cpu offline)
25 * @unpark: Optional unpark function, called when the thread is 25 * @unpark: Optional unpark function, called when the thread is
26 * unparked (cpu online) 26 * unparked (cpu online)
27 * @pre_unpark: Optional unpark function, called before the thread is
28 * unparked (cpu online). This is not guaranteed to be
29 * called on the target cpu of the thread. Careful!
30 * @cpumask: Internal state. To update which threads are unparked, 27 * @cpumask: Internal state. To update which threads are unparked,
31 * call smpboot_update_cpumask_percpu_thread(). 28 * call smpboot_update_cpumask_percpu_thread().
32 * @selfparking: Thread is not parked by the park function. 29 * @selfparking: Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
42 void (*cleanup)(unsigned int cpu, bool online); 39 void (*cleanup)(unsigned int cpu, bool online);
43 void (*park)(unsigned int cpu); 40 void (*park)(unsigned int cpu);
44 void (*unpark)(unsigned int cpu); 41 void (*unpark)(unsigned int cpu);
45 void (*pre_unpark)(unsigned int cpu);
46 cpumask_var_t cpumask; 42 cpumask_var_t cpumask;
47 bool selfparking; 43 bool selfparking;
48 const char *thread_comm; 44 const char *thread_comm;
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 414d924318ce..0adedca24c5b 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
33 struct cpu_stop_work *work_buf); 33 struct cpu_stop_work *work_buf);
34int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); 34int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
35int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); 35int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
36void stop_machine_park(int cpu);
37void stop_machine_unpark(int cpu);
36 38
37#else /* CONFIG_SMP */ 39#else /* CONFIG_SMP */
38 40
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 539d6bc3216a..9b90c57517a9 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
104 TP_ARGS(p)); 104 TP_ARGS(p));
105 105
106#ifdef CREATE_TRACE_POINTS 106#ifdef CREATE_TRACE_POINTS
107static inline long __trace_sched_switch_state(struct task_struct *p) 107static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
108{ 108{
109 long state = p->state;
110
111#ifdef CONFIG_PREEMPT
112#ifdef CONFIG_SCHED_DEBUG 109#ifdef CONFIG_SCHED_DEBUG
113 BUG_ON(p != current); 110 BUG_ON(p != current);
114#endif /* CONFIG_SCHED_DEBUG */ 111#endif /* CONFIG_SCHED_DEBUG */
112
115 /* 113 /*
116 * For all intents and purposes a preempted task is a running task. 114 * Preemption ignores task state, therefore preempted tasks are always
115 * RUNNING (we will not have dequeued if state != RUNNING).
117 */ 116 */
118 if (preempt_count() & PREEMPT_ACTIVE) 117 return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
119 state = TASK_RUNNING | TASK_STATE_MAX;
120#endif /* CONFIG_PREEMPT */
121
122 return state;
123} 118}
124#endif /* CREATE_TRACE_POINTS */ 119#endif /* CREATE_TRACE_POINTS */
125 120
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
128 */ 123 */
129TRACE_EVENT(sched_switch, 124TRACE_EVENT(sched_switch,
130 125
131 TP_PROTO(struct task_struct *prev, 126 TP_PROTO(bool preempt,
127 struct task_struct *prev,
132 struct task_struct *next), 128 struct task_struct *next),
133 129
134 TP_ARGS(prev, next), 130 TP_ARGS(preempt, prev, next),
135 131
136 TP_STRUCT__entry( 132 TP_STRUCT__entry(
137 __array( char, prev_comm, TASK_COMM_LEN ) 133 __array( char, prev_comm, TASK_COMM_LEN )
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
147 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); 143 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
148 __entry->prev_pid = prev->pid; 144 __entry->prev_pid = prev->pid;
149 __entry->prev_prio = prev->prio; 145 __entry->prev_prio = prev->prio;
150 __entry->prev_state = __trace_sched_switch_state(prev); 146 __entry->prev_state = __trace_sched_switch_state(preempt, prev);
151 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); 147 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
152 __entry->next_pid = next->pid; 148 __entry->next_pid = next->pid;
153 __entry->next_prio = next->prio; 149 __entry->next_prio = next->prio;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14a9cdf8abe9..85ff5e26e23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
291{ 291{
292 struct task_struct *g, *p; 292 struct task_struct *g, *p;
293 293
294 read_lock_irq(&tasklist_lock); 294 read_lock(&tasklist_lock);
295 do_each_thread(g, p) { 295 for_each_process_thread(g, p) {
296 if (!p->on_rq) 296 if (!p->on_rq)
297 continue; 297 continue;
298 /* 298 /*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
307 307
308 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", 308 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
309 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); 309 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
310 } while_each_thread(g, p); 310 }
311 read_unlock_irq(&tasklist_lock); 311 read_unlock(&tasklist_lock);
312} 312}
313 313
314struct take_cpu_down_param { 314struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
331 /* Give up timekeeping duties */ 331 /* Give up timekeeping duties */
332 tick_handover_do_timer(); 332 tick_handover_do_timer();
333 /* Park the stopper thread */ 333 /* Park the stopper thread */
334 kthread_park(current); 334 stop_machine_park((long)param->hcpu);
335 return 0; 335 return 0;
336} 336}
337 337
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e93b63bbc59..07110c6020a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
706 smp_mb(); 706 smp_mb();
707 raw_spin_unlock_wait(&tsk->pi_lock); 707 raw_spin_unlock_wait(&tsk->pi_lock);
708 708
709 if (unlikely(in_atomic())) 709 if (unlikely(in_atomic())) {
710 pr_info("note: %s[%d] exited with preempt_count %d\n", 710 pr_info("note: %s[%d] exited with preempt_count %d\n",
711 current->comm, task_pid_nr(current), 711 current->comm, task_pid_nr(current),
712 preempt_count()); 712 preempt_count());
713 preempt_count_set(PREEMPT_ENABLED);
714 }
713 715
714 /* sync mm's RSS info before statistics gathering */ 716 /* sync mm's RSS info before statistics gathering */
715 if (tsk->mm) 717 if (tsk->mm)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index bbb72b4f64a1..8251e75dd9c0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
170 * then right waiter has a dl_prio() too. 170 * then right waiter has a dl_prio() too.
171 */ 171 */
172 if (dl_prio(left->prio)) 172 if (dl_prio(left->prio))
173 return (left->task->dl.deadline < right->task->dl.deadline); 173 return dl_time_before(left->task->dl.deadline,
174 right->task->dl.deadline);
174 175
175 return 0; 176 return 0;
176} 177}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f7402f7eb448..aa5973220ad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
817 /* 817 /*
818 * SCHED_IDLE tasks get minimal weight: 818 * SCHED_IDLE tasks get minimal weight:
819 */ 819 */
820 if (p->policy == SCHED_IDLE) { 820 if (idle_policy(p->policy)) {
821 load->weight = scale_load(WEIGHT_IDLEPRIO); 821 load->weight = scale_load(WEIGHT_IDLEPRIO);
822 load->inv_weight = WMULT_IDLEPRIO; 822 load->inv_weight = WMULT_IDLEPRIO;
823 return; 823 return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
827 load->inv_weight = prio_to_wmult[prio]; 827 load->inv_weight = prio_to_wmult[prio];
828} 828}
829 829
830static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 830static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
831{ 831{
832 update_rq_clock(rq); 832 update_rq_clock(rq);
833 sched_info_queued(rq, p); 833 if (!(flags & ENQUEUE_RESTORE))
834 sched_info_queued(rq, p);
834 p->sched_class->enqueue_task(rq, p, flags); 835 p->sched_class->enqueue_task(rq, p, flags);
835} 836}
836 837
837static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 838static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
838{ 839{
839 update_rq_clock(rq); 840 update_rq_clock(rq);
840 sched_info_dequeued(rq, p); 841 if (!(flags & DEQUEUE_SAVE))
842 sched_info_dequeued(rq, p);
841 p->sched_class->dequeue_task(rq, p, flags); 843 p->sched_class->dequeue_task(rq, p, flags);
842} 844}
843 845
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1178 * holding rq->lock. 1180 * holding rq->lock.
1179 */ 1181 */
1180 lockdep_assert_held(&rq->lock); 1182 lockdep_assert_held(&rq->lock);
1181 dequeue_task(rq, p, 0); 1183 dequeue_task(rq, p, DEQUEUE_SAVE);
1182 } 1184 }
1183 if (running) 1185 if (running)
1184 put_prev_task(rq, p); 1186 put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1188 if (running) 1190 if (running)
1189 p->sched_class->set_curr_task(rq); 1191 p->sched_class->set_curr_task(rq);
1190 if (queued) 1192 if (queued)
1191 enqueue_task(rq, p, 0); 1193 enqueue_task(rq, p, ENQUEUE_RESTORE);
1192} 1194}
1193 1195
1194/* 1196/*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1292 1294
1293 if (task_cpu(p) != new_cpu) { 1295 if (task_cpu(p) != new_cpu) {
1294 if (p->sched_class->migrate_task_rq) 1296 if (p->sched_class->migrate_task_rq)
1295 p->sched_class->migrate_task_rq(p, new_cpu); 1297 p->sched_class->migrate_task_rq(p);
1296 p->se.nr_migrations++; 1298 p->se.nr_migrations++;
1297 perf_event_task_migrate(p); 1299 perf_event_task_migrate(p);
1298 } 1300 }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
1333 struct rq *src_rq, *dst_rq; 1335 struct rq *src_rq, *dst_rq;
1334 int ret = -EAGAIN; 1336 int ret = -EAGAIN;
1335 1337
1338 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1339 return -EAGAIN;
1340
1336 src_rq = cpu_rq(arg->src_cpu); 1341 src_rq = cpu_rq(arg->src_cpu);
1337 dst_rq = cpu_rq(arg->dst_cpu); 1342 dst_rq = cpu_rq(arg->dst_cpu);
1338 1343
1339 double_raw_lock(&arg->src_task->pi_lock, 1344 double_raw_lock(&arg->src_task->pi_lock,
1340 &arg->dst_task->pi_lock); 1345 &arg->dst_task->pi_lock);
1341 double_rq_lock(src_rq, dst_rq); 1346 double_rq_lock(src_rq, dst_rq);
1347
1342 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1348 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1343 goto unlock; 1349 goto unlock;
1344 1350
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1574 goto out; 1580 goto out;
1575 } 1581 }
1576 1582
1583 /* No more Mr. Nice Guy. */
1577 switch (state) { 1584 switch (state) {
1578 case cpuset: 1585 case cpuset:
1579 /* No more Mr. Nice Guy. */ 1586 if (IS_ENABLED(CONFIG_CPUSETS)) {
1580 cpuset_cpus_allowed_fallback(p); 1587 cpuset_cpus_allowed_fallback(p);
1581 state = possible; 1588 state = possible;
1582 break; 1589 break;
1583 1590 }
1591 /* fall-through */
1584 case possible: 1592 case possible:
1585 do_set_cpus_allowed(p, cpu_possible_mask); 1593 do_set_cpus_allowed(p, cpu_possible_mask);
1586 state = fail; 1594 state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1692#endif /* CONFIG_SCHEDSTATS */ 1700#endif /* CONFIG_SCHEDSTATS */
1693} 1701}
1694 1702
1695static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1703static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1696{ 1704{
1697 activate_task(rq, p, en_flags); 1705 activate_task(rq, p, en_flags);
1698 p->on_rq = TASK_ON_RQ_QUEUED; 1706 p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2114#endif /* CONFIG_NUMA_BALANCING */ 2122#endif /* CONFIG_NUMA_BALANCING */
2115} 2123}
2116 2124
2125DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2126
2117#ifdef CONFIG_NUMA_BALANCING 2127#ifdef CONFIG_NUMA_BALANCING
2118#ifdef CONFIG_SCHED_DEBUG 2128
2119void set_numabalancing_state(bool enabled) 2129void set_numabalancing_state(bool enabled)
2120{ 2130{
2121 if (enabled) 2131 if (enabled)
2122 sched_feat_set("NUMA"); 2132 static_branch_enable(&sched_numa_balancing);
2123 else 2133 else
2124 sched_feat_set("NO_NUMA"); 2134 static_branch_disable(&sched_numa_balancing);
2125} 2135}
2126#else
2127__read_mostly bool numabalancing_enabled;
2128
2129void set_numabalancing_state(bool enabled)
2130{
2131 numabalancing_enabled = enabled;
2132}
2133#endif /* CONFIG_SCHED_DEBUG */
2134 2136
2135#ifdef CONFIG_PROC_SYSCTL 2137#ifdef CONFIG_PROC_SYSCTL
2136int sysctl_numa_balancing(struct ctl_table *table, int write, 2138int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
2138{ 2140{
2139 struct ctl_table t; 2141 struct ctl_table t;
2140 int err; 2142 int err;
2141 int state = numabalancing_enabled; 2143 int state = static_branch_likely(&sched_numa_balancing);
2142 2144
2143 if (write && !capable(CAP_SYS_ADMIN)) 2145 if (write && !capable(CAP_SYS_ADMIN))
2144 return -EPERM; 2146 return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
2349 struct rq *rq; 2351 struct rq *rq;
2350 2352
2351 raw_spin_lock_irqsave(&p->pi_lock, flags); 2353 raw_spin_lock_irqsave(&p->pi_lock, flags);
2354 /* Initialize new task's runnable average */
2355 init_entity_runnable_average(&p->se);
2352#ifdef CONFIG_SMP 2356#ifdef CONFIG_SMP
2353 /* 2357 /*
2354 * Fork balancing, do it here and not earlier because: 2358 * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
2358 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2362 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2359#endif 2363#endif
2360 2364
2361 /* Initialize new task's runnable average */
2362 init_entity_runnable_average(&p->se);
2363 rq = __task_rq_lock(p); 2365 rq = __task_rq_lock(p);
2364 activate_task(rq, p, 0); 2366 activate_task(rq, p, 0);
2365 p->on_rq = TASK_ON_RQ_QUEUED; 2367 p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
2483prepare_task_switch(struct rq *rq, struct task_struct *prev, 2485prepare_task_switch(struct rq *rq, struct task_struct *prev,
2484 struct task_struct *next) 2486 struct task_struct *next)
2485{ 2487{
2486 trace_sched_switch(prev, next);
2487 sched_info_switch(rq, prev, next); 2488 sched_info_switch(rq, prev, next);
2488 perf_event_task_sched_out(prev, next); 2489 perf_event_task_sched_out(prev, next);
2489 fire_sched_out_preempt_notifiers(prev, next); 2490 fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2517 struct mm_struct *mm = rq->prev_mm; 2518 struct mm_struct *mm = rq->prev_mm;
2518 long prev_state; 2519 long prev_state;
2519 2520
2521 /*
2522 * The previous task will have left us with a preempt_count of 2
2523 * because it left us after:
2524 *
2525 * schedule()
2526 * preempt_disable(); // 1
2527 * __schedule()
2528 * raw_spin_lock_irq(&rq->lock) // 2
2529 *
2530 * Also, see FORK_PREEMPT_COUNT.
2531 */
2532 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2533 "corrupted preempt_count: %s/%d/0x%x\n",
2534 current->comm, current->pid, preempt_count()))
2535 preempt_count_set(FORK_PREEMPT_COUNT);
2536
2520 rq->prev_mm = NULL; 2537 rq->prev_mm = NULL;
2521 2538
2522 /* 2539 /*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2601{ 2618{
2602 struct rq *rq; 2619 struct rq *rq;
2603 2620
2604 /* finish_task_switch() drops rq->lock and enables preemtion */ 2621 /*
2605 preempt_disable(); 2622 * New tasks start with FORK_PREEMPT_COUNT, see there and
2623 * finish_task_switch() for details.
2624 *
2625 * finish_task_switch() will drop rq->lock() and lower preempt_count
2626 * and the preempt_enable() will end up enabling preemption (on
2627 * PREEMPT_COUNT kernels).
2628 */
2629
2606 rq = finish_task_switch(prev); 2630 rq = finish_task_switch(prev);
2607 balance_callback(rq); 2631 balance_callback(rq);
2608 preempt_enable(); 2632 preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2960static inline void schedule_debug(struct task_struct *prev) 2984static inline void schedule_debug(struct task_struct *prev)
2961{ 2985{
2962#ifdef CONFIG_SCHED_STACK_END_CHECK 2986#ifdef CONFIG_SCHED_STACK_END_CHECK
2963 BUG_ON(unlikely(task_stack_end_corrupted(prev))); 2987 BUG_ON(task_stack_end_corrupted(prev));
2964#endif 2988#endif
2965 /* 2989
2966 * Test if we are atomic. Since do_exit() needs to call into 2990 if (unlikely(in_atomic_preempt_off())) {
2967 * schedule() atomically, we ignore that path. Otherwise whine
2968 * if we are scheduling when we should not.
2969 */
2970 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2971 __schedule_bug(prev); 2991 __schedule_bug(prev);
2992 preempt_count_set(PREEMPT_DISABLED);
2993 }
2972 rcu_sleep_check(); 2994 rcu_sleep_check();
2973 2995
2974 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2996 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
3054 * 3076 *
3055 * WARNING: must be called with preemption disabled! 3077 * WARNING: must be called with preemption disabled!
3056 */ 3078 */
3057static void __sched __schedule(void) 3079static void __sched notrace __schedule(bool preempt)
3058{ 3080{
3059 struct task_struct *prev, *next; 3081 struct task_struct *prev, *next;
3060 unsigned long *switch_count; 3082 unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
3066 rcu_note_context_switch(); 3088 rcu_note_context_switch();
3067 prev = rq->curr; 3089 prev = rq->curr;
3068 3090
3091 /*
3092 * do_exit() calls schedule() with preemption disabled as an exception;
3093 * however we must fix that up, otherwise the next task will see an
3094 * inconsistent (higher) preempt count.
3095 *
3096 * It also avoids the below schedule_debug() test from complaining
3097 * about this.
3098 */
3099 if (unlikely(prev->state == TASK_DEAD))
3100 preempt_enable_no_resched_notrace();
3101
3069 schedule_debug(prev); 3102 schedule_debug(prev);
3070 3103
3071 if (sched_feat(HRTICK)) 3104 if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
3083 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3116 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
3084 3117
3085 switch_count = &prev->nivcsw; 3118 switch_count = &prev->nivcsw;
3086 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3119 if (!preempt && prev->state) {
3087 if (unlikely(signal_pending_state(prev->state, prev))) { 3120 if (unlikely(signal_pending_state(prev->state, prev))) {
3088 prev->state = TASK_RUNNING; 3121 prev->state = TASK_RUNNING;
3089 } else { 3122 } else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
3119 rq->curr = next; 3152 rq->curr = next;
3120 ++*switch_count; 3153 ++*switch_count;
3121 3154
3155 trace_sched_switch(preempt, prev, next);
3122 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3156 rq = context_switch(rq, prev, next); /* unlocks the rq */
3123 cpu = cpu_of(rq); 3157 cpu = cpu_of(rq);
3124 } else { 3158 } else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
3148 sched_submit_work(tsk); 3182 sched_submit_work(tsk);
3149 do { 3183 do {
3150 preempt_disable(); 3184 preempt_disable();
3151 __schedule(); 3185 __schedule(false);
3152 sched_preempt_enable_no_resched(); 3186 sched_preempt_enable_no_resched();
3153 } while (need_resched()); 3187 } while (need_resched());
3154} 3188}
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
3188static void __sched notrace preempt_schedule_common(void) 3222static void __sched notrace preempt_schedule_common(void)
3189{ 3223{
3190 do { 3224 do {
3191 preempt_active_enter(); 3225 preempt_disable_notrace();
3192 __schedule(); 3226 __schedule(true);
3193 preempt_active_exit(); 3227 preempt_enable_no_resched_notrace();
3194 3228
3195 /* 3229 /*
3196 * Check again in case we missed a preemption opportunity 3230 * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3241 return; 3275 return;
3242 3276
3243 do { 3277 do {
3244 /* 3278 preempt_disable_notrace();
3245 * Use raw __prempt_count() ops that don't call function.
3246 * We can't call functions before disabling preemption which
3247 * disarm preemption tracing recursions.
3248 */
3249 __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
3250 barrier();
3251 /* 3279 /*
3252 * Needs preempt disabled in case user_exit() is traced 3280 * Needs preempt disabled in case user_exit() is traced
3253 * and the tracer calls preempt_enable_notrace() causing 3281 * and the tracer calls preempt_enable_notrace() causing
3254 * an infinite recursion. 3282 * an infinite recursion.
3255 */ 3283 */
3256 prev_ctx = exception_enter(); 3284 prev_ctx = exception_enter();
3257 __schedule(); 3285 __schedule(true);
3258 exception_exit(prev_ctx); 3286 exception_exit(prev_ctx);
3259 3287
3260 barrier(); 3288 preempt_enable_no_resched_notrace();
3261 __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
3262 } while (need_resched()); 3289 } while (need_resched());
3263} 3290}
3264EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3291EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
3281 prev_state = exception_enter(); 3308 prev_state = exception_enter();
3282 3309
3283 do { 3310 do {
3284 preempt_active_enter(); 3311 preempt_disable();
3285 local_irq_enable(); 3312 local_irq_enable();
3286 __schedule(); 3313 __schedule(true);
3287 local_irq_disable(); 3314 local_irq_disable();
3288 preempt_active_exit(); 3315 sched_preempt_enable_no_resched();
3289 } while (need_resched()); 3316 } while (need_resched());
3290 3317
3291 exception_exit(prev_state); 3318 exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
3313 */ 3340 */
3314void rt_mutex_setprio(struct task_struct *p, int prio) 3341void rt_mutex_setprio(struct task_struct *p, int prio)
3315{ 3342{
3316 int oldprio, queued, running, enqueue_flag = 0; 3343 int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
3317 struct rq *rq; 3344 struct rq *rq;
3318 const struct sched_class *prev_class; 3345 const struct sched_class *prev_class;
3319 3346
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3345 queued = task_on_rq_queued(p); 3372 queued = task_on_rq_queued(p);
3346 running = task_current(rq, p); 3373 running = task_current(rq, p);
3347 if (queued) 3374 if (queued)
3348 dequeue_task(rq, p, 0); 3375 dequeue_task(rq, p, DEQUEUE_SAVE);
3349 if (running) 3376 if (running)
3350 put_prev_task(rq, p); 3377 put_prev_task(rq, p);
3351 3378
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3363 if (!dl_prio(p->normal_prio) || 3390 if (!dl_prio(p->normal_prio) ||
3364 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3391 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3365 p->dl.dl_boosted = 1; 3392 p->dl.dl_boosted = 1;
3366 enqueue_flag = ENQUEUE_REPLENISH; 3393 enqueue_flag |= ENQUEUE_REPLENISH;
3367 } else 3394 } else
3368 p->dl.dl_boosted = 0; 3395 p->dl.dl_boosted = 0;
3369 p->sched_class = &dl_sched_class; 3396 p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3371 if (dl_prio(oldprio)) 3398 if (dl_prio(oldprio))
3372 p->dl.dl_boosted = 0; 3399 p->dl.dl_boosted = 0;
3373 if (oldprio < prio) 3400 if (oldprio < prio)
3374 enqueue_flag = ENQUEUE_HEAD; 3401 enqueue_flag |= ENQUEUE_HEAD;
3375 p->sched_class = &rt_sched_class; 3402 p->sched_class = &rt_sched_class;
3376 } else { 3403 } else {
3377 if (dl_prio(oldprio)) 3404 if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
3423 } 3450 }
3424 queued = task_on_rq_queued(p); 3451 queued = task_on_rq_queued(p);
3425 if (queued) 3452 if (queued)
3426 dequeue_task(rq, p, 0); 3453 dequeue_task(rq, p, DEQUEUE_SAVE);
3427 3454
3428 p->static_prio = NICE_TO_PRIO(nice); 3455 p->static_prio = NICE_TO_PRIO(nice);
3429 set_load_weight(p); 3456 set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
3432 delta = p->prio - old_prio; 3459 delta = p->prio - old_prio;
3433 3460
3434 if (queued) { 3461 if (queued) {
3435 enqueue_task(rq, p, 0); 3462 enqueue_task(rq, p, ENQUEUE_RESTORE);
3436 /* 3463 /*
3437 * If the task increased its priority or is running and 3464 * If the task increased its priority or is running and
3438 * lowered its priority, then reschedule its CPU: 3465 * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
3753 } else { 3780 } else {
3754 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3781 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3755 3782
3756 if (policy != SCHED_DEADLINE && 3783 if (!valid_policy(policy))
3757 policy != SCHED_FIFO && policy != SCHED_RR &&
3758 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3759 policy != SCHED_IDLE)
3760 return -EINVAL; 3784 return -EINVAL;
3761 } 3785 }
3762 3786
@@ -3812,7 +3836,7 @@ recheck:
3812 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3836 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3813 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3837 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3814 */ 3838 */
3815 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3839 if (idle_policy(p->policy) && !idle_policy(policy)) {
3816 if (!can_nice(p, task_nice(p))) 3840 if (!can_nice(p, task_nice(p)))
3817 return -EPERM; 3841 return -EPERM;
3818 } 3842 }
@@ -3937,7 +3961,7 @@ change:
3937 queued = task_on_rq_queued(p); 3961 queued = task_on_rq_queued(p);
3938 running = task_current(rq, p); 3962 running = task_current(rq, p);
3939 if (queued) 3963 if (queued)
3940 dequeue_task(rq, p, 0); 3964 dequeue_task(rq, p, DEQUEUE_SAVE);
3941 if (running) 3965 if (running)
3942 put_prev_task(rq, p); 3966 put_prev_task(rq, p);
3943 3967
@@ -3947,11 +3971,15 @@ change:
3947 if (running) 3971 if (running)
3948 p->sched_class->set_curr_task(rq); 3972 p->sched_class->set_curr_task(rq);
3949 if (queued) { 3973 if (queued) {
3974 int enqueue_flags = ENQUEUE_RESTORE;
3950 /* 3975 /*
3951 * We enqueue to tail when the priority of a task is 3976 * We enqueue to tail when the priority of a task is
3952 * increased (user space view). 3977 * increased (user space view).
3953 */ 3978 */
3954 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3979 if (oldprio <= p->prio)
3980 enqueue_flags |= ENQUEUE_HEAD;
3981
3982 enqueue_task(rq, p, enqueue_flags);
3955 } 3983 }
3956 3984
3957 check_class_changed(rq, p, prev_class, oldprio); 3985 check_class_changed(rq, p, prev_class, oldprio);
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5101 running = task_current(rq, p); 5129 running = task_current(rq, p);
5102 5130
5103 if (queued) 5131 if (queued)
5104 dequeue_task(rq, p, 0); 5132 dequeue_task(rq, p, DEQUEUE_SAVE);
5105 if (running) 5133 if (running)
5106 put_prev_task(rq, p); 5134 put_prev_task(rq, p);
5107 5135
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5110 if (running) 5138 if (running)
5111 p->sched_class->set_curr_task(rq); 5139 p->sched_class->set_curr_task(rq);
5112 if (queued) 5140 if (queued)
5113 enqueue_task(rq, p, 0); 5141 enqueue_task(rq, p, ENQUEUE_RESTORE);
5114 task_rq_unlock(rq, p, &flags); 5142 task_rq_unlock(rq, p, &flags);
5115} 5143}
5116#endif /* CONFIG_NUMA_BALANCING */ 5144#endif /* CONFIG_NUMA_BALANCING */
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
5531static int sched_cpu_active(struct notifier_block *nfb, 5559static int sched_cpu_active(struct notifier_block *nfb,
5532 unsigned long action, void *hcpu) 5560 unsigned long action, void *hcpu)
5533{ 5561{
5562 int cpu = (long)hcpu;
5563
5534 switch (action & ~CPU_TASKS_FROZEN) { 5564 switch (action & ~CPU_TASKS_FROZEN) {
5535 case CPU_STARTING: 5565 case CPU_STARTING:
5536 set_cpu_rq_start_time(); 5566 set_cpu_rq_start_time();
5537 return NOTIFY_OK; 5567 return NOTIFY_OK;
5568
5538 case CPU_ONLINE: 5569 case CPU_ONLINE:
5539 /* 5570 /*
5540 * At this point a starting CPU has marked itself as online via 5571 * At this point a starting CPU has marked itself as online via
5541 * set_cpu_online(). But it might not yet have marked itself 5572 * set_cpu_online(). But it might not yet have marked itself
5542 * as active, which is essential from here on. 5573 * as active, which is essential from here on.
5543 *
5544 * Thus, fall-through and help the starting CPU along.
5545 */ 5574 */
5575 set_cpu_active(cpu, true);
5576 stop_machine_unpark(cpu);
5577 return NOTIFY_OK;
5578
5546 case CPU_DOWN_FAILED: 5579 case CPU_DOWN_FAILED:
5547 set_cpu_active((long)hcpu, true); 5580 set_cpu_active(cpu, true);
5548 return NOTIFY_OK; 5581 return NOTIFY_OK;
5582
5549 default: 5583 default:
5550 return NOTIFY_DONE; 5584 return NOTIFY_DONE;
5551 } 5585 }
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
6477 { NULL, }, 6511 { NULL, },
6478}; 6512};
6479 6513
6480struct sched_domain_topology_level *sched_domain_topology = default_topology; 6514static struct sched_domain_topology_level *sched_domain_topology =
6515 default_topology;
6481 6516
6482#define for_each_sd_topology(tl) \ 6517#define for_each_sd_topology(tl) \
6483 for (tl = sched_domain_topology; tl->mask; tl++) 6518 for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7478,7 +7513,7 @@ void __init sched_init(void)
7478#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7513#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7479static inline int preempt_count_equals(int preempt_offset) 7514static inline int preempt_count_equals(int preempt_offset)
7480{ 7515{
7481 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7516 int nested = preempt_count() + rcu_preempt_depth();
7482 7517
7483 return (nested == preempt_offset); 7518 return (nested == preempt_offset);
7484} 7519}
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
7725 queued = task_on_rq_queued(tsk); 7760 queued = task_on_rq_queued(tsk);
7726 7761
7727 if (queued) 7762 if (queued)
7728 dequeue_task(rq, tsk, 0); 7763 dequeue_task(rq, tsk, DEQUEUE_SAVE);
7729 if (unlikely(running)) 7764 if (unlikely(running))
7730 put_prev_task(rq, tsk); 7765 put_prev_task(rq, tsk);
7731 7766
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
7741 7776
7742#ifdef CONFIG_FAIR_GROUP_SCHED 7777#ifdef CONFIG_FAIR_GROUP_SCHED
7743 if (tsk->sched_class->task_move_group) 7778 if (tsk->sched_class->task_move_group)
7744 tsk->sched_class->task_move_group(tsk, queued); 7779 tsk->sched_class->task_move_group(tsk);
7745 else 7780 else
7746#endif 7781#endif
7747 set_task_rq(tsk, task_cpu(tsk)); 7782 set_task_rq(tsk, task_cpu(tsk));
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
7749 if (unlikely(running)) 7784 if (unlikely(running))
7750 tsk->sched_class->set_curr_task(rq); 7785 tsk->sched_class->set_curr_task(rq);
7751 if (queued) 7786 if (queued)
7752 enqueue_task(rq, tsk, 0); 7787 enqueue_task(rq, tsk, ENQUEUE_RESTORE);
7753 7788
7754 task_rq_unlock(rq, tsk, &flags); 7789 task_rq_unlock(rq, tsk, &flags);
7755} 7790}
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8213 struct cgroup_subsys_state *old_css, 8248 struct cgroup_subsys_state *old_css,
8214 struct task_struct *task) 8249 struct task_struct *task)
8215{ 8250{
8216 /*
8217 * cgroup_exit() is called in the copy_process() failure path.
8218 * Ignore this case since the task hasn't ran yet, this avoids
8219 * trying to poke a half freed task state from generic code.
8220 */
8221 if (!(task->flags & PF_EXITING))
8222 return;
8223
8224 sched_move_task(task); 8251 sched_move_task(task);
8225} 8252}
8226 8253
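Several of the core.c call sites above switch from dequeue_task(rq, p, 0)/enqueue_task(rq, p, 0) to DEQUEUE_SAVE/ENQUEUE_RESTORE, so that a temporary "take it off, change an attribute, put it back" cycle no longer feeds sched_info_dequeued()/sched_info_queued() and therefore no longer skews sched_info::run_delay. Below is a minimal standalone model of that pattern; the flag values, the stub helpers and the printouts are all illustrative, and only the flag test mirrors the enqueue_task()/dequeue_task() hunk above.

#include <stdio.h>

/* Illustrative flag values; the real definitions live in kernel/sched/sched.h. */
#define ENQUEUE_HEAD            0x01
#define ENQUEUE_REPLENISH       0x02
#define ENQUEUE_RESTORE         0x04
#define DEQUEUE_SAVE            0x04

static void sched_info_dequeued(void)  { puts("  sched_info_dequeued()"); }
static void sched_info_queued(void)    { puts("  sched_info_queued()"); }

/* Mirrors the flag test added to dequeue_task()/enqueue_task() above. */
static void dequeue_task(int flags)
{
        if (!(flags & DEQUEUE_SAVE))     /* a temporary dequeue is not a real departure... */
                sched_info_dequeued();
}

static void enqueue_task(int flags)
{
        if (!(flags & ENQUEUE_RESTORE))  /* ...and restoring it is not a fresh arrival */
                sched_info_queued();
}

int main(void)
{
        puts("ordinary dequeue/enqueue (accounting runs):");
        dequeue_task(0);
        enqueue_task(0);

        puts("attribute change, e.g. rt_mutex_setprio() or set_user_nice() (accounting skipped):");
        dequeue_task(DEQUEUE_SAVE);
        enqueue_task(ENQUEUE_RESTORE | ENQUEUE_HEAD);  /* flags now compose instead of overwrite */
        return 0;
}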
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index c6acb07466bb..5a75b08cfd85 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
31 return (i << 1) + 2; 31 return (i << 1) + 2;
32} 32}
33 33
34static inline int dl_time_before(u64 a, u64 b)
35{
36 return (s64)(a - b) < 0;
37}
38
39static void cpudl_exchange(struct cpudl *cp, int a, int b) 34static void cpudl_exchange(struct cpudl *cp, int a, int b)
40{ 35{
41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 36 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 1a0a6ef2fbe1..fcbdf83fed7e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
2#define _LINUX_CPUDL_H 2#define _LINUX_CPUDL_H
3 3
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/sched/deadline.h>
5 6
6#define IDX_INVALID -1 7#define IDX_INVALID -1
7 8
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60fe721a..824aa9f501a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p);
661 661
662/* 662/*
663 * We choose a half-life close to 1 scheduling period. 663 * We choose a half-life close to 1 scheduling period.
664 * Note: The tables below are dependent on this value. 664 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
665 * dependent on this value.
665 */ 666 */
666#define LOAD_AVG_PERIOD 32 667#define LOAD_AVG_PERIOD 32
667#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ 668#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
668#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ 669#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
669 670
670/* Give new sched_entity start runnable values to heavy its load in infant time */ 671/* Give new sched_entity start runnable values to heavy its load in infant time */
671void init_entity_runnable_average(struct sched_entity *se) 672void init_entity_runnable_average(struct sched_entity *se)
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
682 sa->load_avg = scale_load_down(se->load.weight); 683 sa->load_avg = scale_load_down(se->load.weight);
683 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
684 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); 685 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
685 sa->util_sum = LOAD_AVG_MAX; 686 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
686 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 687 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
687} 688}
688 689
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2069 int local = !!(flags & TNF_FAULT_LOCAL); 2070 int local = !!(flags & TNF_FAULT_LOCAL);
2070 int priv; 2071 int priv;
2071 2072
2072 if (!numabalancing_enabled) 2073 if (!static_branch_likely(&sched_numa_balancing))
2073 return; 2074 return;
2074 2075
2075 /* for example, ksmd faulting in a user's mm */ 2076 /* for example, ksmd faulting in a user's mm */
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
2157 struct vm_area_struct *vma; 2158 struct vm_area_struct *vma;
2158 unsigned long start, end; 2159 unsigned long start, end;
2159 unsigned long nr_pte_updates = 0; 2160 unsigned long nr_pte_updates = 0;
2160 long pages; 2161 long pages, virtpages;
2161 2162
2162 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2163 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2163 2164
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
2203 start = mm->numa_scan_offset; 2204 start = mm->numa_scan_offset;
2204 pages = sysctl_numa_balancing_scan_size; 2205 pages = sysctl_numa_balancing_scan_size;
2205 pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 2206 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2207 virtpages = pages * 8; /* Scan up to this much virtual space */
2206 if (!pages) 2208 if (!pages)
2207 return; 2209 return;
2208 2210
2211
2209 down_read(&mm->mmap_sem); 2212 down_read(&mm->mmap_sem);
2210 vma = find_vma(mm, start); 2213 vma = find_vma(mm, start);
2211 if (!vma) { 2214 if (!vma) {
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
2240 start = max(start, vma->vm_start); 2243 start = max(start, vma->vm_start);
2241 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 2244 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2242 end = min(end, vma->vm_end); 2245 end = min(end, vma->vm_end);
2243 nr_pte_updates += change_prot_numa(vma, start, end); 2246 nr_pte_updates = change_prot_numa(vma, start, end);
2244 2247
2245 /* 2248 /*
2246 * Scan sysctl_numa_balancing_scan_size but ensure that 2249 * Try to scan sysctl_numa_balancing_size worth of
2247 * at least one PTE is updated so that unused virtual 2250 * hpages that have at least one present PTE that
2248 * address space is quickly skipped. 2251 * is not already pte-numa. If the VMA contains
2252 * areas that are unused or already full of prot_numa
2253 * PTEs, scan up to virtpages, to skip through those
2254 * areas faster.
2249 */ 2255 */
2250 if (nr_pte_updates) 2256 if (nr_pte_updates)
2251 pages -= (end - start) >> PAGE_SHIFT; 2257 pages -= (end - start) >> PAGE_SHIFT;
2258 virtpages -= (end - start) >> PAGE_SHIFT;
2252 2259
2253 start = end; 2260 start = end;
2254 if (pages <= 0) 2261 if (pages <= 0 || virtpages <= 0)
2255 goto out; 2262 goto out;
2256 2263
2257 cond_resched(); 2264 cond_resched();
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
2515 return contrib + runnable_avg_yN_sum[n]; 2522 return contrib + runnable_avg_yN_sum[n];
2516} 2523}
2517 2524
2525#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2526#error "load tracking assumes 2^10 as unit"
2527#endif
2528
2529#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2530
2518/* 2531/*
2519 * We can represent the historical contribution to runnable average as the 2532 * We can represent the historical contribution to runnable average as the
2520 * coefficients of a geometric series. To do this we sub-divide our runnable 2533 * coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2547,10 +2560,10 @@ static __always_inline int
2547__update_load_avg(u64 now, int cpu, struct sched_avg *sa, 2560__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2548 unsigned long weight, int running, struct cfs_rq *cfs_rq) 2561 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2549{ 2562{
2550 u64 delta, periods; 2563 u64 delta, scaled_delta, periods;
2551 u32 contrib; 2564 u32 contrib;
2552 int delta_w, decayed = 0; 2565 unsigned int delta_w, scaled_delta_w, decayed = 0;
2553 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); 2566 unsigned long scale_freq, scale_cpu;
2554 2567
2555 delta = now - sa->last_update_time; 2568 delta = now - sa->last_update_time;
2556 /* 2569 /*
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2571 return 0; 2584 return 0;
2572 sa->last_update_time = now; 2585 sa->last_update_time = now;
2573 2586
2587 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2588 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2589
2574 /* delta_w is the amount already accumulated against our next period */ 2590 /* delta_w is the amount already accumulated against our next period */
2575 delta_w = sa->period_contrib; 2591 delta_w = sa->period_contrib;
2576 if (delta + delta_w >= 1024) { 2592 if (delta + delta_w >= 1024) {
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2585 * period and accrue it. 2601 * period and accrue it.
2586 */ 2602 */
2587 delta_w = 1024 - delta_w; 2603 delta_w = 1024 - delta_w;
2604 scaled_delta_w = cap_scale(delta_w, scale_freq);
2588 if (weight) { 2605 if (weight) {
2589 sa->load_sum += weight * delta_w; 2606 sa->load_sum += weight * scaled_delta_w;
2590 if (cfs_rq) 2607 if (cfs_rq) {
2591 cfs_rq->runnable_load_sum += weight * delta_w; 2608 cfs_rq->runnable_load_sum +=
2609 weight * scaled_delta_w;
2610 }
2592 } 2611 }
2593 if (running) 2612 if (running)
2594 sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; 2613 sa->util_sum += scaled_delta_w * scale_cpu;
2595 2614
2596 delta -= delta_w; 2615 delta -= delta_w;
2597 2616
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2608 2627
2609 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2628 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2610 contrib = __compute_runnable_contrib(periods); 2629 contrib = __compute_runnable_contrib(periods);
2630 contrib = cap_scale(contrib, scale_freq);
2611 if (weight) { 2631 if (weight) {
2612 sa->load_sum += weight * contrib; 2632 sa->load_sum += weight * contrib;
2613 if (cfs_rq) 2633 if (cfs_rq)
2614 cfs_rq->runnable_load_sum += weight * contrib; 2634 cfs_rq->runnable_load_sum += weight * contrib;
2615 } 2635 }
2616 if (running) 2636 if (running)
2617 sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; 2637 sa->util_sum += contrib * scale_cpu;
2618 } 2638 }
2619 2639
2620 /* Remainder of delta accrued against u_0` */ 2640 /* Remainder of delta accrued against u_0` */
2641 scaled_delta = cap_scale(delta, scale_freq);
2621 if (weight) { 2642 if (weight) {
2622 sa->load_sum += weight * delta; 2643 sa->load_sum += weight * scaled_delta;
2623 if (cfs_rq) 2644 if (cfs_rq)
2624 cfs_rq->runnable_load_sum += weight * delta; 2645 cfs_rq->runnable_load_sum += weight * scaled_delta;
2625 } 2646 }
2626 if (running) 2647 if (running)
2627 sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; 2648 sa->util_sum += scaled_delta * scale_cpu;
2628 2649
2629 sa->period_contrib += delta; 2650 sa->period_contrib += delta;
2630 2651
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2634 cfs_rq->runnable_load_avg = 2655 cfs_rq->runnable_load_avg =
2635 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); 2656 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2636 } 2657 }
2637 sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; 2658 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2638 } 2659 }
2639 2660
2640 return decayed; 2661 return decayed;
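A toy model of one accumulation step, not taken from the patch: the hunks above first make the raw time delta frequency invariant with cap_scale(delta, scale_freq), which feeds load_sum, and additionally scale the utilization side by the CPU capacity (scale_cpu). Because util_sum now already carries the capacity factor, util_avg becomes a plain util_sum / LOAD_AVG_MAX. All values below are illustrative.

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10
#define cap_scale(v, s)		((v)*(s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t delta = 1024;		/* elapsed time in the PELT unit */
	unsigned long weight = 1024;	/* task load weight */
	unsigned long scale_freq = 512;	/* running at half of the maximum frequency */
	unsigned long scale_cpu = 800;	/* smaller core, for example */

	uint64_t scaled_delta = cap_scale(delta, scale_freq);	/* 512: frequency invariant */
	uint64_t load_contrib = weight * scaled_delta;		/* what load_sum accrues */
	uint64_t util_contrib = scaled_delta * scale_cpu;	/* what util_sum accrues */

	printf("load += %llu, util += %llu\n",
	       (unsigned long long)load_contrib,
	       (unsigned long long)util_contrib);
	return 0;
}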
@@ -2677,8 +2698,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2677 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 2698 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2678 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); 2699 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2679 sa->util_avg = max_t(long, sa->util_avg - r, 0); 2700 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2680 sa->util_sum = max_t(s32, sa->util_sum - 2701 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2681 ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
2682 } 2702 }
2683 2703
2684 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 2704 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2696,33 +2716,70 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2696static inline void update_load_avg(struct sched_entity *se, int update_tg) 2716static inline void update_load_avg(struct sched_entity *se, int update_tg)
2697{ 2717{
2698 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2718 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2699 int cpu = cpu_of(rq_of(cfs_rq));
2700 u64 now = cfs_rq_clock_task(cfs_rq); 2719 u64 now = cfs_rq_clock_task(cfs_rq);
2720 int cpu = cpu_of(rq_of(cfs_rq));
2701 2721
2702 /* 2722 /*
2703 * Track task load average for carrying it to new CPU after migrated, and 2723 * Track task load average for carrying it to new CPU after migrated, and
2704 * track group sched_entity load average for task_h_load calc in migration 2724 * track group sched_entity load average for task_h_load calc in migration
2705 */ 2725 */
2706 __update_load_avg(now, cpu, &se->avg, 2726 __update_load_avg(now, cpu, &se->avg,
2707 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); 2727 se->on_rq * scale_load_down(se->load.weight),
2728 cfs_rq->curr == se, NULL);
2708 2729
2709 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) 2730 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2710 update_tg_load_avg(cfs_rq, 0); 2731 update_tg_load_avg(cfs_rq, 0);
2711} 2732}
2712 2733
2734static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2735{
2736 if (!sched_feat(ATTACH_AGE_LOAD))
2737 goto skip_aging;
2738
2739 /*
2740 * If we got migrated (either between CPUs or between cgroups) we'll
2741 * have aged the average right before clearing @last_update_time.
2742 */
2743 if (se->avg.last_update_time) {
2744 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2745 &se->avg, 0, 0, NULL);
2746
2747 /*
2748 * XXX: we could have just aged the entire load away if we've been
2749 * absent from the fair class for too long.
2750 */
2751 }
2752
2753skip_aging:
2754 se->avg.last_update_time = cfs_rq->avg.last_update_time;
2755 cfs_rq->avg.load_avg += se->avg.load_avg;
2756 cfs_rq->avg.load_sum += se->avg.load_sum;
2757 cfs_rq->avg.util_avg += se->avg.util_avg;
2758 cfs_rq->avg.util_sum += se->avg.util_sum;
2759}
2760
2761static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2764 &se->avg, se->on_rq * scale_load_down(se->load.weight),
2765 cfs_rq->curr == se, NULL);
2766
2767 cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2768 cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2769 cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2770 cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2771}
2772
2713/* Add the load generated by se into cfs_rq's load average */ 2773/* Add the load generated by se into cfs_rq's load average */
2714static inline void 2774static inline void
2715enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2775enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2716{ 2776{
2717 struct sched_avg *sa = &se->avg; 2777 struct sched_avg *sa = &se->avg;
2718 u64 now = cfs_rq_clock_task(cfs_rq); 2778 u64 now = cfs_rq_clock_task(cfs_rq);
2719 int migrated = 0, decayed; 2779 int migrated, decayed;
2720 2780
2721 if (sa->last_update_time == 0) { 2781 migrated = !sa->last_update_time;
2722 sa->last_update_time = now; 2782 if (!migrated) {
2723 migrated = 1;
2724 }
2725 else {
2726 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 2783 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2727 se->on_rq * scale_load_down(se->load.weight), 2784 se->on_rq * scale_load_down(se->load.weight),
2728 cfs_rq->curr == se, NULL); 2785 cfs_rq->curr == se, NULL);
@@ -2733,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2733 cfs_rq->runnable_load_avg += sa->load_avg; 2790 cfs_rq->runnable_load_avg += sa->load_avg;
2734 cfs_rq->runnable_load_sum += sa->load_sum; 2791 cfs_rq->runnable_load_sum += sa->load_sum;
2735 2792
2736 if (migrated) { 2793 if (migrated)
2737 cfs_rq->avg.load_avg += sa->load_avg; 2794 attach_entity_load_avg(cfs_rq, se);
2738 cfs_rq->avg.load_sum += sa->load_sum;
2739 cfs_rq->avg.util_avg += sa->util_avg;
2740 cfs_rq->avg.util_sum += sa->util_sum;
2741 }
2742 2795
2743 if (decayed || migrated) 2796 if (decayed || migrated)
2744 update_tg_load_avg(cfs_rq, 0); 2797 update_tg_load_avg(cfs_rq, 0);
@@ -2753,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2753 cfs_rq->runnable_load_avg = 2806 cfs_rq->runnable_load_avg =
2754 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); 2807 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2755 cfs_rq->runnable_load_sum = 2808 cfs_rq->runnable_load_sum =
2756 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); 2809 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2757} 2810}
2758 2811
2759/* 2812/*
@@ -2821,6 +2874,11 @@ static inline void
2821dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 2874dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2822static inline void remove_entity_load_avg(struct sched_entity *se) {} 2875static inline void remove_entity_load_avg(struct sched_entity *se) {}
2823 2876
2877static inline void
2878attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2879static inline void
2880detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2881
2824static inline int idle_balance(struct rq *rq) 2882static inline int idle_balance(struct rq *rq)
2825{ 2883{
2826 return 0; 2884 return 0;
@@ -4817,32 +4875,39 @@ next:
4817done: 4875done:
4818 return target; 4876 return target;
4819} 4877}
4878
4820/* 4879/*
4821 * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS 4880 * cpu_util returns the amount of capacity of a CPU that is used by CFS
4822 * tasks. The unit of the return value must be the one of capacity so we can 4881 * tasks. The unit of the return value must be the one of capacity so we can
4823 * compare the usage with the capacity of the CPU that is available for CFS 4882 * compare the utilization with the capacity of the CPU that is available for
4824 * task (ie cpu_capacity). 4883 * CFS task (ie cpu_capacity).
4825 * cfs.avg.util_avg is the sum of running time of runnable tasks on a 4884 *
4826 * CPU. It represents the amount of utilization of a CPU in the range 4885 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
4827 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full 4886 * recent utilization of currently non-runnable tasks on a CPU. It represents
4828 * capacity of the CPU because it's about the running time on this CPU. 4887 * the amount of utilization of a CPU in the range [0..capacity_orig] where
4829 * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE 4888 * capacity_orig is the cpu_capacity available at the highest frequency
4830 * because of unfortunate rounding in util_avg or just 4889 * (arch_scale_freq_capacity()).
4831 * after migrating tasks until the average stabilizes with the new running 4890 * The utilization of a CPU converges towards a sum equal to or less than the
4832 * time. So we need to check that the usage stays into the range 4891 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
4833 * [0..cpu_capacity_orig] and cap if necessary. 4892 * the running time on this CPU scaled by capacity_curr.
4834 * Without capping the usage, a group could be seen as overloaded (CPU0 usage 4893 *
4835 * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity 4894 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
4895 * higher than capacity_orig because of unfortunate rounding in
4896 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
4897 * the average stabilizes with the new running time. We need to check that the
4898 * utilization stays within the range of [0..capacity_orig] and cap it if
4899 * necessary. Without utilization capping, a group could be seen as overloaded
4900 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
4901 * available capacity. We allow utilization to overshoot capacity_curr (but not
 4902 * capacity_orig) as it is useful for predicting the capacity required after task
4903 * migrations (scheduler-driven DVFS).
4836 */ 4904 */
4837static int get_cpu_usage(int cpu) 4905static int cpu_util(int cpu)
4838{ 4906{
4839 unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; 4907 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
4840 unsigned long capacity = capacity_orig_of(cpu); 4908 unsigned long capacity = capacity_orig_of(cpu);
4841 4909
4842 if (usage >= SCHED_LOAD_SCALE) 4910 return (util >= capacity) ? capacity : util;
4843 return capacity;
4844
4845 return (usage * capacity) >> SCHED_LOAD_SHIFT;
4846} 4911}
4847 4912
4848/* 4913/*
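A trivial standalone equivalent of the new cpu_util() above (not from the patch, made-up numbers): since util_avg is already expressed in capacity units, the function no longer rescales by SCHED_LOAD_SHIFT and simply clamps the tracked utilization to capacity_orig.

#include <stdio.h>

static unsigned long clamp_util(unsigned long util, unsigned long capacity_orig)
{
	return (util >= capacity_orig) ? capacity_orig : util;
}

int main(void)
{
	/* a transient overshoot right after a migration gets capped... */
	printf("%lu\n", clamp_util(1240, 1024));	/* prints 1024 */
	/* ...while a normal reading passes through unchanged */
	printf("%lu\n", clamp_util(800, 1024));		/* prints 800 */
	return 0;
}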
@@ -4945,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4945 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 5010 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
4946 * other assumptions, including the state of rq->lock, should be made. 5011 * other assumptions, including the state of rq->lock, should be made.
4947 */ 5012 */
4948static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) 5013static void migrate_task_rq_fair(struct task_struct *p)
4949{ 5014{
4950 /* 5015 /*
4951 * We are supposed to update the task to "current" time, then its up to date 5016 * We are supposed to update the task to "current" time, then its up to date
@@ -5525,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5525 unsigned long src_faults, dst_faults; 5590 unsigned long src_faults, dst_faults;
5526 int src_nid, dst_nid; 5591 int src_nid, dst_nid;
5527 5592
5528 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5593 if (!static_branch_likely(&sched_numa_balancing))
5529 return -1; 5594 return -1;
5530 5595
5531 if (!sched_feat(NUMA)) 5596 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5532 return -1; 5597 return -1;
5533 5598
5534 src_nid = cpu_to_node(env->src_cpu); 5599 src_nid = cpu_to_node(env->src_cpu);
@@ -5934,7 +5999,7 @@ struct sg_lb_stats {
5934 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5999 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5935 unsigned long load_per_task; 6000 unsigned long load_per_task;
5936 unsigned long group_capacity; 6001 unsigned long group_capacity;
5937 unsigned long group_usage; /* Total usage of the group */ 6002 unsigned long group_util; /* Total utilization of the group */
5938 unsigned int sum_nr_running; /* Nr tasks running in the group */ 6003 unsigned int sum_nr_running; /* Nr tasks running in the group */
5939 unsigned int idle_cpus; 6004 unsigned int idle_cpus;
5940 unsigned int group_weight; 6005 unsigned int group_weight;
@@ -6010,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
6010 return load_idx; 6075 return load_idx;
6011} 6076}
6012 6077
6013static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6014{
6015 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
6016 return sd->smt_gain / sd->span_weight;
6017
6018 return SCHED_CAPACITY_SCALE;
6019}
6020
6021unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6022{
6023 return default_scale_cpu_capacity(sd, cpu);
6024}
6025
6026static unsigned long scale_rt_capacity(int cpu) 6078static unsigned long scale_rt_capacity(int cpu)
6027{ 6079{
6028 struct rq *rq = cpu_rq(cpu); 6080 struct rq *rq = cpu_rq(cpu);
@@ -6052,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu)
6052 6104
6053static void update_cpu_capacity(struct sched_domain *sd, int cpu) 6105static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6054{ 6106{
6055 unsigned long capacity = SCHED_CAPACITY_SCALE; 6107 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6056 struct sched_group *sdg = sd->groups; 6108 struct sched_group *sdg = sd->groups;
6057 6109
6058 if (sched_feat(ARCH_CAPACITY))
6059 capacity *= arch_scale_cpu_capacity(sd, cpu);
6060 else
6061 capacity *= default_scale_cpu_capacity(sd, cpu);
6062
6063 capacity >>= SCHED_CAPACITY_SHIFT;
6064
6065 cpu_rq(cpu)->cpu_capacity_orig = capacity; 6110 cpu_rq(cpu)->cpu_capacity_orig = capacity;
6066 6111
6067 capacity *= scale_rt_capacity(cpu); 6112 capacity *= scale_rt_capacity(cpu);
@@ -6187,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group)
6187 * group_has_capacity returns true if the group has spare capacity that could 6232 * group_has_capacity returns true if the group has spare capacity that could
6188 * be used by some tasks. 6233 * be used by some tasks.
6189 * We consider that a group has spare capacity if the * number of task is 6234 * We consider that a group has spare capacity if the * number of task is
6190 * smaller than the number of CPUs or if the usage is lower than the available 6235 * smaller than the number of CPUs or if the utilization is lower than the
6191 * capacity for CFS tasks. 6236 * available capacity for CFS tasks.
6192 * For the latter, we use a threshold to stabilize the state, to take into 6237 * For the latter, we use a threshold to stabilize the state, to take into
6193 * account the variance of the tasks' load and to return true if the available 6238 * account the variance of the tasks' load and to return true if the available
6194 * capacity in meaningful for the load balancer. 6239 * capacity in meaningful for the load balancer.
@@ -6202,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6202 return true; 6247 return true;
6203 6248
6204 if ((sgs->group_capacity * 100) > 6249 if ((sgs->group_capacity * 100) >
6205 (sgs->group_usage * env->sd->imbalance_pct)) 6250 (sgs->group_util * env->sd->imbalance_pct))
6206 return true; 6251 return true;
6207 6252
6208 return false; 6253 return false;
@@ -6223,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6223 return false; 6268 return false;
6224 6269
6225 if ((sgs->group_capacity * 100) < 6270 if ((sgs->group_capacity * 100) <
6226 (sgs->group_usage * env->sd->imbalance_pct)) 6271 (sgs->group_util * env->sd->imbalance_pct))
6227 return true; 6272 return true;
6228 6273
6229 return false; 6274 return false;
6230} 6275}
6231 6276
6232static enum group_type group_classify(struct lb_env *env, 6277static inline enum
6233 struct sched_group *group, 6278group_type group_classify(struct sched_group *group,
6234 struct sg_lb_stats *sgs) 6279 struct sg_lb_stats *sgs)
6235{ 6280{
6236 if (sgs->group_no_capacity) 6281 if (sgs->group_no_capacity)
6237 return group_overloaded; 6282 return group_overloaded;
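A worked example of the threshold used by group_has_capacity()/group_is_overloaded() above, with an assumed imbalance_pct of 117 and made-up capacity/utilization values (this sketch is not part of the patch): a group is treated as having spare room while group_util stays below roughly group_capacity * 100 / imbalance_pct.

#include <stdbool.h>
#include <stdio.h>

static bool has_spare_capacity(unsigned long group_capacity,
			       unsigned long group_util,
			       unsigned int imbalance_pct)
{
	return group_capacity * 100 > group_util * imbalance_pct;
}

int main(void)
{
	/* 2048 * 100 / 117 is roughly 1750, so 1700 still leaves headroom... */
	printf("%d\n", has_spare_capacity(2048, 1700, 117));	/* prints 1 */
	/* ...while 1800 pushes the group into the overloaded regime */
	printf("%d\n", has_spare_capacity(2048, 1800, 117));	/* prints 0 */
	return 0;
}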
@@ -6271,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6271 load = source_load(i, load_idx); 6316 load = source_load(i, load_idx);
6272 6317
6273 sgs->group_load += load; 6318 sgs->group_load += load;
6274 sgs->group_usage += get_cpu_usage(i); 6319 sgs->group_util += cpu_util(i);
6275 sgs->sum_nr_running += rq->cfs.h_nr_running; 6320 sgs->sum_nr_running += rq->cfs.h_nr_running;
6276 6321
6277 if (rq->nr_running > 1) 6322 if (rq->nr_running > 1)
@@ -6296,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6296 sgs->group_weight = group->group_weight; 6341 sgs->group_weight = group->group_weight;
6297 6342
6298 sgs->group_no_capacity = group_is_overloaded(env, sgs); 6343 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6299 sgs->group_type = group_classify(env, group, sgs); 6344 sgs->group_type = group_classify(group, sgs);
6300} 6345}
6301 6346
6302/** 6347/**
@@ -6430,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6430 group_has_capacity(env, &sds->local_stat) && 6475 group_has_capacity(env, &sds->local_stat) &&
6431 (sgs->sum_nr_running > 1)) { 6476 (sgs->sum_nr_running > 1)) {
6432 sgs->group_no_capacity = 1; 6477 sgs->group_no_capacity = 1;
6433 sgs->group_type = group_overloaded; 6478 sgs->group_type = group_classify(sg, sgs);
6434 } 6479 }
6435 6480
6436 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6481 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -7610,8 +7655,22 @@ out:
7610 * When the cpu is attached to null domain for ex, it will not be 7655 * When the cpu is attached to null domain for ex, it will not be
7611 * updated. 7656 * updated.
7612 */ 7657 */
7613 if (likely(update_next_balance)) 7658 if (likely(update_next_balance)) {
7614 rq->next_balance = next_balance; 7659 rq->next_balance = next_balance;
7660
7661#ifdef CONFIG_NO_HZ_COMMON
7662 /*
 7663 * If this CPU has been elected to perform the nohz idle
 7664 * balance, other idle CPUs have already rebalanced with
7665 * nohz_idle_balance() and nohz.next_balance has been
7666 * updated accordingly. This CPU is now running the idle load
7667 * balance for itself and we need to update the
7668 * nohz.next_balance accordingly.
7669 */
7670 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
7671 nohz.next_balance = rq->next_balance;
7672#endif
7673 }
7615} 7674}
7616 7675
7617#ifdef CONFIG_NO_HZ_COMMON 7676#ifdef CONFIG_NO_HZ_COMMON
@@ -7624,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7624 int this_cpu = this_rq->cpu; 7683 int this_cpu = this_rq->cpu;
7625 struct rq *rq; 7684 struct rq *rq;
7626 int balance_cpu; 7685 int balance_cpu;
7686 /* Earliest time when we have to do rebalance again */
7687 unsigned long next_balance = jiffies + 60*HZ;
7688 int update_next_balance = 0;
7627 7689
7628 if (idle != CPU_IDLE || 7690 if (idle != CPU_IDLE ||
7629 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 7691 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7655,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7655 rebalance_domains(rq, CPU_IDLE); 7717 rebalance_domains(rq, CPU_IDLE);
7656 } 7718 }
7657 7719
7658 if (time_after(this_rq->next_balance, rq->next_balance)) 7720 if (time_after(next_balance, rq->next_balance)) {
7659 this_rq->next_balance = rq->next_balance; 7721 next_balance = rq->next_balance;
7722 update_next_balance = 1;
7723 }
7660 } 7724 }
7661 nohz.next_balance = this_rq->next_balance; 7725
7726 /*
7727 * next_balance will be updated only when there is a need.
7728 * When the CPU is attached to null domain for ex, it will not be
7729 * updated.
7730 */
7731 if (likely(update_next_balance))
7732 nohz.next_balance = next_balance;
7662end: 7733end:
7663 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); 7734 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7664} 7735}
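A compact standalone model of the loop above (not from the patch; HZ and the jiffies values are assumed): both hunks keep the earliest rebalance deadline seen across the walked CPUs using a wraparound-safe comparison, and only publish it when some CPU actually contributed one.

#include <stdio.h>

/* simplified wraparound-safe "is a after b" check, in the spirit of time_after() */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long next_balance = 1000 + 60 * 100;	/* jiffies + 60*HZ, HZ assumed to be 100 */
	unsigned long rq_next_balance[] = { 1450, 1200, 1320 };
	int update_next_balance = 0;
	int i;

	for (i = 0; i < 3; i++) {
		if (time_after(next_balance, rq_next_balance[i])) {
			next_balance = rq_next_balance[i];	/* keep the earliest deadline */
			update_next_balance = 1;
		}
	}
	if (update_next_balance)
		printf("nohz.next_balance = %lu\n", next_balance);	/* prints 1200 */
	return 0;
}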
@@ -7811,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7811 entity_tick(cfs_rq, se, queued); 7882 entity_tick(cfs_rq, se, queued);
7812 } 7883 }
7813 7884
7814 if (numabalancing_enabled) 7885 if (static_branch_unlikely(&sched_numa_balancing))
7815 task_tick_numa(rq, curr); 7886 task_tick_numa(rq, curr);
7816} 7887}
7817 7888
@@ -7887,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7887 check_preempt_curr(rq, p, 0); 7958 check_preempt_curr(rq, p, 0);
7888} 7959}
7889 7960
7890static void switched_from_fair(struct rq *rq, struct task_struct *p) 7961static inline bool vruntime_normalized(struct task_struct *p)
7891{ 7962{
7892 struct sched_entity *se = &p->se; 7963 struct sched_entity *se = &p->se;
7893 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7894 7964
7895 /* 7965 /*
7896 * Ensure the task's vruntime is normalized, so that when it's 7966 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
7897 * switched back to the fair class the enqueue_entity(.flags=0) will 7967 * the dequeue_entity(.flags=0) will already have normalized the
7898 * do the right thing. 7968 * vruntime.
7969 */
7970 if (p->on_rq)
7971 return true;
7972
7973 /*
7974 * When !on_rq, vruntime of the task has usually NOT been normalized.
7975 * But there are some cases where it has already been normalized:
7899 * 7976 *
7900 * If it's queued, then the dequeue_entity(.flags=0) will already 7977 * - A forked child which is waiting for being woken up by
7901 * have normalized the vruntime, if it's !queued, then only when 7978 * wake_up_new_task().
7902 * the task is sleeping will it still have non-normalized vruntime. 7979 * - A task which has been woken up by try_to_wake_up() and
7980 * waiting for actually being woken up by sched_ttwu_pending().
7903 */ 7981 */
7904 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { 7982 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
7983 return true;
7984
7985 return false;
7986}
7987
7988static void detach_task_cfs_rq(struct task_struct *p)
7989{
7990 struct sched_entity *se = &p->se;
7991 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7992
7993 if (!vruntime_normalized(p)) {
7905 /* 7994 /*
7906 * Fix up our vruntime so that the current sleep doesn't 7995 * Fix up our vruntime so that the current sleep doesn't
7907 * cause 'unlimited' sleep bonus. 7996 * cause 'unlimited' sleep bonus.
@@ -7910,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7910 se->vruntime -= cfs_rq->min_vruntime; 7999 se->vruntime -= cfs_rq->min_vruntime;
7911 } 8000 }
7912 8001
7913#ifdef CONFIG_SMP
7914 /* Catch up with the cfs_rq and remove our load when we leave */ 8002 /* Catch up with the cfs_rq and remove our load when we leave */
7915 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, 8003 detach_entity_load_avg(cfs_rq, se);
7916 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
7917
7918 cfs_rq->avg.load_avg =
7919 max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
7920 cfs_rq->avg.load_sum =
7921 max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
7922 cfs_rq->avg.util_avg =
7923 max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
7924 cfs_rq->avg.util_sum =
7925 max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
7926#endif
7927} 8004}
7928 8005
7929/* 8006static void attach_task_cfs_rq(struct task_struct *p)
7930 * We switched to the sched_fair class.
7931 */
7932static void switched_to_fair(struct rq *rq, struct task_struct *p)
7933{ 8007{
7934 struct sched_entity *se = &p->se; 8008 struct sched_entity *se = &p->se;
8009 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7935 8010
7936#ifdef CONFIG_FAIR_GROUP_SCHED 8011#ifdef CONFIG_FAIR_GROUP_SCHED
7937 /* 8012 /*
@@ -7941,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7941 se->depth = se->parent ? se->parent->depth + 1 : 0; 8016 se->depth = se->parent ? se->parent->depth + 1 : 0;
7942#endif 8017#endif
7943 8018
7944 if (!task_on_rq_queued(p)) { 8019 /* Synchronize task with its cfs_rq */
8020 attach_entity_load_avg(cfs_rq, se);
7945 8021
8022 if (!vruntime_normalized(p))
8023 se->vruntime += cfs_rq->min_vruntime;
8024}
8025
8026static void switched_from_fair(struct rq *rq, struct task_struct *p)
8027{
8028 detach_task_cfs_rq(p);
8029}
8030
8031static void switched_to_fair(struct rq *rq, struct task_struct *p)
8032{
8033 attach_task_cfs_rq(p);
8034
8035 if (task_on_rq_queued(p)) {
7946 /* 8036 /*
7947 * Ensure the task has a non-normalized vruntime when it is switched 8037 * We were most likely switched from sched_rt, so
7948 * back to the fair class with !queued, so that enqueue_entity() at 8038 * kick off the schedule if running, otherwise just see
7949 * wake-up time will do the right thing. 8039 * if we can still preempt the current task.
7950 *
7951 * If it's queued, then the enqueue_entity(.flags=0) makes the task
7952 * has non-normalized vruntime, if it's !queued, then it still has
7953 * normalized vruntime.
7954 */ 8040 */
7955 if (p->state != TASK_RUNNING) 8041 if (rq->curr == p)
7956 se->vruntime += cfs_rq_of(se)->min_vruntime; 8042 resched_curr(rq);
7957 return; 8043 else
8044 check_preempt_curr(rq, p, 0);
7958 } 8045 }
7959
7960 /*
7961 * We were most likely switched from sched_rt, so
7962 * kick off the schedule if running, otherwise just see
7963 * if we can still preempt the current task.
7964 */
7965 if (rq->curr == p)
7966 resched_curr(rq);
7967 else
7968 check_preempt_curr(rq, p, 0);
7969} 8046}
7970 8047
7971/* Account for a task changing its policy or group. 8048/* Account for a task changing its policy or group.
@@ -8000,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
8000} 8077}
8001 8078
8002#ifdef CONFIG_FAIR_GROUP_SCHED 8079#ifdef CONFIG_FAIR_GROUP_SCHED
8003static void task_move_group_fair(struct task_struct *p, int queued) 8080static void task_move_group_fair(struct task_struct *p)
8004{ 8081{
8005 struct sched_entity *se = &p->se; 8082 detach_task_cfs_rq(p);
8006 struct cfs_rq *cfs_rq;
8007
8008 /*
8009 * If the task was not on the rq at the time of this cgroup movement
8010 * it must have been asleep, sleeping tasks keep their ->vruntime
8011 * absolute on their old rq until wakeup (needed for the fair sleeper
8012 * bonus in place_entity()).
8013 *
8014 * If it was on the rq, we've just 'preempted' it, which does convert
8015 * ->vruntime to a relative base.
8016 *
8017 * Make sure both cases convert their relative position when migrating
8018 * to another cgroup's rq. This does somewhat interfere with the
8019 * fair sleeper stuff for the first placement, but who cares.
8020 */
8021 /*
8022 * When !queued, vruntime of the task has usually NOT been normalized.
8023 * But there are some cases where it has already been normalized:
8024 *
8025 * - Moving a forked child which is waiting for being woken up by
8026 * wake_up_new_task().
8027 * - Moving a task which has been woken up by try_to_wake_up() and
8028 * waiting for actually being woken up by sched_ttwu_pending().
8029 *
8030 * To prevent boost or penalty in the new cfs_rq caused by delta
8031 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
8032 */
8033 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
8034 queued = 1;
8035
8036 if (!queued)
8037 se->vruntime -= cfs_rq_of(se)->min_vruntime;
8038 set_task_rq(p, task_cpu(p)); 8083 set_task_rq(p, task_cpu(p));
8039 se->depth = se->parent ? se->parent->depth + 1 : 0;
8040 if (!queued) {
8041 cfs_rq = cfs_rq_of(se);
8042 se->vruntime += cfs_rq->min_vruntime;
8043 8084
8044#ifdef CONFIG_SMP 8085#ifdef CONFIG_SMP
8045 /* Virtually synchronize task with its new cfs_rq */ 8086 /* Tell se's cfs_rq has been changed -- migrated */
8046 p->se.avg.last_update_time = cfs_rq->avg.last_update_time; 8087 p->se.avg.last_update_time = 0;
8047 cfs_rq->avg.load_avg += p->se.avg.load_avg;
8048 cfs_rq->avg.load_sum += p->se.avg.load_sum;
8049 cfs_rq->avg.util_avg += p->se.avg.util_avg;
8050 cfs_rq->avg.util_sum += p->se.avg.util_sum;
8051#endif 8088#endif
8052 } 8089 attach_task_cfs_rq(p);
8053} 8090}
8054 8091
8055void free_fair_sched_group(struct task_group *tg) 8092void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 83a50e7ca533..69631fa46c2f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
36 */ 36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true) 37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38 38
39/*
40 * Use arch dependent cpu capacity functions
41 */
42SCHED_FEAT(ARCH_CAPACITY, true)
43
44SCHED_FEAT(HRTICK, false) 39SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false) 40SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 41SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
72SCHED_FEAT(FORCE_SD_OVERLAP, false) 67SCHED_FEAT(FORCE_SD_OVERLAP, false)
73SCHED_FEAT(RT_RUNTIME_SHARE, true) 68SCHED_FEAT(RT_RUNTIME_SHARE, true)
74SCHED_FEAT(LB_MIN, false) 69SCHED_FEAT(LB_MIN, false)
70SCHED_FEAT(ATTACH_AGE_LOAD, true)
75 71
76/*
77 * Apply the automatic NUMA scheduling policy. Enabled automatically
78 * at runtime if running on a NUMA machine. Can be controlled via
79 * numa_balancing=
80 */
81#ifdef CONFIG_NUMA_BALANCING
82
83/*
84 * NUMA will favor moving tasks towards nodes where a higher number of
85 * hinting faults are recorded during active load balancing. It will
86 * resist moving tasks towards nodes where a lower number of hinting
87 * faults have been recorded.
88 */
89SCHED_FEAT(NUMA, true)
90#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d2ea59364a1c..e3cc16312046 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
635/* 635/*
636 * We ran out of runtime, see if we can borrow some from our neighbours. 636 * We ran out of runtime, see if we can borrow some from our neighbours.
637 */ 637 */
638static int do_balance_runtime(struct rt_rq *rt_rq) 638static void do_balance_runtime(struct rt_rq *rt_rq)
639{ 639{
640 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 640 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
641 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; 641 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
642 int i, weight, more = 0; 642 int i, weight;
643 u64 rt_period; 643 u64 rt_period;
644 644
645 weight = cpumask_weight(rd->span); 645 weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
673 diff = rt_period - rt_rq->rt_runtime; 673 diff = rt_period - rt_rq->rt_runtime;
674 iter->rt_runtime -= diff; 674 iter->rt_runtime -= diff;
675 rt_rq->rt_runtime += diff; 675 rt_rq->rt_runtime += diff;
676 more = 1;
677 if (rt_rq->rt_runtime == rt_period) { 676 if (rt_rq->rt_runtime == rt_period) {
678 raw_spin_unlock(&iter->rt_runtime_lock); 677 raw_spin_unlock(&iter->rt_runtime_lock);
679 break; 678 break;
@@ -683,8 +682,6 @@ next:
683 raw_spin_unlock(&iter->rt_runtime_lock); 682 raw_spin_unlock(&iter->rt_runtime_lock);
684 } 683 }
685 raw_spin_unlock(&rt_b->rt_runtime_lock); 684 raw_spin_unlock(&rt_b->rt_runtime_lock);
686
687 return more;
688} 685}
689 686
690/* 687/*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
796 } 793 }
797} 794}
798 795
799static int balance_runtime(struct rt_rq *rt_rq) 796static void balance_runtime(struct rt_rq *rt_rq)
800{ 797{
801 int more = 0;
802
803 if (!sched_feat(RT_RUNTIME_SHARE)) 798 if (!sched_feat(RT_RUNTIME_SHARE))
804 return more; 799 return;
805 800
806 if (rt_rq->rt_time > rt_rq->rt_runtime) { 801 if (rt_rq->rt_time > rt_rq->rt_runtime) {
807 raw_spin_unlock(&rt_rq->rt_runtime_lock); 802 raw_spin_unlock(&rt_rq->rt_runtime_lock);
808 more = do_balance_runtime(rt_rq); 803 do_balance_runtime(rt_rq);
809 raw_spin_lock(&rt_rq->rt_runtime_lock); 804 raw_spin_lock(&rt_rq->rt_runtime_lock);
810 } 805 }
811
812 return more;
813} 806}
814#else /* !CONFIG_SMP */ 807#else /* !CONFIG_SMP */
815static inline int balance_runtime(struct rt_rq *rt_rq) 808static inline void balance_runtime(struct rt_rq *rt_rq) {}
816{
817 return 0;
818}
819#endif /* CONFIG_SMP */ 809#endif /* CONFIG_SMP */
820 810
821static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 811static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119c7ad9..efd3bfc7e347 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
84 */ 84 */
85#define RUNTIME_INF ((u64)~0ULL) 85#define RUNTIME_INF ((u64)~0ULL)
86 86
87static inline int idle_policy(int policy)
88{
89 return policy == SCHED_IDLE;
90}
87static inline int fair_policy(int policy) 91static inline int fair_policy(int policy)
88{ 92{
89 return policy == SCHED_NORMAL || policy == SCHED_BATCH; 93 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
98{ 102{
99 return policy == SCHED_DEADLINE; 103 return policy == SCHED_DEADLINE;
100} 104}
105static inline bool valid_policy(int policy)
106{
107 return idle_policy(policy) || fair_policy(policy) ||
108 rt_policy(policy) || dl_policy(policy);
109}
101 110
102static inline int task_has_rt_policy(struct task_struct *p) 111static inline int task_has_rt_policy(struct task_struct *p)
103{ 112{
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
109 return dl_policy(p->policy); 118 return dl_policy(p->policy);
110} 119}
111 120
112static inline bool dl_time_before(u64 a, u64 b)
113{
114 return (s64)(a - b) < 0;
115}
116
117/* 121/*
118 * Tells if entity @a should preempt entity @b. 122 * Tells if entity @a should preempt entity @b.
119 */ 123 */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1003#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 1007#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1004#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1008#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
1005 1009
1006#ifdef CONFIG_NUMA_BALANCING 1010extern struct static_key_false sched_numa_balancing;
1007#define sched_feat_numa(x) sched_feat(x)
1008#ifdef CONFIG_SCHED_DEBUG
1009#define numabalancing_enabled sched_feat_numa(NUMA)
1010#else
1011extern bool numabalancing_enabled;
1012#endif /* CONFIG_SCHED_DEBUG */
1013#else
1014#define sched_feat_numa(x) (0)
1015#define numabalancing_enabled (0)
1016#endif /* CONFIG_NUMA_BALANCING */
1017 1011
1018static inline u64 global_rt_period(void) 1012static inline u64 global_rt_period(void)
1019{ 1013{
@@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
1157 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1151 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1158}; 1152};
1159 1153
1160#define ENQUEUE_WAKEUP 1 1154#define ENQUEUE_WAKEUP 0x01
1161#define ENQUEUE_HEAD 2 1155#define ENQUEUE_HEAD 0x02
1162#ifdef CONFIG_SMP 1156#ifdef CONFIG_SMP
1163#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1157#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
1164#else 1158#else
1165#define ENQUEUE_WAKING 0 1159#define ENQUEUE_WAKING 0x00
1166#endif 1160#endif
1167#define ENQUEUE_REPLENISH 8 1161#define ENQUEUE_REPLENISH 0x08
1162#define ENQUEUE_RESTORE 0x10
1168 1163
1169#define DEQUEUE_SLEEP 1 1164#define DEQUEUE_SLEEP 0x01
1165#define DEQUEUE_SAVE 0x02
1170 1166
1171#define RETRY_TASK ((void *)-1UL) 1167#define RETRY_TASK ((void *)-1UL)
1172 1168
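A small illustration, not part of the patch, using the values from the hunk above: writing the flags as hexadecimal bit values makes it explicit that they form an OR-able mask, which the new ENQUEUE_RESTORE/DEQUEUE_SAVE pair relies on.

#include <stdio.h>

#define ENQUEUE_WAKEUP		0x01
#define ENQUEUE_HEAD		0x02
#define ENQUEUE_WAKING		0x04
#define ENQUEUE_REPLENISH	0x08
#define ENQUEUE_RESTORE		0x10

int main(void)
{
	int flags = ENQUEUE_WAKEUP | ENQUEUE_WAKING;

	if (flags & ENQUEUE_WAKEUP)
		printf("wakeup enqueue\n");
	if (!(flags & ENQUEUE_RESTORE))
		printf("not part of a save/restore cycle\n");
	return 0;
}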
@@ -1194,7 +1190,7 @@ struct sched_class {
1194 1190
1195#ifdef CONFIG_SMP 1191#ifdef CONFIG_SMP
1196 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1192 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1197 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1193 void (*migrate_task_rq)(struct task_struct *p);
1198 1194
1199 void (*task_waking) (struct task_struct *task); 1195 void (*task_waking) (struct task_struct *task);
1200 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1196 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1227,7 +1223,7 @@ struct sched_class {
1227 void (*update_curr) (struct rq *rq); 1223 void (*update_curr) (struct rq *rq);
1228 1224
1229#ifdef CONFIG_FAIR_GROUP_SCHED 1225#ifdef CONFIG_FAIR_GROUP_SCHED
1230 void (*task_move_group) (struct task_struct *p, int on_rq); 1226 void (*task_move_group) (struct task_struct *p);
1231#endif 1227#endif
1232}; 1228};
1233 1229
@@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1405} 1401}
1406#endif 1402#endif
1407 1403
1404#ifndef arch_scale_cpu_capacity
1405static __always_inline
1406unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1407{
1408 if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
1409 return sd->smt_gain / sd->span_weight;
1410
1411 return SCHED_CAPACITY_SCALE;
1412}
1413#endif
1414
1408static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1415static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1409{ 1416{
1410 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); 1417 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
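The default arch_scale_cpu_capacity() above takes over the old SMT heuristic removed from fair.c: in a capacity-sharing domain each hardware thread reports smt_gain / span_weight instead of the full SCHED_CAPACITY_SCALE. A worked standalone example (not from the patch), assuming a two-thread sibling domain and an smt_gain of 1178:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

static unsigned long default_cpu_capacity(int shares_cpucapacity,
					  unsigned int span_weight,
					  unsigned long smt_gain)
{
	if (shares_cpucapacity && span_weight > 1)
		return smt_gain / span_weight;	/* per-thread share of the core */

	return SCHED_CAPACITY_SCALE;
}

int main(void)
{
	printf("%lu\n", default_cpu_capacity(1, 2, 1178));	/* 589 per SMT thread */
	printf("%lu\n", default_cpu_capacity(0, 1, 1178));	/* 1024 for a full core */
	return 0;
}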
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index a818cbc73e14..d264f59bff56 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
222{ 222{
223 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 223 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
224 224
225 if (ht->pre_unpark) 225 if (!ht->selfparking)
226 ht->pre_unpark(cpu); 226 kthread_unpark(tsk);
227 kthread_unpark(tsk);
228} 227}
229 228
230void smpboot_unpark_threads(unsigned int cpu) 229void smpboot_unpark_threads(unsigned int cpu)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12484e5d5c88..867bc20e1ef1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
73 } 73 }
74} 74}
75 75
76static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
77 struct cpu_stop_work *work)
78{
79 list_add_tail(&work->list, &stopper->works);
80 wake_up_process(stopper->thread);
81}
82
76/* queue @work to @stopper. if offline, @work is completed immediately */ 83/* queue @work to @stopper. if offline, @work is completed immediately */
77static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) 84static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
78{ 85{
79 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 86 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
80
81 unsigned long flags; 87 unsigned long flags;
82 88
83 spin_lock_irqsave(&stopper->lock, flags); 89 spin_lock_irqsave(&stopper->lock, flags);
84 90 if (stopper->enabled)
85 if (stopper->enabled) { 91 __cpu_stop_queue_work(stopper, work);
86 list_add_tail(&work->list, &stopper->works); 92 else
87 wake_up_process(stopper->thread);
88 } else
89 cpu_stop_signal_done(work->done, false); 93 cpu_stop_signal_done(work->done, false);
90
91 spin_unlock_irqrestore(&stopper->lock, flags); 94 spin_unlock_irqrestore(&stopper->lock, flags);
92} 95}
93 96
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
213 return err; 216 return err;
214} 217}
215 218
219static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
220 int cpu2, struct cpu_stop_work *work2)
221{
222 struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
223 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
224 int err;
225
226 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
227 spin_lock_irq(&stopper1->lock);
228 spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
229
230 err = -ENOENT;
231 if (!stopper1->enabled || !stopper2->enabled)
232 goto unlock;
233
234 err = 0;
235 __cpu_stop_queue_work(stopper1, work1);
236 __cpu_stop_queue_work(stopper2, work2);
237unlock:
238 spin_unlock(&stopper2->lock);
239 spin_unlock_irq(&stopper1->lock);
240 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
241
242 return err;
243}
216/** 244/**
217 * stop_two_cpus - stops two cpus 245 * stop_two_cpus - stops two cpus
218 * @cpu1: the cpu to stop 246 * @cpu1: the cpu to stop
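A userspace illustration of the ordering rule above, not part of the patch: cpu_stop_queue_two_works() together with the swap(cpu1, cpu2) in stop_two_cpus() pins down a single lock order (lower CPU number first, with the nested annotation for the second lock), so two concurrent callers can never deadlock on each other's stopper locks. The sketch uses pthread mutexes purely to show the idea.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t stopper_lock[2] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static void queue_two(int cpu1, int cpu2)
{
	if (cpu1 > cpu2) {			/* mirror the swap(cpu1, cpu2) above */
		int tmp = cpu1;
		cpu1 = cpu2;
		cpu2 = tmp;
	}
	pthread_mutex_lock(&stopper_lock[cpu1]);	/* always lower CPU number first */
	pthread_mutex_lock(&stopper_lock[cpu2]);

	printf("queued on cpus %d and %d\n", cpu1, cpu2);

	pthread_mutex_unlock(&stopper_lock[cpu2]);
	pthread_mutex_unlock(&stopper_lock[cpu1]);
}

int main(void)
{
	queue_two(1, 0);	/* takes the locks in the same order as queue_two(0, 1) would */
	return 0;
}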
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
247 cpu_stop_init_done(&done, 2); 275 cpu_stop_init_done(&done, 2);
248 set_state(&msdata, MULTI_STOP_PREPARE); 276 set_state(&msdata, MULTI_STOP_PREPARE);
249 277
250 /* 278 if (cpu1 > cpu2)
251 * If we observe both CPUs active we know _cpu_down() cannot yet have 279 swap(cpu1, cpu2);
252 * queued its stop_machine works and therefore ours will get executed 280 if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
253 * first. Or its not either one of our CPUs that's getting unplugged,
254 * in which case we don't care.
255 *
256 * This relies on the stopper workqueues to be FIFO.
257 */
258 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
259 preempt_enable(); 281 preempt_enable();
260 return -ENOENT; 282 return -ENOENT;
261 } 283 }
262 284
263 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
264 cpu_stop_queue_work(cpu1, &work1);
265 cpu_stop_queue_work(cpu2, &work2);
266 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
267
268 preempt_enable(); 285 preempt_enable();
269 286
270 wait_for_completion(&done.completion); 287 wait_for_completion(&done.completion);
@@ -452,6 +469,18 @@ repeat:
452 } 469 }
453} 470}
454 471
472void stop_machine_park(int cpu)
473{
474 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
475 /*
476 * Lockless. cpu_stopper_thread() will take stopper->lock and flush
477 * the pending works before it parks, until then it is fine to queue
478 * the new works.
479 */
480 stopper->enabled = false;
481 kthread_park(stopper->thread);
482}
483
455extern void sched_set_stop_task(int cpu, struct task_struct *stop); 484extern void sched_set_stop_task(int cpu, struct task_struct *stop);
456 485
457static void cpu_stop_create(unsigned int cpu) 486static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
462static void cpu_stop_park(unsigned int cpu) 491static void cpu_stop_park(unsigned int cpu)
463{ 492{
464 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 493 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
465 struct cpu_stop_work *work, *tmp;
466 unsigned long flags;
467 494
468 /* drain remaining works */ 495 WARN_ON(!list_empty(&stopper->works));
469 spin_lock_irqsave(&stopper->lock, flags);
470 list_for_each_entry_safe(work, tmp, &stopper->works, list) {
471 list_del_init(&work->list);
472 cpu_stop_signal_done(work->done, false);
473 }
474 stopper->enabled = false;
475 spin_unlock_irqrestore(&stopper->lock, flags);
476} 496}
477 497
478static void cpu_stop_unpark(unsigned int cpu) 498void stop_machine_unpark(int cpu)
479{ 499{
480 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 500 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
481 501
482 spin_lock_irq(&stopper->lock);
483 stopper->enabled = true; 502 stopper->enabled = true;
484 spin_unlock_irq(&stopper->lock); 503 kthread_unpark(stopper->thread);
485} 504}
486 505
487static struct smp_hotplug_thread cpu_stop_threads = { 506static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
490 .thread_fn = cpu_stopper_thread, 509 .thread_fn = cpu_stopper_thread,
491 .thread_comm = "migration/%u", 510 .thread_comm = "migration/%u",
492 .create = cpu_stop_create, 511 .create = cpu_stop_create,
493 .setup = cpu_stop_unpark,
494 .park = cpu_stop_park, 512 .park = cpu_stop_park,
495 .pre_unpark = cpu_stop_unpark,
496 .selfparking = true, 513 .selfparking = true,
497}; 514};
498 515
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
508 } 525 }
509 526
510 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); 527 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
528 stop_machine_unpark(raw_smp_processor_id());
511 stop_machine_initialized = true; 529 stop_machine_initialized = true;
512 return 0; 530 return 0;
513} 531}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b0623ac785a2..00611e95a8ee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5697,7 +5697,7 @@ free:
5697} 5697}
5698 5698
5699static void 5699static void
5700ftrace_graph_probe_sched_switch(void *ignore, 5700ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
5701 struct task_struct *prev, struct task_struct *next) 5701 struct task_struct *prev, struct task_struct *next)
5702{ 5702{
5703 unsigned long long timestamp; 5703 unsigned long long timestamp;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index f270088e9929..4c896a0101bd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int sched_ref;
16static DEFINE_MUTEX(sched_register_mutex); 16static DEFINE_MUTEX(sched_register_mutex);
17 17
18static void 18static void
19probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 19probe_sched_switch(void *ignore, bool preempt,
20 struct task_struct *prev, struct task_struct *next)
20{ 21{
21 if (unlikely(!sched_ref)) 22 if (unlikely(!sched_ref))
22 return; 23 return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 12cbe77b4136..4bcfbac289ff 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
420} 420}
421 421
422static void notrace 422static void notrace
423probe_wakeup_sched_switch(void *ignore, 423probe_wakeup_sched_switch(void *ignore, bool preempt,
424 struct task_struct *prev, struct task_struct *next) 424 struct task_struct *prev, struct task_struct *next)
425{ 425{
426 struct trace_array_cpu *data; 426 struct trace_array_cpu *data;