author	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-13 14:25:07 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-13 14:25:07 -0400
commit	f7951c33f0fed14ee26651a70a46899a59a31e18 (patch)
tree	dff372035ceaa7b3a01e2f15c885ff0ff2510e68
parent	2406fb8d94fb17fee3ace0c09427c08825eacb16 (diff)
parent	1b6266ebe3da8198e9a02fbad77bbb56e2f7ce2e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner:

 - Cleanup and improvement of NUMA balancing

 - Refactoring and improvements to the PELT (Per Entity Load Tracking)
   code

 - Watchdog simplification and related cleanups

 - The usual pile of small incremental fixes and improvements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  watchdog: Reduce message verbosity
  stop_machine: Reflow cpu_stop_queue_two_works()
  sched/numa: Move task_numa_placement() closer to numa_migrate_preferred()
  sched/numa: Use group_weights to identify if migration degrades locality
  sched/numa: Update the scan period without holding the numa_group lock
  sched/numa: Remove numa_has_capacity()
  sched/numa: Modify migrate_swap() to accept additional parameters
  sched/numa: Remove unused task_capacity from 'struct numa_stats'
  sched/numa: Skip nodes that are at 'hoplimit'
  sched/debug: Reverse the order of printing faults
  sched/numa: Use task faults only if numa_group is not yet set up
  sched/numa: Set preferred_node based on best_cpu
  sched/numa: Simplify load_too_imbalanced()
  sched/numa: Evaluate move once per node
  sched/numa: Remove redundant field
  sched/debug: Show the sum wait time of a task group
  sched/fair: Remove #ifdefs from scale_rt_capacity()
  sched/core: Remove get_cpu() from sched_fork()
  sched/cpufreq: Clarify sugov_get_util()
  sched/sysctl: Remove unused sched_time_avg_ms sysctl
  ...
-rw-r--r--  arch/mips/kvm/mips.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 6
-rw-r--r--  arch/s390/kvm/interrupt.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 4
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  include/linux/cpuhotplug.h | 1
-rw-r--r--  include/linux/nmi.h | 10
-rw-r--r--  include/linux/sched.h | 1
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/smpboot.h | 15
-rw-r--r--  include/linux/swait.h | 36
-rw-r--r--  kernel/cpu.c | 5
-rw-r--r--  kernel/kthread.c | 6
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/rcu/srcutiny.c | 4
-rw-r--r--  kernel/rcu/tree.c | 8
-rw-r--r--  kernel/rcu/tree_exp.h | 4
-rw-r--r--  kernel/rcu/tree_plugin.h | 12
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 72
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 103
-rw-r--r--  kernel/sched/deadline.c | 8
-rw-r--r--  kernel/sched/debug.c | 35
-rw-r--r--  kernel/sched/fair.c | 663
-rw-r--r--  kernel/sched/pelt.c | 399
-rw-r--r--  kernel/sched/pelt.h | 72
-rw-r--r--  kernel/sched/rt.c | 15
-rw-r--r--  kernel/sched/sched.h | 87
-rw-r--r--  kernel/sched/swait.c | 32
-rw-r--r--  kernel/smpboot.c | 54
-rw-r--r--  kernel/stop_machine.c | 41
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/watchdog.c | 147
-rw-r--r--  kernel/watchdog_hld.c | 4
-rw-r--r--  virt/kvm/arm/arm.c | 4
-rw-r--r--  virt/kvm/arm/psci.c | 2
-rw-r--r--  virt/kvm/async_pf.c | 2
-rw-r--r--  virt/kvm/kvm_main.c | 4
38 files changed, 1009 insertions(+), 870 deletions(-)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7cd76f93a438..f7ea8e21656b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
515 dvcpu->arch.wait = 0; 515 dvcpu->arch.wait = 0;
516 516
517 if (swq_has_sleeper(&dvcpu->wq)) 517 if (swq_has_sleeper(&dvcpu->wq))
518 swake_up(&dvcpu->wq); 518 swake_up_one(&dvcpu->wq);
519 519
520 return 0; 520 return 0;
521} 521}
@@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
1204 1204
1205 vcpu->arch.wait = 0; 1205 vcpu->arch.wait = 0;
1206 if (swq_has_sleeper(&vcpu->wq)) 1206 if (swq_has_sleeper(&vcpu->wq))
1207 swake_up(&vcpu->wq); 1207 swake_up_one(&vcpu->wq);
1208} 1208}
1209 1209
1210/* low level hrtimer wake routine */ 1210/* low level hrtimer wake routine */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de686b340f4a..ee4a8854985e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
216 216
217 wqp = kvm_arch_vcpu_wq(vcpu); 217 wqp = kvm_arch_vcpu_wq(vcpu);
218 if (swq_has_sleeper(wqp)) { 218 if (swq_has_sleeper(wqp)) {
219 swake_up(wqp); 219 swake_up_one(wqp);
220 ++vcpu->stat.halt_wakeup; 220 ++vcpu->stat.halt_wakeup;
221 } 221 }
222 222
@@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
3188 } 3188 }
3189 } 3189 }
3190 3190
3191 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 3191 prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
3192 3192
3193 if (kvmppc_vcore_check_block(vc)) { 3193 if (kvmppc_vcore_check_block(vc)) {
3194 finish_swait(&vc->wq, &wait); 3194 finish_swait(&vc->wq, &wait);
@@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3311 kvmppc_start_thread(vcpu, vc); 3311 kvmppc_start_thread(vcpu, vc);
3312 trace_kvm_guest_enter(vcpu); 3312 trace_kvm_guest_enter(vcpu);
3313 } else if (vc->vcore_state == VCORE_SLEEPING) { 3313 } else if (vc->vcore_state == VCORE_SLEEPING) {
3314 swake_up(&vc->wq); 3314 swake_up_one(&vc->wq);
3315 } 3315 }
3316 3316
3317 } 3317 }
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index daa09f89ca2d..fcb55b02990e 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
1145 * yield-candidate. 1145 * yield-candidate.
1146 */ 1146 */
1147 vcpu->preempted = true; 1147 vcpu->preempted = true;
1148 swake_up(&vcpu->wq); 1148 swake_up_one(&vcpu->wq);
1149 vcpu->stat.halt_wakeup++; 1149 vcpu->stat.halt_wakeup++;
1150 } 1150 }
1151 /* 1151 /*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b818af..a37bda38d205 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
154 154
155 for (;;) { 155 for (;;) {
156 if (!n.halted) 156 if (!n.halted)
157 prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); 157 prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
158 if (hlist_unhashed(&n.link)) 158 if (hlist_unhashed(&n.link))
159 break; 159 break;
160 160
@@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
188 if (n->halted) 188 if (n->halted)
189 smp_send_reschedule(n->cpu); 189 smp_send_reschedule(n->cpu);
190 else if (swq_has_sleeper(&n->wq)) 190 else if (swq_has_sleeper(&n->wq))
191 swake_up(&n->wq); 191 swake_up_one(&n->wq);
192} 192}
193 193
194static void apf_task_wake_all(void) 194static void apf_task_wake_all(void)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b5cd8465d44f..d536d457517b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
1379 * using swait_active() is safe. 1379 * using swait_active() is safe.
1380 */ 1380 */
1381 if (swait_active(q)) 1381 if (swait_active(q))
1382 swake_up(q); 1382 swake_up_one(q);
1383 1383
1384 if (apic_lvtt_tscdeadline(apic)) 1384 if (apic_lvtt_tscdeadline(apic))
1385 ktimer->expired_tscdeadline = ktimer->tscdeadline; 1385 ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8796ba387152..4cf06a64bc02 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -164,6 +164,7 @@ enum cpuhp_state {
164 CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, 164 CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
165 CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, 165 CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
166 CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, 166 CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
167 CPUHP_AP_WATCHDOG_ONLINE,
167 CPUHP_AP_WORKQUEUE_ONLINE, 168 CPUHP_AP_WORKQUEUE_ONLINE,
168 CPUHP_AP_RCUTREE_ONLINE, 169 CPUHP_AP_RCUTREE_ONLINE,
169 CPUHP_AP_ONLINE_DYN, 170 CPUHP_AP_ONLINE_DYN,
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b8d868d23e79..08f9247e9827 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void);
45extern void touch_softlockup_watchdog_sync(void); 45extern void touch_softlockup_watchdog_sync(void);
46extern void touch_all_softlockup_watchdogs(void); 46extern void touch_all_softlockup_watchdogs(void);
47extern unsigned int softlockup_panic; 47extern unsigned int softlockup_panic;
48#else 48
49extern int lockup_detector_online_cpu(unsigned int cpu);
50extern int lockup_detector_offline_cpu(unsigned int cpu);
51#else /* CONFIG_SOFTLOCKUP_DETECTOR */
49static inline void touch_softlockup_watchdog_sched(void) { } 52static inline void touch_softlockup_watchdog_sched(void) { }
50static inline void touch_softlockup_watchdog(void) { } 53static inline void touch_softlockup_watchdog(void) { }
51static inline void touch_softlockup_watchdog_sync(void) { } 54static inline void touch_softlockup_watchdog_sync(void) { }
52static inline void touch_all_softlockup_watchdogs(void) { } 55static inline void touch_all_softlockup_watchdogs(void) { }
53#endif 56
57#define lockup_detector_online_cpu NULL
58#define lockup_detector_offline_cpu NULL
59#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
54 60
55#ifdef CONFIG_DETECT_HUNG_TASK 61#ifdef CONFIG_DETECT_HUNG_TASK
56void reset_hung_task_detector(void); 62void reset_hung_task_detector(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..e0f4f56c9310 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1017,7 +1017,6 @@ struct task_struct {
1017 u64 last_sum_exec_runtime; 1017 u64 last_sum_exec_runtime;
1018 struct callback_head numa_work; 1018 struct callback_head numa_work;
1019 1019
1020 struct list_head numa_entry;
1021 struct numa_group *numa_group; 1020 struct numa_group *numa_group;
1022 1021
1023 /* 1022 /*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 1c1a1512ec55..913488d828cb 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
40#ifdef CONFIG_SCHED_DEBUG 40#ifdef CONFIG_SCHED_DEBUG
41extern __read_mostly unsigned int sysctl_sched_migration_cost; 41extern __read_mostly unsigned int sysctl_sched_migration_cost;
42extern __read_mostly unsigned int sysctl_sched_nr_migrate; 42extern __read_mostly unsigned int sysctl_sched_nr_migrate;
43extern __read_mostly unsigned int sysctl_sched_time_avg;
44 43
45int sched_proc_update_handler(struct ctl_table *table, int write, 44int sched_proc_update_handler(struct ctl_table *table, int write,
46 void __user *buffer, size_t *length, 45 void __user *buffer, size_t *length,
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index c174844cf663..d0884b525001 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -25,8 +25,6 @@ struct smpboot_thread_data;
25 * parked (cpu offline) 25 * parked (cpu offline)
26 * @unpark: Optional unpark function, called when the thread is 26 * @unpark: Optional unpark function, called when the thread is
27 * unparked (cpu online) 27 * unparked (cpu online)
28 * @cpumask: Internal state. To update which threads are unparked,
29 * call smpboot_update_cpumask_percpu_thread().
30 * @selfparking: Thread is not parked by the park function. 28 * @selfparking: Thread is not parked by the park function.
31 * @thread_comm: The base name of the thread 29 * @thread_comm: The base name of the thread
32 */ 30 */
@@ -40,23 +38,12 @@ struct smp_hotplug_thread {
40 void (*cleanup)(unsigned int cpu, bool online); 38 void (*cleanup)(unsigned int cpu, bool online);
41 void (*park)(unsigned int cpu); 39 void (*park)(unsigned int cpu);
42 void (*unpark)(unsigned int cpu); 40 void (*unpark)(unsigned int cpu);
43 cpumask_var_t cpumask;
44 bool selfparking; 41 bool selfparking;
45 const char *thread_comm; 42 const char *thread_comm;
46}; 43};
47 44
48int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, 45int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
49 const struct cpumask *cpumask);
50
51static inline int
52smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
53{
54 return smpboot_register_percpu_thread_cpumask(plug_thread,
55 cpu_possible_mask);
56}
57 46
58void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); 47void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
59void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
60 const struct cpumask *);
61 48
62#endif 49#endif
diff --git a/include/linux/swait.h b/include/linux/swait.h
index bf8cb0dee23c..73e06e9986d4 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -16,7 +16,7 @@
16 * wait-queues, but the semantics are actually completely different, and 16 * wait-queues, but the semantics are actually completely different, and
17 * every single user we have ever had has been buggy (or pointless). 17 * every single user we have ever had has been buggy (or pointless).
18 * 18 *
19 * A "swake_up()" only wakes up _one_ waiter, which is not at all what 19 * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
20 * "wake_up()" does, and has led to problems. In other cases, it has 20 * "wake_up()" does, and has led to problems. In other cases, it has
21 * been fine, because there's only ever one waiter (kvm), but in that 21 * been fine, because there's only ever one waiter (kvm), but in that
22 * case gthe whole "simple" wait-queue is just pointless to begin with, 22 * case gthe whole "simple" wait-queue is just pointless to begin with,
@@ -38,8 +38,8 @@
38 * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right 38 * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
39 * sleeper state. 39 * sleeper state.
40 * 40 *
41 * - the exclusive mode; because this requires preserving the list order 41 * - the !exclusive mode; because that leads to O(n) wakeups, everything is
42 * and this is hard. 42 * exclusive.
43 * 43 *
44 * - custom wake callback functions; because you cannot give any guarantees 44 * - custom wake callback functions; because you cannot give any guarantees
45 * about random code. This also allows swait to be used in RT, such that 45 * about random code. This also allows swait to be used in RT, such that
@@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
115 * CPU0 - waker CPU1 - waiter 115 * CPU0 - waker CPU1 - waiter
116 * 116 *
117 * for (;;) { 117 * for (;;) {
118 * @cond = true; prepare_to_swait(&wq_head, &wait, state); 118 * @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state);
119 * smp_mb(); // smp_mb() from set_current_state() 119 * smp_mb(); // smp_mb() from set_current_state()
120 * if (swait_active(wq_head)) if (@cond) 120 * if (swait_active(wq_head)) if (@cond)
121 * wake_up(wq_head); break; 121 * wake_up(wq_head); break;
@@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
157 return swait_active(wq); 157 return swait_active(wq);
158} 158}
159 159
160extern void swake_up(struct swait_queue_head *q); 160extern void swake_up_one(struct swait_queue_head *q);
161extern void swake_up_all(struct swait_queue_head *q); 161extern void swake_up_all(struct swait_queue_head *q);
162extern void swake_up_locked(struct swait_queue_head *q); 162extern void swake_up_locked(struct swait_queue_head *q);
163 163
164extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); 164extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
165extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
166extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); 165extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
167 166
168extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); 167extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
169extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); 168extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
170 169
171/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ 170/* as per ___wait_event() but for swait, therefore "exclusive == 1" */
172#define ___swait_event(wq, condition, state, ret, cmd) \ 171#define ___swait_event(wq, condition, state, ret, cmd) \
173({ \ 172({ \
173 __label__ __out; \
174 struct swait_queue __wait; \ 174 struct swait_queue __wait; \
175 long __ret = ret; \ 175 long __ret = ret; \
176 \ 176 \
@@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
183 \ 183 \
184 if (___wait_is_interruptible(state) && __int) { \ 184 if (___wait_is_interruptible(state) && __int) { \
185 __ret = __int; \ 185 __ret = __int; \
186 break; \ 186 goto __out; \
187 } \ 187 } \
188 \ 188 \
189 cmd; \ 189 cmd; \
190 } \ 190 } \
191 finish_swait(&wq, &__wait); \ 191 finish_swait(&wq, &__wait); \
192 __ret; \ 192__out: __ret; \
193}) 193})
194 194
195#define __swait_event(wq, condition) \ 195#define __swait_event(wq, condition) \
196 (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ 196 (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
197 schedule()) 197 schedule())
198 198
199#define swait_event(wq, condition) \ 199#define swait_event_exclusive(wq, condition) \
200do { \ 200do { \
201 if (condition) \ 201 if (condition) \
202 break; \ 202 break; \
@@ -208,7 +208,7 @@ do { \
208 TASK_UNINTERRUPTIBLE, timeout, \ 208 TASK_UNINTERRUPTIBLE, timeout, \
209 __ret = schedule_timeout(__ret)) 209 __ret = schedule_timeout(__ret))
210 210
211#define swait_event_timeout(wq, condition, timeout) \ 211#define swait_event_timeout_exclusive(wq, condition, timeout) \
212({ \ 212({ \
213 long __ret = timeout; \ 213 long __ret = timeout; \
214 if (!___wait_cond_timeout(condition)) \ 214 if (!___wait_cond_timeout(condition)) \
@@ -220,7 +220,7 @@ do { \
220 ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ 220 ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
221 schedule()) 221 schedule())
222 222
223#define swait_event_interruptible(wq, condition) \ 223#define swait_event_interruptible_exclusive(wq, condition) \
224({ \ 224({ \
225 int __ret = 0; \ 225 int __ret = 0; \
226 if (!(condition)) \ 226 if (!(condition)) \
@@ -233,7 +233,7 @@ do { \
233 TASK_INTERRUPTIBLE, timeout, \ 233 TASK_INTERRUPTIBLE, timeout, \
234 __ret = schedule_timeout(__ret)) 234 __ret = schedule_timeout(__ret))
235 235
236#define swait_event_interruptible_timeout(wq, condition, timeout) \ 236#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\
237({ \ 237({ \
238 long __ret = timeout; \ 238 long __ret = timeout; \
239 if (!___wait_cond_timeout(condition)) \ 239 if (!___wait_cond_timeout(condition)) \
@@ -246,7 +246,7 @@ do { \
246 (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) 246 (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
247 247
248/** 248/**
249 * swait_event_idle - wait without system load contribution 249 * swait_event_idle_exclusive - wait without system load contribution
250 * @wq: the waitqueue to wait on 250 * @wq: the waitqueue to wait on
251 * @condition: a C expression for the event to wait for 251 * @condition: a C expression for the event to wait for
252 * 252 *
@@ -257,7 +257,7 @@ do { \
257 * condition and doesn't want to contribute to system load. Signals are 257 * condition and doesn't want to contribute to system load. Signals are
258 * ignored. 258 * ignored.
259 */ 259 */
260#define swait_event_idle(wq, condition) \ 260#define swait_event_idle_exclusive(wq, condition) \
261do { \ 261do { \
262 if (condition) \ 262 if (condition) \
263 break; \ 263 break; \
@@ -270,7 +270,7 @@ do { \
270 __ret = schedule_timeout(__ret)) 270 __ret = schedule_timeout(__ret))
271 271
272/** 272/**
273 * swait_event_idle_timeout - wait up to timeout without load contribution 273 * swait_event_idle_timeout_exclusive - wait up to timeout without load contribution
274 * @wq: the waitqueue to wait on 274 * @wq: the waitqueue to wait on
275 * @condition: a C expression for the event to wait for 275 * @condition: a C expression for the event to wait for
276 * @timeout: timeout at which we'll give up in jiffies 276 * @timeout: timeout at which we'll give up in jiffies
@@ -288,7 +288,7 @@ do { \
288 * or the remaining jiffies (at least 1) if the @condition evaluated 288 * or the remaining jiffies (at least 1) if the @condition evaluated
289 * to %true before the @timeout elapsed. 289 * to %true before the @timeout elapsed.
290 */ 290 */
291#define swait_event_idle_timeout(wq, condition, timeout) \ 291#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \
292({ \ 292({ \
293 long __ret = timeout; \ 293 long __ret = timeout; \
294 if (!___wait_cond_timeout(condition)) \ 294 if (!___wait_cond_timeout(condition)) \
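For reference, the renamed swait API above pairs up as in the following minimal sketch, assuming a hypothetical single-waiter user; the queue head, flag and function names are illustrative and not taken from this series:

	#include <linux/swait.h>

	static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);	/* hypothetical queue */
	static bool demo_cond;

	/* Waiter side: there is only ever one sleeper, which is what swait is for. */
	static int demo_wait(void)
	{
		/* returns 0 once demo_cond is true, -ERESTARTSYS on signal */
		return swait_event_interruptible_exclusive(demo_wq, READ_ONCE(demo_cond));
	}

	/* Waker side: publish the condition, then wake the single sleeper. */
	static void demo_wake(void)
	{
		WRITE_ONCE(demo_cond, true);
		swake_up_one(&demo_wq);		/* was swake_up() before this series */
	}

The _exclusive/_one naming matches the semantics documented in the header comment: every swait waiter is an exclusive waiter, so a wake-up only ever wakes one task.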
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2f8f338e77cf..15be70aae8ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
1344 .startup.single = perf_event_init_cpu, 1344 .startup.single = perf_event_init_cpu,
1345 .teardown.single = perf_event_exit_cpu, 1345 .teardown.single = perf_event_exit_cpu,
1346 }, 1346 },
1347 [CPUHP_AP_WATCHDOG_ONLINE] = {
1348 .name = "lockup_detector:online",
1349 .startup.single = lockup_detector_online_cpu,
1350 .teardown.single = lockup_detector_offline_cpu,
1351 },
1347 [CPUHP_AP_WORKQUEUE_ONLINE] = { 1352 [CPUHP_AP_WORKQUEUE_ONLINE] = {
1348 .name = "workqueue:online", 1353 .name = "workqueue:online",
1349 .startup.single = workqueue_online_cpu, 1354 .startup.single = workqueue_online_cpu,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 486dedbd9af5..087d18d771b5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self)
190 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) 190 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
191 break; 191 break;
192 192
193 complete_all(&self->parked); 193 complete(&self->parked);
194 schedule(); 194 schedule();
195 } 195 }
196 __set_current_state(TASK_RUNNING); 196 __set_current_state(TASK_RUNNING);
@@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k)
471 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) 471 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
472 __kthread_bind(k, kthread->cpu, TASK_PARKED); 472 __kthread_bind(k, kthread->cpu, TASK_PARKED);
473 473
474 reinit_completion(&kthread->parked);
475 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 474 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
476 /* 475 /*
477 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. 476 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
@@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k)
499 if (WARN_ON(k->flags & PF_EXITING)) 498 if (WARN_ON(k->flags & PF_EXITING))
500 return -ENOSYS; 499 return -ENOSYS;
501 500
501 if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
502 return -EBUSY;
503
502 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 504 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
503 if (k != current) { 505 if (k != current) {
504 wake_up_process(k); 506 wake_up_process(k);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..70178f6ffdc4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -92,7 +92,7 @@ static void s2idle_enter(void)
92 /* Push all the CPUs into the idle loop. */ 92 /* Push all the CPUs into the idle loop. */
93 wake_up_all_idle_cpus(); 93 wake_up_all_idle_cpus();
94 /* Make the current CPU wait so it can enter the idle loop too. */ 94 /* Make the current CPU wait so it can enter the idle loop too. */
95 swait_event(s2idle_wait_head, 95 swait_event_exclusive(s2idle_wait_head,
96 s2idle_state == S2IDLE_STATE_WAKE); 96 s2idle_state == S2IDLE_STATE_WAKE);
97 97
98 cpuidle_pause(); 98 cpuidle_pause();
@@ -160,7 +160,7 @@ void s2idle_wake(void)
160 raw_spin_lock_irqsave(&s2idle_lock, flags); 160 raw_spin_lock_irqsave(&s2idle_lock, flags);
161 if (s2idle_state > S2IDLE_STATE_NONE) { 161 if (s2idle_state > S2IDLE_STATE_NONE) {
162 s2idle_state = S2IDLE_STATE_WAKE; 162 s2idle_state = S2IDLE_STATE_WAKE;
163 swake_up(&s2idle_wait_head); 163 swake_up_one(&s2idle_wait_head);
164 } 164 }
165 raw_spin_unlock_irqrestore(&s2idle_lock, flags); 165 raw_spin_unlock_irqrestore(&s2idle_lock, flags);
166} 166}
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 622792abe41a..04fc2ed71af8 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
110 110
111 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); 111 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
112 if (!newval && READ_ONCE(sp->srcu_gp_waiting)) 112 if (!newval && READ_ONCE(sp->srcu_gp_waiting))
113 swake_up(&sp->srcu_wq); 113 swake_up_one(&sp->srcu_wq);
114} 114}
115EXPORT_SYMBOL_GPL(__srcu_read_unlock); 115EXPORT_SYMBOL_GPL(__srcu_read_unlock);
116 116
@@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
140 idx = sp->srcu_idx; 140 idx = sp->srcu_idx;
141 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); 141 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
142 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ 142 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
143 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); 143 swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
144 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ 144 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
145 145
146 /* Invoke the callbacks we removed above. */ 146 /* Invoke the callbacks we removed above. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6930934e8b9f..0b760c1369f7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1701 !READ_ONCE(rsp->gp_flags) || 1701 !READ_ONCE(rsp->gp_flags) ||
1702 !rsp->gp_kthread) 1702 !rsp->gp_kthread)
1703 return; 1703 return;
1704 swake_up(&rsp->gp_wq); 1704 swake_up_one(&rsp->gp_wq);
1705} 1705}
1706 1706
1707/* 1707/*
@@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
2015} 2015}
2016 2016
2017/* 2017/*
2018 * Helper function for swait_event_idle() wakeup at force-quiescent-state 2018 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
2019 * time. 2019 * time.
2020 */ 2020 */
2021static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 2021static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
@@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2163 READ_ONCE(rsp->gp_seq), 2163 READ_ONCE(rsp->gp_seq),
2164 TPS("reqwait")); 2164 TPS("reqwait"));
2165 rsp->gp_state = RCU_GP_WAIT_GPS; 2165 rsp->gp_state = RCU_GP_WAIT_GPS;
2166 swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & 2166 swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
2167 RCU_GP_FLAG_INIT); 2167 RCU_GP_FLAG_INIT);
2168 rsp->gp_state = RCU_GP_DONE_GPS; 2168 rsp->gp_state = RCU_GP_DONE_GPS;
2169 /* Locking provides needed memory barrier. */ 2169 /* Locking provides needed memory barrier. */
@@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2191 READ_ONCE(rsp->gp_seq), 2191 READ_ONCE(rsp->gp_seq),
2192 TPS("fqswait")); 2192 TPS("fqswait"));
2193 rsp->gp_state = RCU_GP_WAIT_FQS; 2193 rsp->gp_state = RCU_GP_WAIT_FQS;
2194 ret = swait_event_idle_timeout(rsp->gp_wq, 2194 ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
2195 rcu_gp_fqs_check_wake(rsp, &gf), j); 2195 rcu_gp_fqs_check_wake(rsp, &gf), j);
2196 rsp->gp_state = RCU_GP_DOING_FQS; 2196 rsp->gp_state = RCU_GP_DOING_FQS;
2197 /* Locking provides needed memory barriers. */ 2197 /* Locking provides needed memory barriers. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index b3df3b770afb..0b2c2ad69629 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
212 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 212 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
213 if (wake) { 213 if (wake) {
214 smp_mb(); /* EGP done before wake_up(). */ 214 smp_mb(); /* EGP done before wake_up(). */
215 swake_up(&rsp->expedited_wq); 215 swake_up_one(&rsp->expedited_wq);
216 } 216 }
217 break; 217 break;
218 } 218 }
@@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
526 jiffies_start = jiffies; 526 jiffies_start = jiffies;
527 527
528 for (;;) { 528 for (;;) {
529 ret = swait_event_timeout( 529 ret = swait_event_timeout_exclusive(
530 rsp->expedited_wq, 530 rsp->expedited_wq,
531 sync_rcu_preempt_exp_done_unlocked(rnp_root), 531 sync_rcu_preempt_exp_done_unlocked(rnp_root),
532 jiffies_stall); 532 jiffies_stall);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c1b17f5b9361..a97c20ea9bce 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
1926 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1926 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
1927 del_timer(&rdp->nocb_timer); 1927 del_timer(&rdp->nocb_timer);
1928 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1928 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1929 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ 1929 smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
1930 swake_up(&rdp_leader->nocb_wq); 1930 swake_up_one(&rdp_leader->nocb_wq);
1931 } else { 1931 } else {
1932 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1932 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1933 } 1933 }
@@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2159 */ 2159 */
2160 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); 2160 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
2161 for (;;) { 2161 for (;;) {
2162 swait_event_interruptible( 2162 swait_event_interruptible_exclusive(
2163 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], 2163 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
2164 (d = rcu_seq_done(&rnp->gp_seq, c))); 2164 (d = rcu_seq_done(&rnp->gp_seq, c)));
2165 if (likely(d)) 2165 if (likely(d))
@@ -2188,7 +2188,7 @@ wait_again:
2188 /* Wait for callbacks to appear. */ 2188 /* Wait for callbacks to appear. */
2189 if (!rcu_nocb_poll) { 2189 if (!rcu_nocb_poll) {
2190 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); 2190 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
2191 swait_event_interruptible(my_rdp->nocb_wq, 2191 swait_event_interruptible_exclusive(my_rdp->nocb_wq,
2192 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2192 !READ_ONCE(my_rdp->nocb_leader_sleep));
2193 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); 2193 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
2194 my_rdp->nocb_leader_sleep = true; 2194 my_rdp->nocb_leader_sleep = true;
@@ -2253,7 +2253,7 @@ wait_again:
2253 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2253 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2254 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2254 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2255 /* List was empty, so wake up the follower. */ 2255 /* List was empty, so wake up the follower. */
2256 swake_up(&rdp->nocb_wq); 2256 swake_up_one(&rdp->nocb_wq);
2257 } 2257 }
2258 } 2258 }
2259 2259
@@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2270{ 2270{
2271 for (;;) { 2271 for (;;) {
2272 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); 2272 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
2273 swait_event_interruptible(rdp->nocb_wq, 2273 swait_event_interruptible_exclusive(rdp->nocb_wq,
2274 READ_ONCE(rdp->nocb_follower_head)); 2274 READ_ONCE(rdp->nocb_follower_head));
2275 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2275 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2276 /* ^^^ Ensure CB invocation follows _head test. */ 2276 /* ^^^ Ensure CB invocation follows _head test. */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index d9a02b318108..7fe183404c38 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
20obj-y += idle.o fair.o rt.o deadline.o 20obj-y += idle.o fair.o rt.o deadline.o
21obj-y += wait.o wait_bit.o swait.o completion.o 21obj-y += wait.o wait_bit.o swait.o completion.o
22 22
23obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o 23obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
24obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 24obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
25obj-$(CONFIG_SCHEDSTATS) += stats.o 25obj-$(CONFIG_SCHEDSTATS) += stats.o
26obj-$(CONFIG_SCHED_DEBUG) += debug.o 26obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..deafa9fe602b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -17,6 +17,8 @@
17#include "../workqueue_internal.h" 17#include "../workqueue_internal.h"
18#include "../smpboot.h" 18#include "../smpboot.h"
19 19
20#include "pelt.h"
21
20#define CREATE_TRACE_POINTS 22#define CREATE_TRACE_POINTS
21#include <trace/events/sched.h> 23#include <trace/events/sched.h>
22 24
@@ -45,14 +47,6 @@ const_debug unsigned int sysctl_sched_features =
45const_debug unsigned int sysctl_sched_nr_migrate = 32; 47const_debug unsigned int sysctl_sched_nr_migrate = 32;
46 48
47/* 49/*
48 * period over which we average the RT time consumption, measured
49 * in ms.
50 *
51 * default: 1s
52 */
53const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
54
55/*
56 * period over which we measure -rt task CPU usage in us. 50 * period over which we measure -rt task CPU usage in us.
57 * default: 1s 51 * default: 1s
58 */ 52 */
@@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
183 177
184 rq->clock_task += delta; 178 rq->clock_task += delta;
185 179
186#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 180#ifdef HAVE_SCHED_AVG_IRQ
187 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) 181 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
188 sched_rt_avg_update(rq, irq_delta + steal); 182 update_irq_load_avg(rq, irq_delta + steal);
189#endif 183#endif
190} 184}
191 185
@@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
649 return true; 643 return true;
650} 644}
651#endif /* CONFIG_NO_HZ_FULL */ 645#endif /* CONFIG_NO_HZ_FULL */
652
653void sched_avg_update(struct rq *rq)
654{
655 s64 period = sched_avg_period();
656
657 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
658 /*
659 * Inline assembly required to prevent the compiler
660 * optimising this loop into a divmod call.
661 * See __iter_div_u64_rem() for another example of this.
662 */
663 asm("" : "+rm" (rq->age_stamp));
664 rq->age_stamp += period;
665 rq->rt_avg /= 2;
666 }
667}
668
669#endif /* CONFIG_SMP */ 646#endif /* CONFIG_SMP */
670 647
671#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 648#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1199 __set_task_cpu(p, new_cpu); 1176 __set_task_cpu(p, new_cpu);
1200} 1177}
1201 1178
1179#ifdef CONFIG_NUMA_BALANCING
1202static void __migrate_swap_task(struct task_struct *p, int cpu) 1180static void __migrate_swap_task(struct task_struct *p, int cpu)
1203{ 1181{
1204 if (task_on_rq_queued(p)) { 1182 if (task_on_rq_queued(p)) {
@@ -1280,16 +1258,17 @@ unlock:
1280/* 1258/*
1281 * Cross migrate two tasks 1259 * Cross migrate two tasks
1282 */ 1260 */
1283int migrate_swap(struct task_struct *cur, struct task_struct *p) 1261int migrate_swap(struct task_struct *cur, struct task_struct *p,
1262 int target_cpu, int curr_cpu)
1284{ 1263{
1285 struct migration_swap_arg arg; 1264 struct migration_swap_arg arg;
1286 int ret = -EINVAL; 1265 int ret = -EINVAL;
1287 1266
1288 arg = (struct migration_swap_arg){ 1267 arg = (struct migration_swap_arg){
1289 .src_task = cur, 1268 .src_task = cur,
1290 .src_cpu = task_cpu(cur), 1269 .src_cpu = curr_cpu,
1291 .dst_task = p, 1270 .dst_task = p,
1292 .dst_cpu = task_cpu(p), 1271 .dst_cpu = target_cpu,
1293 }; 1272 };
1294 1273
1295 if (arg.src_cpu == arg.dst_cpu) 1274 if (arg.src_cpu == arg.dst_cpu)
@@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1314out: 1293out:
1315 return ret; 1294 return ret;
1316} 1295}
1296#endif /* CONFIG_NUMA_BALANCING */
1317 1297
1318/* 1298/*
1319 * wait_task_inactive - wait for a thread to unschedule. 1299 * wait_task_inactive - wait for a thread to unschedule.
@@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {}
2317int sched_fork(unsigned long clone_flags, struct task_struct *p) 2297int sched_fork(unsigned long clone_flags, struct task_struct *p)
2318{ 2298{
2319 unsigned long flags; 2299 unsigned long flags;
2320 int cpu = get_cpu();
2321 2300
2322 __sched_fork(clone_flags, p); 2301 __sched_fork(clone_flags, p);
2323 /* 2302 /*
@@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2353 p->sched_reset_on_fork = 0; 2332 p->sched_reset_on_fork = 0;
2354 } 2333 }
2355 2334
2356 if (dl_prio(p->prio)) { 2335 if (dl_prio(p->prio))
2357 put_cpu();
2358 return -EAGAIN; 2336 return -EAGAIN;
2359 } else if (rt_prio(p->prio)) { 2337 else if (rt_prio(p->prio))
2360 p->sched_class = &rt_sched_class; 2338 p->sched_class = &rt_sched_class;
2361 } else { 2339 else
2362 p->sched_class = &fair_sched_class; 2340 p->sched_class = &fair_sched_class;
2363 }
2364 2341
2365 init_entity_runnable_average(&p->se); 2342 init_entity_runnable_average(&p->se);
2366 2343
@@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2376 * We're setting the CPU for the first time, we don't migrate, 2353 * We're setting the CPU for the first time, we don't migrate,
2377 * so use __set_task_cpu(). 2354 * so use __set_task_cpu().
2378 */ 2355 */
2379 __set_task_cpu(p, cpu); 2356 __set_task_cpu(p, smp_processor_id());
2380 if (p->sched_class->task_fork) 2357 if (p->sched_class->task_fork)
2381 p->sched_class->task_fork(p); 2358 p->sched_class->task_fork(p);
2382 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2359 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2393 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2370 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2394 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2371 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2395#endif 2372#endif
2396
2397 put_cpu();
2398 return 0; 2373 return 0;
2399} 2374}
2400 2375
@@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq)
5714 } 5689 }
5715} 5690}
5716 5691
5717static void set_cpu_rq_start_time(unsigned int cpu)
5718{
5719 struct rq *rq = cpu_rq(cpu);
5720
5721 rq->age_stamp = sched_clock_cpu(cpu);
5722}
5723
5724/* 5692/*
5725 * used to mark begin/end of suspend/resume: 5693 * used to mark begin/end of suspend/resume:
5726 */ 5694 */
@@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
5838 5806
5839int sched_cpu_starting(unsigned int cpu) 5807int sched_cpu_starting(unsigned int cpu)
5840{ 5808{
5841 set_cpu_rq_start_time(cpu);
5842 sched_rq_cpu_starting(cpu); 5809 sched_rq_cpu_starting(cpu);
5843 sched_tick_start(cpu); 5810 sched_tick_start(cpu);
5844 return 0; 5811 return 0;
@@ -6106,7 +6073,6 @@ void __init sched_init(void)
6106 6073
6107#ifdef CONFIG_SMP 6074#ifdef CONFIG_SMP
6108 idle_thread_set_boot_cpu(); 6075 idle_thread_set_boot_cpu();
6109 set_cpu_rq_start_time(smp_processor_id());
6110#endif 6076#endif
6111 init_sched_fair_class(); 6077 init_sched_fair_class();
6112 6078
@@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
6785 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 6751 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
6786 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 6752 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
6787 6753
6754 if (schedstat_enabled() && tg != &root_task_group) {
6755 u64 ws = 0;
6756 int i;
6757
6758 for_each_possible_cpu(i)
6759 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
6760
6761 seq_printf(sf, "wait_sum %llu\n", ws);
6762 }
6763
6788 return 0; 6764 return 0;
6789} 6765}
6790#endif /* CONFIG_CFS_BANDWIDTH */ 6766#endif /* CONFIG_CFS_BANDWIDTH */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c907fde01eaa..3fffad3bc8a8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,9 +53,7 @@ struct sugov_cpu {
53 unsigned int iowait_boost_max; 53 unsigned int iowait_boost_max;
54 u64 last_update; 54 u64 last_update;
55 55
56 /* The fields below are only needed when sharing a policy: */ 56 unsigned long bw_dl;
57 unsigned long util_cfs;
58 unsigned long util_dl;
59 unsigned long max; 57 unsigned long max;
60 58
61 /* The field below is for single-CPU policies only: */ 59 /* The field below is for single-CPU policies only: */
@@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
179 return cpufreq_driver_resolve_freq(policy, freq); 177 return cpufreq_driver_resolve_freq(policy, freq);
180} 178}
181 179
182static void sugov_get_util(struct sugov_cpu *sg_cpu) 180/*
181 * This function computes an effective utilization for the given CPU, to be
182 * used for frequency selection given the linear relation: f = u * f_max.
183 *
184 * The scheduler tracks the following metrics:
185 *
186 * cpu_util_{cfs,rt,dl,irq}()
187 * cpu_bw_dl()
188 *
189 * Where the cfs,rt and dl util numbers are tracked with the same metric and
190 * synchronized windows and are thus directly comparable.
191 *
192 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
193 * which excludes things like IRQ and steal-time. These latter are then accrued
194 * in the irq utilization.
195 *
196 * The DL bandwidth number otoh is not a measured metric but a value computed
197 * based on the task model parameters and gives the minimal utilization
198 * required to meet deadlines.
199 */
200static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
183{ 201{
184 struct rq *rq = cpu_rq(sg_cpu->cpu); 202 struct rq *rq = cpu_rq(sg_cpu->cpu);
203 unsigned long util, irq, max;
185 204
186 sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 205 sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
187 sg_cpu->util_cfs = cpu_util_cfs(rq); 206 sg_cpu->bw_dl = cpu_bw_dl(rq);
188 sg_cpu->util_dl = cpu_util_dl(rq);
189}
190
191static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
192{
193 struct rq *rq = cpu_rq(sg_cpu->cpu);
194 207
195 if (rt_rq_is_runnable(&rq->rt)) 208 if (rt_rq_is_runnable(&rq->rt))
196 return sg_cpu->max; 209 return max;
210
211 /*
212 * Early check to see if IRQ/steal time saturates the CPU, can be
213 * because of inaccuracies in how we track these -- see
214 * update_irq_load_avg().
215 */
216 irq = cpu_util_irq(rq);
217 if (unlikely(irq >= max))
218 return max;
219
220 /*
221 * Because the time spend on RT/DL tasks is visible as 'lost' time to
222 * CFS tasks and we use the same metric to track the effective
223 * utilization (PELT windows are synchronized) we can directly add them
224 * to obtain the CPU's actual utilization.
225 */
226 util = cpu_util_cfs(rq);
227 util += cpu_util_rt(rq);
228
229 /*
230 * We do not make cpu_util_dl() a permanent part of this sum because we
231 * want to use cpu_bw_dl() later on, but we need to check if the
232 * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
233 * f_max when there is no idle time.
234 *
235 * NOTE: numerical errors or stop class might cause us to not quite hit
236 * saturation when we should -- something for later.
237 */
238 if ((util + cpu_util_dl(rq)) >= max)
239 return max;
240
241 /*
242 * There is still idle time; further improve the number by using the
243 * irq metric. Because IRQ/steal time is hidden from the task clock we
244 * need to scale the task numbers:
245 *
246 * 1 - irq
247 * U' = irq + ------- * U
248 * max
249 */
250 util = scale_irq_capacity(util, irq, max);
251 util += irq;
197 252
198 /* 253 /*
199 * Utilization required by DEADLINE must always be granted while, for 254 * Bandwidth required by DEADLINE must always be granted while, for
200 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to 255 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
201 * gracefully reduce the frequency when no tasks show up for longer 256 * to gracefully reduce the frequency when no tasks show up for longer
202 * periods of time. 257 * periods of time.
203 * 258 *
204 * Ideally we would like to set util_dl as min/guaranteed freq and 259 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
205 * util_cfs + util_dl as requested freq. However, cpufreq is not yet 260 * bw_dl as requested freq. However, cpufreq is not yet ready for such
206 * ready for such an interface. So, we only do the latter for now. 261 * an interface. So, we only do the latter for now.
207 */ 262 */
208 return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); 263 return min(max, util + sg_cpu->bw_dl);
209} 264}
210 265
211/** 266/**
@@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
360 */ 415 */
361static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) 416static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
362{ 417{
363 if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) 418 if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
364 sg_policy->need_freq_update = true; 419 sg_policy->need_freq_update = true;
365} 420}
366 421
@@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
383 438
384 busy = sugov_cpu_is_busy(sg_cpu); 439 busy = sugov_cpu_is_busy(sg_cpu);
385 440
386 sugov_get_util(sg_cpu); 441 util = sugov_get_util(sg_cpu);
387 max = sg_cpu->max; 442 max = sg_cpu->max;
388 util = sugov_aggregate_util(sg_cpu);
389 sugov_iowait_apply(sg_cpu, time, &util, &max); 443 sugov_iowait_apply(sg_cpu, time, &util, &max);
390 next_f = get_next_freq(sg_policy, util, max); 444 next_f = get_next_freq(sg_policy, util, max);
391 /* 445 /*
@@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
424 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); 478 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
425 unsigned long j_util, j_max; 479 unsigned long j_util, j_max;
426 480
427 sugov_get_util(j_sg_cpu); 481 j_util = sugov_get_util(j_sg_cpu);
428 j_max = j_sg_cpu->max; 482 j_max = j_sg_cpu->max;
429 j_util = sugov_aggregate_util(j_sg_cpu);
430 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); 483 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
431 484
432 if (j_util * max > j_max * util) { 485 if (j_util * max > j_max * util) {
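The comment block added to sugov_get_util() above states the IRQ scaling rule, U' = irq + U * (1 - irq/max) in normalized form. Below is a standalone arithmetic sketch of that formula with made-up numbers, written as plain userspace C purely for illustration; it is not the kernel's scale_irq_capacity() code itself:

	#include <stdio.h>

	/* U' = irq + U * (max - irq) / max: the CFS+RT utilization is scaled by the
	 * fraction of time not consumed by IRQ/steal, then the IRQ utilization is added. */
	static unsigned long scale_for_irq(unsigned long util, unsigned long irq,
					   unsigned long max)
	{
		return irq + util * (max - irq) / max;
	}

	int main(void)
	{
		unsigned long max = 1024;	/* CPU capacity */
		unsigned long util = 512;	/* CFS + RT utilization */
		unsigned long irq = 128;	/* IRQ/steal utilization */

		/* 128 + 512 * 896 / 1024 = 128 + 448 = 576 */
		printf("effective utilization = %lu\n", scale_for_irq(util, irq, max));
		return 0;
	}

The governor then applies its f = u * f_max relation to this effective utilization, capped at max as the code above does when no idle time remains.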
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5fbdde6afa9..997ea7b839fa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,7 @@
16 * Fabio Checconi <fchecconi@gmail.com> 16 * Fabio Checconi <fchecconi@gmail.com>
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19#include "pelt.h"
19 20
20struct dl_bandwidth def_dl_bandwidth; 21struct dl_bandwidth def_dl_bandwidth;
21 22
@@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq)
1179 curr->se.exec_start = now; 1180 curr->se.exec_start = now;
1180 cgroup_account_cputime(curr, delta_exec); 1181 cgroup_account_cputime(curr, delta_exec);
1181 1182
1182 sched_rt_avg_update(rq, delta_exec);
1183
1184 if (dl_entity_is_special(dl_se)) 1183 if (dl_entity_is_special(dl_se))
1185 return; 1184 return;
1186 1185
@@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1761 1760
1762 deadline_queue_push_tasks(rq); 1761 deadline_queue_push_tasks(rq);
1763 1762
1763 if (rq->curr->sched_class != &dl_sched_class)
1764 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
1765
1764 return p; 1766 return p;
1765} 1767}
1766 1768
@@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1768{ 1770{
1769 update_curr_dl(rq); 1771 update_curr_dl(rq);
1770 1772
1773 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
1771 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) 1774 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1772 enqueue_pushable_dl_task(rq, p); 1775 enqueue_pushable_dl_task(rq, p);
1773} 1776}
@@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1784{ 1787{
1785 update_curr_dl(rq); 1788 update_curr_dl(rq);
1786 1789
1790 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
1787 /* 1791 /*
1788 * Even when we have runtime, update_curr_dl() might have resulted in us 1792 * Even when we have runtime, update_curr_dl() might have resulted in us
1789 * not being the leftmost task anymore. In that case NEED_RESCHED will 1793 * not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e593b4118578..870d4f3da285 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp)
111 cmp += 3; 111 cmp += 3;
112 } 112 }
113 113
114 for (i = 0; i < __SCHED_FEAT_NR; i++) { 114 i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
115 if (strcmp(cmp, sched_feat_names[i]) == 0) { 115 if (i < 0)
116 if (neg) { 116 return i;
117 sysctl_sched_features &= ~(1UL << i); 117
118 sched_feat_disable(i); 118 if (neg) {
119 } else { 119 sysctl_sched_features &= ~(1UL << i);
120 sysctl_sched_features |= (1UL << i); 120 sched_feat_disable(i);
121 sched_feat_enable(i); 121 } else {
122 } 122 sysctl_sched_features |= (1UL << i);
123 break; 123 sched_feat_enable(i);
124 }
125 } 124 }
126 125
127 return i; 126 return 0;
128} 127}
129 128
130static ssize_t 129static ssize_t
@@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
133{ 132{
134 char buf[64]; 133 char buf[64];
135 char *cmp; 134 char *cmp;
136 int i; 135 int ret;
137 struct inode *inode; 136 struct inode *inode;
138 137
139 if (cnt > 63) 138 if (cnt > 63)
@@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
148 /* Ensure the static_key remains in a consistent state */ 147 /* Ensure the static_key remains in a consistent state */
149 inode = file_inode(filp); 148 inode = file_inode(filp);
150 inode_lock(inode); 149 inode_lock(inode);
151 i = sched_feat_set(cmp); 150 ret = sched_feat_set(cmp);
152 inode_unlock(inode); 151 inode_unlock(inode);
153 if (i == __SCHED_FEAT_NR) 152 if (ret < 0)
154 return -EINVAL; 153 return ret;
155 154
156 *ppos += cnt; 155 *ppos += cnt;
157 156
@@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
843 unsigned long tpf, unsigned long gsf, unsigned long gpf) 842 unsigned long tpf, unsigned long gsf, unsigned long gpf)
844{ 843{
845 SEQ_printf(m, "numa_faults node=%d ", node); 844 SEQ_printf(m, "numa_faults node=%d ", node);
846 SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); 845 SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
847 SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); 846 SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
848} 847}
849#endif 848#endif
850 849
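The sched_feat_set() rework above replaces an open-coded strcmp() loop with a match_string() lookup that returns the index of the matching name or a negative error. A minimal userspace analogue of that contract, using illustrative feature names (the real table lives in kernel/sched/features.h and is not shown in this diff):

	#include <stdio.h>
	#include <string.h>

	/* Same contract as the lookup used above: index on match, -22 (-EINVAL) otherwise. */
	static int match_string_like(const char * const *array, size_t n, const char *string)
	{
		for (size_t i = 0; i < n; i++) {
			if (strcmp(array[i], string) == 0)
				return (int)i;
		}
		return -22;
	}

	int main(void)
	{
		static const char * const names[] = { "FEAT_A", "FEAT_B", "FEAT_C" };

		printf("%d\n", match_string_like(names, 3, "FEAT_B"));	/* 1 */
		printf("%d\n", match_string_like(names, 3, "FEAT_X"));	/* -22 */
		return 0;
	}

With that contract, sched_feat_write() can simply propagate the negative return value instead of comparing the loop counter against __SCHED_FEAT_NR, which is exactly the simplification in the hunk above.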
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2f0a0be4d344..309c93fcc604 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
255 return cfs_rq->rq; 255 return cfs_rq->rq;
256} 256}
257 257
258/* An entity is a task if it doesn't "own" a runqueue */
259#define entity_is_task(se) (!se->my_q)
260
261static inline struct task_struct *task_of(struct sched_entity *se) 258static inline struct task_struct *task_of(struct sched_entity *se)
262{ 259{
263 SCHED_WARN_ON(!entity_is_task(se)); 260 SCHED_WARN_ON(!entity_is_task(se));
@@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
419 return container_of(cfs_rq, struct rq, cfs); 416 return container_of(cfs_rq, struct rq, cfs);
420} 417}
421 418
422#define entity_is_task(se) 1
423 419
424#define for_each_sched_entity(se) \ 420#define for_each_sched_entity(se) \
425 for (; se; se = NULL) 421 for (; se; se = NULL)
@@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
692} 688}
693 689
694#ifdef CONFIG_SMP 690#ifdef CONFIG_SMP
695 691#include "pelt.h"
696#include "sched-pelt.h" 692#include "sched-pelt.h"
697 693
698static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 694static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
@@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
735 * To solve this problem, we also cap the util_avg of successive tasks to 731 * To solve this problem, we also cap the util_avg of successive tasks to
736 * only 1/2 of the left utilization budget: 732 * only 1/2 of the left utilization budget:
737 * 733 *
738 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n 734 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
739 * 735 *
740 * where n denotes the nth task. 736 * where n denotes the nth task and cpu_scale the CPU capacity.
741 * 737 *
742 * For example, a simplest series from the beginning would be like: 738 * For example, for a CPU with 1024 of capacity, a simplest series from
739 * the beginning would be like:
743 * 740 *
744 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... 741 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
745 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... 742 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
@@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
751{ 748{
752 struct cfs_rq *cfs_rq = cfs_rq_of(se); 749 struct cfs_rq *cfs_rq = cfs_rq_of(se);
753 struct sched_avg *sa = &se->avg; 750 struct sched_avg *sa = &se->avg;
754 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 751 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
752 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
755 753
756 if (cap > 0) { 754 if (cap > 0) {
757 if (cfs_rq->avg.util_avg != 0) { 755 if (cfs_rq->avg.util_avg != 0) {
@@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1314 * of each group. Skip other nodes. 1312 * of each group. Skip other nodes.
1315 */ 1313 */
1316 if (sched_numa_topology_type == NUMA_BACKPLANE && 1314 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1317 dist > maxdist) 1315 dist >= maxdist)
1318 continue; 1316 continue;
1319 1317
1320 /* Add up the faults from nearby nodes. */ 1318 /* Add up the faults from nearby nodes. */
@@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
1452 1450
1453/* Cached statistics for all CPUs within a node */ 1451/* Cached statistics for all CPUs within a node */
1454struct numa_stats { 1452struct numa_stats {
1455 unsigned long nr_running;
1456 unsigned long load; 1453 unsigned long load;
1457 1454
1458 /* Total compute capacity of CPUs on a node */ 1455 /* Total compute capacity of CPUs on a node */
1459 unsigned long compute_capacity; 1456 unsigned long compute_capacity;
1460 1457
1461 /* Approximate capacity in terms of runnable tasks on a node */ 1458 unsigned int nr_running;
1462 unsigned long task_capacity;
1463 int has_free_capacity;
1464}; 1459};
1465 1460
1466/* 1461/*
@@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1487 * the @ns structure is NULL'ed and task_numa_compare() will 1482 * the @ns structure is NULL'ed and task_numa_compare() will
1488 * not find this node attractive. 1483 * not find this node attractive.
1489 * 1484 *
1490 * We'll either bail at !has_free_capacity, or we'll detect a huge 1485 * We'll detect a huge imbalance and bail there.
1491 * imbalance and bail there.
1492 */ 1486 */
1493 if (!cpus) 1487 if (!cpus)
1494 return; 1488 return;
@@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1497 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); 1491 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1498 capacity = cpus / smt; /* cores */ 1492 capacity = cpus / smt; /* cores */
1499 1493
1500 ns->task_capacity = min_t(unsigned, capacity, 1494 capacity = min_t(unsigned, capacity,
1501 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); 1495 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1502 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1503} 1496}
1504 1497
1505struct task_numa_env { 1498struct task_numa_env {
@@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1548 src_capacity = env->src_stats.compute_capacity; 1541 src_capacity = env->src_stats.compute_capacity;
1549 dst_capacity = env->dst_stats.compute_capacity; 1542 dst_capacity = env->dst_stats.compute_capacity;
1550 1543
1551 /* We care about the slope of the imbalance, not the direction. */ 1544 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1552 if (dst_load < src_load)
1553 swap(dst_load, src_load);
1554 1545
1555 /* Is the difference below the threshold? */
1556 imb = dst_load * src_capacity * 100 -
1557 src_load * dst_capacity * env->imbalance_pct;
1558 if (imb <= 0)
1559 return false;
1560
1561 /*
1562 * The imbalance is above the allowed threshold.
1563 * Compare it with the old imbalance.
1564 */
1565 orig_src_load = env->src_stats.load; 1546 orig_src_load = env->src_stats.load;
1566 orig_dst_load = env->dst_stats.load; 1547 orig_dst_load = env->dst_stats.load;
1567 1548
1568 if (orig_dst_load < orig_src_load) 1549 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1569 swap(orig_dst_load, orig_src_load);
1570
1571 old_imb = orig_dst_load * src_capacity * 100 -
1572 orig_src_load * dst_capacity * env->imbalance_pct;
1573 1550
1574 /* Would this change make things worse? */ 1551 /* Would this change make things worse? */
1575 return (imb > old_imb); 1552 return (imb > old_imb);
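
A standalone sketch of the simplified check, with made-up loads and capacities; comparing |dst_load*src_capacity - src_load*dst_capacity| before and after a move is a division-free way of comparing capacity-normalized load gaps:

#include <stdio.h>
#include <stdlib.h>

/* Division-free imbalance metric: |dst_load/dst_cap - src_load/src_cap|
 * scaled by src_cap*dst_cap. A move is rejected only if it makes this
 * metric worse than it already was. */
static long imbalance(long src_load, long dst_load, long src_cap, long dst_cap)
{
	return labs(dst_load * src_cap - src_load * dst_cap);
}

int main(void)
{
	long src_cap = 2048, dst_cap = 1024;	/* assumed node capacities */
	long src_load = 900, dst_load = 300;	/* current node loads */
	long task_load = 400;			/* load the move would shift */

	long old_imb = imbalance(src_load, dst_load, src_cap, dst_cap);
	long new_imb = imbalance(src_load - task_load, dst_load + task_load,
				 src_cap, dst_cap);

	printf("old=%ld new=%ld -> %s\n", old_imb, new_imb,
	       new_imb > old_imb ? "too imbalanced, reject" : "allowed");
	return 0;
}
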
@@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1582 * be exchanged with the source task 1559 * be exchanged with the source task
1583 */ 1560 */
1584static void task_numa_compare(struct task_numa_env *env, 1561static void task_numa_compare(struct task_numa_env *env,
1585 long taskimp, long groupimp) 1562 long taskimp, long groupimp, bool maymove)
1586{ 1563{
1587 struct rq *src_rq = cpu_rq(env->src_cpu);
1588 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1564 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1589 struct task_struct *cur; 1565 struct task_struct *cur;
1590 long src_load, dst_load; 1566 long src_load, dst_load;
@@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,
1605 if (cur == env->p) 1581 if (cur == env->p)
1606 goto unlock; 1582 goto unlock;
1607 1583
1584 if (!cur) {
1585 if (maymove || imp > env->best_imp)
1586 goto assign;
1587 else
1588 goto unlock;
1589 }
1590
1608 /* 1591 /*
1609 * "imp" is the fault differential for the source task between the 1592 * "imp" is the fault differential for the source task between the
1610 * source and destination node. Calculate the total differential for 1593 * source and destination node. Calculate the total differential for
1611 * the source task and potential destination task. The more negative 1594 * the source task and potential destination task. The more negative
1612 * the value is, the more rmeote accesses that would be expected to 1595 * the value is, the more remote accesses that would be expected to
1613 * be incurred if the tasks were swapped. 1596 * be incurred if the tasks were swapped.
1614 */ 1597 */
1615 if (cur) { 1598 /* Skip this swap candidate if cannot move to the source cpu */
1616 /* Skip this swap candidate if cannot move to the source CPU: */ 1599 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1617 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1600 goto unlock;
1618 goto unlock;
1619 1601
1602 /*
1603 * If dst and source tasks are in the same NUMA group, or not
1604 * in any group then look only at task weights.
1605 */
1606 if (cur->numa_group == env->p->numa_group) {
1607 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1608 task_weight(cur, env->dst_nid, dist);
1620 /* 1609 /*
1621 * If dst and source tasks are in the same NUMA group, or not 1610 * Add some hysteresis to prevent swapping the
1622 * in any group then look only at task weights. 1611 * tasks within a group over tiny differences.
1623 */ 1612 */
1624 if (cur->numa_group == env->p->numa_group) { 1613 if (cur->numa_group)
1625 imp = taskimp + task_weight(cur, env->src_nid, dist) - 1614 imp -= imp / 16;
1626 task_weight(cur, env->dst_nid, dist); 1615 } else {
1627 /* 1616 /*
1628 * Add some hysteresis to prevent swapping the 1617 * Compare the group weights. If a task is all by itself
1629 * tasks within a group over tiny differences. 1618 * (not part of a group), use the task weight instead.
1630 */ 1619 */
1631 if (cur->numa_group) 1620 if (cur->numa_group && env->p->numa_group)
1632 imp -= imp/16; 1621 imp += group_weight(cur, env->src_nid, dist) -
1633 } else { 1622 group_weight(cur, env->dst_nid, dist);
1634 /* 1623 else
1635 * Compare the group weights. If a task is all by 1624 imp += task_weight(cur, env->src_nid, dist) -
1636 * itself (not part of a group), use the task weight 1625 task_weight(cur, env->dst_nid, dist);
1637 * instead.
1638 */
1639 if (cur->numa_group)
1640 imp += group_weight(cur, env->src_nid, dist) -
1641 group_weight(cur, env->dst_nid, dist);
1642 else
1643 imp += task_weight(cur, env->src_nid, dist) -
1644 task_weight(cur, env->dst_nid, dist);
1645 }
1646 } 1626 }
1647 1627
1648 if (imp <= env->best_imp && moveimp <= env->best_imp) 1628 if (imp <= env->best_imp)
1649 goto unlock; 1629 goto unlock;
1650 1630
1651 if (!cur) { 1631 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1652 /* Is there capacity at our destination? */ 1632 imp = moveimp - 1;
1653 if (env->src_stats.nr_running <= env->src_stats.task_capacity && 1633 cur = NULL;
1654 !env->dst_stats.has_free_capacity)
1655 goto unlock;
1656
1657 goto balance;
1658 }
1659
1660 /* Balance doesn't matter much if we're running a task per CPU: */
1661 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1662 dst_rq->nr_running == 1)
1663 goto assign; 1634 goto assign;
1635 }
1664 1636
1665 /* 1637 /*
1666 * In the overloaded case, try and keep the load balanced. 1638 * In the overloaded case, try and keep the load balanced.
1667 */ 1639 */
1668balance: 1640 load = task_h_load(env->p) - task_h_load(cur);
1669 load = task_h_load(env->p); 1641 if (!load)
1642 goto assign;
1643
1670 dst_load = env->dst_stats.load + load; 1644 dst_load = env->dst_stats.load + load;
1671 src_load = env->src_stats.load - load; 1645 src_load = env->src_stats.load - load;
1672 1646
1673 if (moveimp > imp && moveimp > env->best_imp) {
1674 /*
1675 * If the improvement from just moving env->p direction is
1676 * better than swapping tasks around, check if a move is
1677 * possible. Store a slightly smaller score than moveimp,
1678 * so an actually idle CPU will win.
1679 */
1680 if (!load_too_imbalanced(src_load, dst_load, env)) {
1681 imp = moveimp - 1;
1682 cur = NULL;
1683 goto assign;
1684 }
1685 }
1686
1687 if (imp <= env->best_imp)
1688 goto unlock;
1689
1690 if (cur) {
1691 load = task_h_load(cur);
1692 dst_load -= load;
1693 src_load += load;
1694 }
1695
1696 if (load_too_imbalanced(src_load, dst_load, env)) 1647 if (load_too_imbalanced(src_load, dst_load, env))
1697 goto unlock; 1648 goto unlock;
1698 1649
1650assign:
1699 /* 1651 /*
1700 * One idle CPU per node is evaluated for a task numa move. 1652 * One idle CPU per node is evaluated for a task numa move.
1701 * Call select_idle_sibling to maybe find a better one. 1653 * Call select_idle_sibling to maybe find a better one.
@@ -1711,7 +1663,6 @@ balance:
1711 local_irq_enable(); 1663 local_irq_enable();
1712 } 1664 }
1713 1665
1714assign:
1715 task_numa_assign(env, cur, imp); 1666 task_numa_assign(env, cur, imp);
1716unlock: 1667unlock:
1717 rcu_read_unlock(); 1668 rcu_read_unlock();
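
A minimal sketch of the in-group hysteresis above ("imp -= imp / 16"), with invented improvement scores; the roughly 6% discount keeps near-equal candidates from displacing the current best swap:

#include <stdio.h>

/* Tasks in the same numa_group get their improvement discounted by 1/16,
 * so tiny fault differences do not cause endless swapping. */
int main(void)
{
	long imp = 40;		/* assumed raw fault-based improvement */
	long best_imp = 38;	/* improvement of the current best candidate */

	imp -= imp / 16;	/* 40 - 2 = 38: no longer beats best_imp */
	printf("discounted imp=%ld best_imp=%ld -> %s\n", imp, best_imp,
	       imp > best_imp ? "replace best" : "keep current best");
	return 0;
}
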
@@ -1720,43 +1671,30 @@ unlock:
1720static void task_numa_find_cpu(struct task_numa_env *env, 1671static void task_numa_find_cpu(struct task_numa_env *env,
1721 long taskimp, long groupimp) 1672 long taskimp, long groupimp)
1722{ 1673{
1674 long src_load, dst_load, load;
1675 bool maymove = false;
1723 int cpu; 1676 int cpu;
1724 1677
1678 load = task_h_load(env->p);
1679 dst_load = env->dst_stats.load + load;
1680 src_load = env->src_stats.load - load;
1681
1682 /*
1683 * If the improvement from just moving env->p direction is better
1684 * than swapping tasks around, check if a move is possible.
1685 */
1686 maymove = !load_too_imbalanced(src_load, dst_load, env);
1687
1725 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1688 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1726 /* Skip this CPU if the source task cannot migrate */ 1689 /* Skip this CPU if the source task cannot migrate */
1727 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1690 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1728 continue; 1691 continue;
1729 1692
1730 env->dst_cpu = cpu; 1693 env->dst_cpu = cpu;
1731 task_numa_compare(env, taskimp, groupimp); 1694 task_numa_compare(env, taskimp, groupimp, maymove);
1732 } 1695 }
1733} 1696}
1734 1697
1735/* Only move tasks to a NUMA node less busy than the current node. */
1736static bool numa_has_capacity(struct task_numa_env *env)
1737{
1738 struct numa_stats *src = &env->src_stats;
1739 struct numa_stats *dst = &env->dst_stats;
1740
1741 if (src->has_free_capacity && !dst->has_free_capacity)
1742 return false;
1743
1744 /*
1745 * Only consider a task move if the source has a higher load
1746 * than the destination, corrected for CPU capacity on each node.
1747 *
1748 * src->load dst->load
1749 * --------------------- vs ---------------------
1750 * src->compute_capacity dst->compute_capacity
1751 */
1752 if (src->load * dst->compute_capacity * env->imbalance_pct >
1753
1754 dst->load * src->compute_capacity * 100)
1755 return true;
1756
1757 return false;
1758}
1759
1760static int task_numa_migrate(struct task_struct *p) 1698static int task_numa_migrate(struct task_struct *p)
1761{ 1699{
1762 struct task_numa_env env = { 1700 struct task_numa_env env = {
@@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
1797 * elsewhere, so there is no point in (re)trying. 1735 * elsewhere, so there is no point in (re)trying.
1798 */ 1736 */
1799 if (unlikely(!sd)) { 1737 if (unlikely(!sd)) {
1800 p->numa_preferred_nid = task_node(p); 1738 sched_setnuma(p, task_node(p));
1801 return -EINVAL; 1739 return -EINVAL;
1802 } 1740 }
1803 1741
@@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)
1811 update_numa_stats(&env.dst_stats, env.dst_nid); 1749 update_numa_stats(&env.dst_stats, env.dst_nid);
1812 1750
1813 /* Try to find a spot on the preferred nid. */ 1751 /* Try to find a spot on the preferred nid. */
1814 if (numa_has_capacity(&env)) 1752 task_numa_find_cpu(&env, taskimp, groupimp);
1815 task_numa_find_cpu(&env, taskimp, groupimp);
1816 1753
1817 /* 1754 /*
1818 * Look at other nodes in these cases: 1755 * Look at other nodes in these cases:
@@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)
1842 env.dist = dist; 1779 env.dist = dist;
1843 env.dst_nid = nid; 1780 env.dst_nid = nid;
1844 update_numa_stats(&env.dst_stats, env.dst_nid); 1781 update_numa_stats(&env.dst_stats, env.dst_nid);
1845 if (numa_has_capacity(&env)) 1782 task_numa_find_cpu(&env, taskimp, groupimp);
1846 task_numa_find_cpu(&env, taskimp, groupimp);
1847 } 1783 }
1848 } 1784 }
1849 1785
@@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
1856 * trying for a better one later. Do not set the preferred node here. 1792 * trying for a better one later. Do not set the preferred node here.
1857 */ 1793 */
1858 if (p->numa_group) { 1794 if (p->numa_group) {
1859 struct numa_group *ng = p->numa_group;
1860
1861 if (env.best_cpu == -1) 1795 if (env.best_cpu == -1)
1862 nid = env.src_nid; 1796 nid = env.src_nid;
1863 else 1797 else
1864 nid = env.dst_nid; 1798 nid = cpu_to_node(env.best_cpu);
1865 1799
1866 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) 1800 if (nid != p->numa_preferred_nid)
1867 sched_setnuma(p, env.dst_nid); 1801 sched_setnuma(p, nid);
1868 } 1802 }
1869 1803
1870 /* No better CPU than the current one was found. */ 1804 /* No better CPU than the current one was found. */
@@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
1884 return ret; 1818 return ret;
1885 } 1819 }
1886 1820
1887 ret = migrate_swap(p, env.best_task); 1821 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1822
1888 if (ret != 0) 1823 if (ret != 0)
1889 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); 1824 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1890 put_task_struct(env.best_task); 1825 put_task_struct(env.best_task);
@@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
2144 2079
2145static void task_numa_placement(struct task_struct *p) 2080static void task_numa_placement(struct task_struct *p)
2146{ 2081{
2147 int seq, nid, max_nid = -1, max_group_nid = -1; 2082 int seq, nid, max_nid = -1;
2148 unsigned long max_faults = 0, max_group_faults = 0; 2083 unsigned long max_faults = 0;
2149 unsigned long fault_types[2] = { 0, 0 }; 2084 unsigned long fault_types[2] = { 0, 0 };
2150 unsigned long total_faults; 2085 unsigned long total_faults;
2151 u64 runtime, period; 2086 u64 runtime, period;
@@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
2224 } 2159 }
2225 } 2160 }
2226 2161
2227 if (faults > max_faults) { 2162 if (!p->numa_group) {
2228 max_faults = faults; 2163 if (faults > max_faults) {
2164 max_faults = faults;
2165 max_nid = nid;
2166 }
2167 } else if (group_faults > max_faults) {
2168 max_faults = group_faults;
2229 max_nid = nid; 2169 max_nid = nid;
2230 } 2170 }
2231
2232 if (group_faults > max_group_faults) {
2233 max_group_faults = group_faults;
2234 max_group_nid = nid;
2235 }
2236 } 2171 }
2237 2172
2238 update_task_scan_period(p, fault_types[0], fault_types[1]);
2239
2240 if (p->numa_group) { 2173 if (p->numa_group) {
2241 numa_group_count_active_nodes(p->numa_group); 2174 numa_group_count_active_nodes(p->numa_group);
2242 spin_unlock_irq(group_lock); 2175 spin_unlock_irq(group_lock);
2243 max_nid = preferred_group_nid(p, max_group_nid); 2176 max_nid = preferred_group_nid(p, max_nid);
2244 } 2177 }
2245 2178
2246 if (max_faults) { 2179 if (max_faults) {
2247 /* Set the new preferred node */ 2180 /* Set the new preferred node */
2248 if (max_nid != p->numa_preferred_nid) 2181 if (max_nid != p->numa_preferred_nid)
2249 sched_setnuma(p, max_nid); 2182 sched_setnuma(p, max_nid);
2250
2251 if (task_node(p) != p->numa_preferred_nid)
2252 numa_migrate_preferred(p);
2253 } 2183 }
2184
2185 update_task_scan_period(p, fault_types[0], fault_types[1]);
2254} 2186}
2255 2187
2256static inline int get_numa_group(struct numa_group *grp) 2188static inline int get_numa_group(struct numa_group *grp)
@@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2450 numa_is_active_node(mem_node, ng)) 2382 numa_is_active_node(mem_node, ng))
2451 local = 1; 2383 local = 1;
2452 2384
2453 task_numa_placement(p);
2454
2455 /* 2385 /*
2456 * Retry task to preferred node migration periodically, in case it 2386 * Retry task to preferred node migration periodically, in case it
2457 * case it previously failed, or the scheduler moved us. 2387 * case it previously failed, or the scheduler moved us.
2458 */ 2388 */
2459 if (time_after(jiffies, p->numa_migrate_retry)) 2389 if (time_after(jiffies, p->numa_migrate_retry)) {
2390 task_numa_placement(p);
2460 numa_migrate_preferred(p); 2391 numa_migrate_preferred(p);
2392 }
2461 2393
2462 if (migrated) 2394 if (migrated)
2463 p->numa_pages_migrated += pages; 2395 p->numa_pages_migrated += pages;
@@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2749} while (0) 2681} while (0)
2750 2682
2751#ifdef CONFIG_SMP 2683#ifdef CONFIG_SMP
2752/*
2753 * XXX we want to get rid of these helpers and use the full load resolution.
2754 */
2755static inline long se_weight(struct sched_entity *se)
2756{
2757 return scale_load_down(se->load.weight);
2758}
2759
2760static inline long se_runnable(struct sched_entity *se)
2761{
2762 return scale_load_down(se->runnable_weight);
2763}
2764
2765static inline void 2684static inline void
2766enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2685enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2767{ 2686{
@@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3062} 2981}
3063 2982
3064#ifdef CONFIG_SMP 2983#ifdef CONFIG_SMP
3065/*
3066 * Approximate:
3067 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
3068 */
3069static u64 decay_load(u64 val, u64 n)
3070{
3071 unsigned int local_n;
3072
3073 if (unlikely(n > LOAD_AVG_PERIOD * 63))
3074 return 0;
3075
3076 /* after bounds checking we can collapse to 32-bit */
3077 local_n = n;
3078
3079 /*
3080 * As y^PERIOD = 1/2, we can combine
3081 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
3082 * With a look-up table which covers y^n (n<PERIOD)
3083 *
3084 * To achieve constant time decay_load.
3085 */
3086 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
3087 val >>= local_n / LOAD_AVG_PERIOD;
3088 local_n %= LOAD_AVG_PERIOD;
3089 }
3090
3091 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
3092 return val;
3093}
3094
3095static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
3096{
3097 u32 c1, c2, c3 = d3; /* y^0 == 1 */
3098
3099 /*
3100 * c1 = d1 y^p
3101 */
3102 c1 = decay_load((u64)d1, periods);
3103
3104 /*
3105 * p-1
3106 * c2 = 1024 \Sum y^n
3107 * n=1
3108 *
3109 * inf inf
3110 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
3111 * n=0 n=p
3112 */
3113 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
3114
3115 return c1 + c2 + c3;
3116}
3117
3118/*
3119 * Accumulate the three separate parts of the sum; d1 the remainder
3120 * of the last (incomplete) period, d2 the span of full periods and d3
3121 * the remainder of the (incomplete) current period.
3122 *
3123 * d1 d2 d3
3124 * ^ ^ ^
3125 * | | |
3126 * |<->|<----------------->|<--->|
3127 * ... |---x---|------| ... |------|-----x (now)
3128 *
3129 * p-1
3130 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
3131 * n=1
3132 *
3133 * = u y^p + (Step 1)
3134 *
3135 * p-1
3136 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
3137 * n=1
3138 */
3139static __always_inline u32
3140accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
3141 unsigned long load, unsigned long runnable, int running)
3142{
3143 unsigned long scale_freq, scale_cpu;
3144 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
3145 u64 periods;
3146
3147 scale_freq = arch_scale_freq_capacity(cpu);
3148 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3149
3150 delta += sa->period_contrib;
3151 periods = delta / 1024; /* A period is 1024us (~1ms) */
3152
3153 /*
3154 * Step 1: decay old *_sum if we crossed period boundaries.
3155 */
3156 if (periods) {
3157 sa->load_sum = decay_load(sa->load_sum, periods);
3158 sa->runnable_load_sum =
3159 decay_load(sa->runnable_load_sum, periods);
3160 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
3161
3162 /*
3163 * Step 2
3164 */
3165 delta %= 1024;
3166 contrib = __accumulate_pelt_segments(periods,
3167 1024 - sa->period_contrib, delta);
3168 }
3169 sa->period_contrib = delta;
3170
3171 contrib = cap_scale(contrib, scale_freq);
3172 if (load)
3173 sa->load_sum += load * contrib;
3174 if (runnable)
3175 sa->runnable_load_sum += runnable * contrib;
3176 if (running)
3177 sa->util_sum += contrib * scale_cpu;
3178
3179 return periods;
3180}
3181
3182/*
3183 * We can represent the historical contribution to runnable average as the
3184 * coefficients of a geometric series. To do this we sub-divide our runnable
3185 * history into segments of approximately 1ms (1024us); label the segment that
3186 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3187 *
3188 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3189 * p0 p1 p2
3190 * (now) (~1ms ago) (~2ms ago)
3191 *
3192 * Let u_i denote the fraction of p_i that the entity was runnable.
3193 *
3194 * We then designate the fractions u_i as our co-efficients, yielding the
3195 * following representation of historical load:
3196 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3197 *
3198 * We choose y based on the with of a reasonably scheduling period, fixing:
3199 * y^32 = 0.5
3200 *
3201 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3202 * approximately half as much as the contribution to load within the last ms
3203 * (u_0).
3204 *
3205 * When a period "rolls over" and we have new u_0`, multiplying the previous
3206 * sum again by y is sufficient to update:
3207 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3208 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
3209 */
3210static __always_inline int
3211___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
3212 unsigned long load, unsigned long runnable, int running)
3213{
3214 u64 delta;
3215
3216 delta = now - sa->last_update_time;
3217 /*
3218 * This should only happen when time goes backwards, which it
3219 * unfortunately does during sched clock init when we swap over to TSC.
3220 */
3221 if ((s64)delta < 0) {
3222 sa->last_update_time = now;
3223 return 0;
3224 }
3225
3226 /*
3227 * Use 1024ns as the unit of measurement since it's a reasonable
3228 * approximation of 1us and fast to compute.
3229 */
3230 delta >>= 10;
3231 if (!delta)
3232 return 0;
3233
3234 sa->last_update_time += delta << 10;
3235
3236 /*
3237 * running is a subset of runnable (weight) so running can't be set if
3238 * runnable is clear. But there are some corner cases where the current
3239 * se has been already dequeued but cfs_rq->curr still points to it.
3240 * This means that weight will be 0 but not running for a sched_entity
3241 * but also for a cfs_rq if the latter becomes idle. As an example,
3242 * this happens during idle_balance() which calls
3243 * update_blocked_averages()
3244 */
3245 if (!load)
3246 runnable = running = 0;
3247
3248 /*
3249 * Now we know we crossed measurement unit boundaries. The *_avg
3250 * accrues by two steps:
3251 *
3252 * Step 1: accumulate *_sum since last_update_time. If we haven't
3253 * crossed period boundaries, finish.
3254 */
3255 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
3256 return 0;
3257
3258 return 1;
3259}
3260
3261static __always_inline void
3262___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
3263{
3264 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3265
3266 /*
3267 * Step 2: update *_avg.
3268 */
3269 sa->load_avg = div_u64(load * sa->load_sum, divider);
3270 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
3271 sa->util_avg = sa->util_sum / divider;
3272}
3273
3274/*
3275 * When a task is dequeued, its estimated utilization should not be update if
3276 * its util_avg has not been updated at least once.
3277 * This flag is used to synchronize util_avg updates with util_est updates.
3278 * We map this information into the LSB bit of the utilization saved at
3279 * dequeue time (i.e. util_est.dequeued).
3280 */
3281#define UTIL_AVG_UNCHANGED 0x1
3282
3283static inline void cfs_se_util_change(struct sched_avg *avg)
3284{
3285 unsigned int enqueued;
3286
3287 if (!sched_feat(UTIL_EST))
3288 return;
3289
3290 /* Avoid store if the flag has been already set */
3291 enqueued = avg->util_est.enqueued;
3292 if (!(enqueued & UTIL_AVG_UNCHANGED))
3293 return;
3294
3295 /* Reset flag to report util_avg has been updated */
3296 enqueued &= ~UTIL_AVG_UNCHANGED;
3297 WRITE_ONCE(avg->util_est.enqueued, enqueued);
3298}
3299
3300/*
3301 * sched_entity:
3302 *
3303 * task:
3304 * se_runnable() == se_weight()
3305 *
3306 * group: [ see update_cfs_group() ]
3307 * se_weight() = tg->weight * grq->load_avg / tg->load_avg
3308 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
3309 *
3310 * load_sum := runnable_sum
3311 * load_avg = se_weight(se) * runnable_avg
3312 *
3313 * runnable_load_sum := runnable_sum
3314 * runnable_load_avg = se_runnable(se) * runnable_avg
3315 *
3316 * XXX collapse load_sum and runnable_load_sum
3317 *
3318 * cfs_rq:
3319 *
3320 * load_sum = \Sum se_weight(se) * se->avg.load_sum
3321 * load_avg = \Sum se->avg.load_avg
3322 *
3323 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
3324 * runnable_load_avg = \Sum se->avg.runable_load_avg
3325 */
3326
3327static int
3328__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3329{
3330 if (entity_is_task(se))
3331 se->runnable_weight = se->load.weight;
3332
3333 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
3334 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3335 return 1;
3336 }
3337
3338 return 0;
3339}
3340
3341static int
3342__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3343{
3344 if (entity_is_task(se))
3345 se->runnable_weight = se->load.weight;
3346
3347 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
3348 cfs_rq->curr == se)) {
3349
3350 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3351 cfs_se_util_change(&se->avg);
3352 return 1;
3353 }
3354
3355 return 0;
3356}
3357
3358static int
3359__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3360{
3361 if (___update_load_sum(now, cpu, &cfs_rq->avg,
3362 scale_load_down(cfs_rq->load.weight),
3363 scale_load_down(cfs_rq->runnable_weight),
3364 cfs_rq->curr != NULL)) {
3365
3366 ___update_load_avg(&cfs_rq->avg, 1, 1);
3367 return 1;
3368 }
3369
3370 return 0;
3371}
3372
3373#ifdef CONFIG_FAIR_GROUP_SCHED 2984#ifdef CONFIG_FAIR_GROUP_SCHED
3374/** 2985/**
3375 * update_tg_load_avg - update the tg's load avg 2986 * update_tg_load_avg - update the tg's load avg
@@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4037 3648
4038#else /* CONFIG_SMP */ 3649#else /* CONFIG_SMP */
4039 3650
4040static inline int
4041update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4042{
4043 return 0;
4044}
4045
4046#define UPDATE_TG 0x0 3651#define UPDATE_TG 0x0
4047#define SKIP_AGE_LOAD 0x0 3652#define SKIP_AGE_LOAD 0x0
4048#define DO_ATTACH 0x0 3653#define DO_ATTACH 0x0
@@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
4726 throttled_hierarchy(dest_cfs_rq); 4331 throttled_hierarchy(dest_cfs_rq);
4727} 4332}
4728 4333
4729/* updated child weight may affect parent so we have to do this bottom up */
4730static int tg_unthrottle_up(struct task_group *tg, void *data) 4334static int tg_unthrottle_up(struct task_group *tg, void *data)
4731{ 4335{
4732 struct rq *rq = data; 4336 struct rq *rq = data;
@@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5653 5257
5654 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 5258 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5655 } 5259 }
5656
5657 sched_avg_update(this_rq);
5658} 5260}
5659 5261
5660/* Used instead of source_load when we know the type == 0 */ 5262/* Used instead of source_load when we know the type == 0 */
@@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
7294static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 6896static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7295{ 6897{
7296 struct numa_group *numa_group = rcu_dereference(p->numa_group); 6898 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7297 unsigned long src_faults, dst_faults; 6899 unsigned long src_weight, dst_weight;
7298 int src_nid, dst_nid; 6900 int src_nid, dst_nid, dist;
7299 6901
7300 if (!static_branch_likely(&sched_numa_balancing)) 6902 if (!static_branch_likely(&sched_numa_balancing))
7301 return -1; 6903 return -1;
@@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7322 return 0; 6924 return 0;
7323 6925
7324 /* Leaving a core idle is often worse than degrading locality. */ 6926 /* Leaving a core idle is often worse than degrading locality. */
7325 if (env->idle != CPU_NOT_IDLE) 6927 if (env->idle == CPU_IDLE)
7326 return -1; 6928 return -1;
7327 6929
6930 dist = node_distance(src_nid, dst_nid);
7328 if (numa_group) { 6931 if (numa_group) {
7329 src_faults = group_faults(p, src_nid); 6932 src_weight = group_weight(p, src_nid, dist);
7330 dst_faults = group_faults(p, dst_nid); 6933 dst_weight = group_weight(p, dst_nid, dist);
7331 } else { 6934 } else {
7332 src_faults = task_faults(p, src_nid); 6935 src_weight = task_weight(p, src_nid, dist);
7333 dst_faults = task_faults(p, dst_nid); 6936 dst_weight = task_weight(p, dst_nid, dist);
7334 } 6937 }
7335 6938
7336 return dst_faults < src_faults; 6939 return dst_weight < src_weight;
7337} 6940}
7338 6941
7339#else 6942#else
@@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7620 return false; 7223 return false;
7621} 7224}
7622 7225
7226static inline bool others_have_blocked(struct rq *rq)
7227{
7228 if (READ_ONCE(rq->avg_rt.util_avg))
7229 return true;
7230
7231 if (READ_ONCE(rq->avg_dl.util_avg))
7232 return true;
7233
7234#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
7235 if (READ_ONCE(rq->avg_irq.util_avg))
7236 return true;
7237#endif
7238
7239 return false;
7240}
7241
7623#ifdef CONFIG_FAIR_GROUP_SCHED 7242#ifdef CONFIG_FAIR_GROUP_SCHED
7624 7243
7625static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7244static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu)
7679 if (cfs_rq_has_blocked(cfs_rq)) 7298 if (cfs_rq_has_blocked(cfs_rq))
7680 done = false; 7299 done = false;
7681 } 7300 }
7301 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
7302 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
7303 update_irq_load_avg(rq, 0);
7304 /* Don't need periodic decay once load/util_avg are null */
7305 if (others_have_blocked(rq))
7306 done = false;
7682 7307
7683#ifdef CONFIG_NO_HZ_COMMON 7308#ifdef CONFIG_NO_HZ_COMMON
7684 rq->last_blocked_load_update_tick = jiffies; 7309 rq->last_blocked_load_update_tick = jiffies;
@@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu)
7744 rq_lock_irqsave(rq, &rf); 7369 rq_lock_irqsave(rq, &rf);
7745 update_rq_clock(rq); 7370 update_rq_clock(rq);
7746 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7371 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7372 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
7373 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
7374 update_irq_load_avg(rq, 0);
7747#ifdef CONFIG_NO_HZ_COMMON 7375#ifdef CONFIG_NO_HZ_COMMON
7748 rq->last_blocked_load_update_tick = jiffies; 7376 rq->last_blocked_load_update_tick = jiffies;
7749 if (!cfs_rq_has_blocked(cfs_rq)) 7377 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7750 rq->has_blocked_load = 0; 7378 rq->has_blocked_load = 0;
7751#endif 7379#endif
7752 rq_unlock_irqrestore(rq, &rf); 7380 rq_unlock_irqrestore(rq, &rf);
@@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
7856static unsigned long scale_rt_capacity(int cpu) 7484static unsigned long scale_rt_capacity(int cpu)
7857{ 7485{
7858 struct rq *rq = cpu_rq(cpu); 7486 struct rq *rq = cpu_rq(cpu);
7859 u64 total, used, age_stamp, avg; 7487 unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
7860 s64 delta; 7488 unsigned long used, free;
7489 unsigned long irq;
7861 7490
7862 /* 7491 irq = cpu_util_irq(rq);
7863 * Since we're reading these variables without serialization make sure
7864 * we read them once before doing sanity checks on them.
7865 */
7866 age_stamp = READ_ONCE(rq->age_stamp);
7867 avg = READ_ONCE(rq->rt_avg);
7868 delta = __rq_clock_broken(rq) - age_stamp;
7869 7492
7870 if (unlikely(delta < 0)) 7493 if (unlikely(irq >= max))
7871 delta = 0; 7494 return 1;
7872 7495
7873 total = sched_avg_period() + delta; 7496 used = READ_ONCE(rq->avg_rt.util_avg);
7497 used += READ_ONCE(rq->avg_dl.util_avg);
7874 7498
7875 used = div_u64(avg, total); 7499 if (unlikely(used >= max))
7500 return 1;
7876 7501
7877 if (likely(used < SCHED_CAPACITY_SCALE)) 7502 free = max - used;
7878 return SCHED_CAPACITY_SCALE - used;
7879 7503
7880 return 1; 7504 return scale_irq_capacity(free, irq, max);
7881} 7505}
7882 7506
7883static void update_cpu_capacity(struct sched_domain *sd, int cpu) 7507static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7884{ 7508{
7885 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); 7509 unsigned long capacity = scale_rt_capacity(cpu);
7886 struct sched_group *sdg = sd->groups; 7510 struct sched_group *sdg = sd->groups;
7887 7511
7888 cpu_rq(cpu)->cpu_capacity_orig = capacity; 7512 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
7889
7890 capacity *= scale_rt_capacity(cpu);
7891 capacity >>= SCHED_CAPACITY_SHIFT;
7892 7513
7893 if (!capacity) 7514 if (!capacity)
7894 capacity = 1; 7515 capacity = 1;
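
For illustration, a standalone sketch of the reworked capacity estimate with assumed PELT values; CFS gets what is left of the CPU after RT and DL utilization, further reduced by the fraction of time stolen by IRQs:

#include <stdio.h>

/* capacity = (max - rt_util - dl_util) * (max - irq_util) / max,
 * clamped to 1 in the degenerate cases, mirroring the new flow above. */
int main(void)
{
	unsigned long max = 1024;			/* assumed CPU scale */
	unsigned long rt = 100, dl = 50, irq = 64;	/* assumed PELT averages */

	unsigned long used = rt + dl;
	if (irq >= max || used >= max) {
		printf("capacity = 1\n");
		return 0;
	}

	unsigned long free = max - used;
	unsigned long capacity = free * (max - irq) / max;
	printf("capacity left for CFS: %lu of %lu\n", capacity, max);
	return 0;	/* 874 * 960 / 1024 = 819 */
}
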
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
new file mode 100644
index 000000000000..35475c0c5419
--- /dev/null
+++ b/kernel/sched/pelt.c
@@ -0,0 +1,399 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Per Entity Load Tracking
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 *
23 * Move PELT related code from fair.c into this pelt.c file
24 * Author: Vincent Guittot <vincent.guittot@linaro.org>
25 */
26
27#include <linux/sched.h>
28#include "sched.h"
29#include "sched-pelt.h"
30#include "pelt.h"
31
32/*
33 * Approximate:
34 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
35 */
36static u64 decay_load(u64 val, u64 n)
37{
38 unsigned int local_n;
39
40 if (unlikely(n > LOAD_AVG_PERIOD * 63))
41 return 0;
42
43 /* after bounds checking we can collapse to 32-bit */
44 local_n = n;
45
46 /*
47 * As y^PERIOD = 1/2, we can combine
48 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
49 * With a look-up table which covers y^n (n<PERIOD)
50 *
51 * To achieve constant time decay_load.
52 */
53 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
54 val >>= local_n / LOAD_AVG_PERIOD;
55 local_n %= LOAD_AVG_PERIOD;
56 }
57
58 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
59 return val;
60}
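
A floating-point userspace sketch (not the kernel's fixed-point table) of why the decay is constant time: with y^32 = 0.5, every full block of 32 periods is a halving, i.e. a shift, and only the remainder needs one lookup:

#include <stdio.h>
#include <math.h>

/* val * y^n with y^32 = 0.5: handle the n/32 full half-life blocks with a
 * divide-by-power-of-two, then the n%32 remainder with one lookup
 * (approximated here with pow()). n is assumed < 32*64, as in decay_load().
 * Build with -lm. */
static double decay(double val, unsigned int n)
{
	const unsigned int period = 32;

	val /= (double)(1ULL << (n / period));		      /* y^(32k) == 1/2^k */
	return val * pow(0.5, (double)(n % period) / period); /* y^(n%32) */
}

int main(void)
{
	printf("1024 after  32 periods: %.1f\n", decay(1024.0, 32));
	printf("1024 after 100 periods: %.1f\n", decay(1024.0, 100));
	return 0;	/* ~512.0 and ~117.4 */
}
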
61
62static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
63{
64 u32 c1, c2, c3 = d3; /* y^0 == 1 */
65
66 /*
67 * c1 = d1 y^p
68 */
69 c1 = decay_load((u64)d1, periods);
70
71 /*
72 * p-1
73 * c2 = 1024 \Sum y^n
74 * n=1
75 *
76 * inf inf
77 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
78 * n=0 n=p
79 */
80 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
81
82 return c1 + c2 + c3;
83}
84
85#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
86
87/*
88 * Accumulate the three separate parts of the sum; d1 the remainder
89 * of the last (incomplete) period, d2 the span of full periods and d3
90 * the remainder of the (incomplete) current period.
91 *
92 * d1 d2 d3
93 * ^ ^ ^
94 * | | |
95 * |<->|<----------------->|<--->|
96 * ... |---x---|------| ... |------|-----x (now)
97 *
98 * p-1
99 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
100 * n=1
101 *
102 * = u y^p + (Step 1)
103 *
104 * p-1
105 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
106 * n=1
107 */
108static __always_inline u32
109accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
110 unsigned long load, unsigned long runnable, int running)
111{
112 unsigned long scale_freq, scale_cpu;
113 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
114 u64 periods;
115
116 scale_freq = arch_scale_freq_capacity(cpu);
117 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
118
119 delta += sa->period_contrib;
120 periods = delta / 1024; /* A period is 1024us (~1ms) */
121
122 /*
123 * Step 1: decay old *_sum if we crossed period boundaries.
124 */
125 if (periods) {
126 sa->load_sum = decay_load(sa->load_sum, periods);
127 sa->runnable_load_sum =
128 decay_load(sa->runnable_load_sum, periods);
129 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
130
131 /*
132 * Step 2
133 */
134 delta %= 1024;
135 contrib = __accumulate_pelt_segments(periods,
136 1024 - sa->period_contrib, delta);
137 }
138 sa->period_contrib = delta;
139
140 contrib = cap_scale(contrib, scale_freq);
141 if (load)
142 sa->load_sum += load * contrib;
143 if (runnable)
144 sa->runnable_load_sum += runnable * contrib;
145 if (running)
146 sa->util_sum += contrib * scale_cpu;
147
148 return periods;
149}
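
A standalone sketch, with invented numbers, of how a new delta is split into the d1/d2/d3 segments drawn in the comment above:

#include <stdio.h>

/* d1 completes the previously started 1024us period, d2 spans the whole
 * periods in between, d3 is the tail that becomes the new period_contrib. */
int main(void)
{
	unsigned int period_contrib = 300;	/* us already accrued in p0 */
	unsigned int delta = 3000;		/* new time to account, in us */

	unsigned int total = period_contrib + delta;
	unsigned int periods = total / 1024;	/* boundaries crossed */
	unsigned int d1 = periods ? 1024 - period_contrib : 0;
	unsigned int d3 = total % 1024;		/* new period_contrib */
	unsigned int d2 = periods ? delta - d1 - d3 : delta;

	printf("periods=%u d1=%u d2=%u d3=%u\n", periods, d1, d2, d3);
	return 0;	/* periods=3 d1=724 d2=2048 d3=228 */
}
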
150
151/*
152 * We can represent the historical contribution to runnable average as the
153 * coefficients of a geometric series. To do this we sub-divide our runnable
154 * history into segments of approximately 1ms (1024us); label the segment that
155 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
156 *
157 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
158 * p0 p1 p2
159 * (now) (~1ms ago) (~2ms ago)
160 *
161 * Let u_i denote the fraction of p_i that the entity was runnable.
162 *
163 * We then designate the fractions u_i as our co-efficients, yielding the
164 * following representation of historical load:
165 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
166 *
167 * We choose y based on the width of a reasonable scheduling period, fixing:
168 * y^32 = 0.5
169 *
170 * This means that the contribution to load ~32ms ago (u_32) will be weighted
171 * approximately half as much as the contribution to load within the last ms
172 * (u_0).
173 *
174 * When a period "rolls over" and we have new u_0`, multiplying the previous
175 * sum again by y is sufficient to update:
176 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
177 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
178 */
179static __always_inline int
180___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
181 unsigned long load, unsigned long runnable, int running)
182{
183 u64 delta;
184
185 delta = now - sa->last_update_time;
186 /*
187 * This should only happen when time goes backwards, which it
188 * unfortunately does during sched clock init when we swap over to TSC.
189 */
190 if ((s64)delta < 0) {
191 sa->last_update_time = now;
192 return 0;
193 }
194
195 /*
196 * Use 1024ns as the unit of measurement since it's a reasonable
197 * approximation of 1us and fast to compute.
198 */
199 delta >>= 10;
200 if (!delta)
201 return 0;
202
203 sa->last_update_time += delta << 10;
204
205 /*
206 * running is a subset of runnable (weight) so running can't be set if
207 * runnable is clear. But there are some corner cases where the current
208 * se has been already dequeued but cfs_rq->curr still points to it.
209 * This means that weight will be 0 but not running for a sched_entity
210 * but also for a cfs_rq if the latter becomes idle. As an example,
211 * this happens during idle_balance() which calls
212 * update_blocked_averages()
213 */
214 if (!load)
215 runnable = running = 0;
216
217 /*
218 * Now we know we crossed measurement unit boundaries. The *_avg
219 * accrues by two steps:
220 *
221 * Step 1: accumulate *_sum since last_update_time. If we haven't
222 * crossed period boundaries, finish.
223 */
224 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
225 return 0;
226
227 return 1;
228}
229
230static __always_inline void
231___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
232{
233 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
234
235 /*
236 * Step 2: update *_avg.
237 */
238 sa->load_avg = div_u64(load * sa->load_sum, divider);
239 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
240 WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
241}
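
A floating-point sketch of why the divider is LOAD_AVG_MAX - 1024 + period_contrib: it is the largest value the sum can have reached given the current partial period, so a task that never stopped running decodes to util_avg equal to the CPU scale. The 400-iteration warm-up and the 512us partial period are arbitrary:

#include <stdio.h>
#include <math.h>

/* Saturate the sum as if the entity ran flat out, then divide by the
 * maximum reachable sum for this period_contrib. Build with -lm. */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	const double scale_cpu = 1024.0;	/* assumed CPU capacity */
	double period_contrib = 512.0;		/* half-way through a period */
	double util_sum = 0.0;

	for (int i = 0; i < 400; i++)		/* fully busy, whole periods */
		util_sum = util_sum * y + 1024.0 * scale_cpu;
	util_sum = util_sum * y + period_contrib * scale_cpu; /* partial one */

	double load_avg_max = 1024.0 / (1.0 - y);	/* ~47742 in the kernel */
	double divider = load_avg_max - 1024.0 + period_contrib;

	printf("util_avg ~= %.0f\n", util_sum / divider);	/* ~1024 */
	return 0;
}
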
242
243/*
244 * sched_entity:
245 *
246 * task:
247 * se_runnable() == se_weight()
248 *
249 * group: [ see update_cfs_group() ]
250 * se_weight() = tg->weight * grq->load_avg / tg->load_avg
251 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
252 *
253 * load_sum := runnable_sum
254 * load_avg = se_weight(se) * runnable_avg
255 *
256 * runnable_load_sum := runnable_sum
257 * runnable_load_avg = se_runnable(se) * runnable_avg
258 *
259 * XXX collapse load_sum and runnable_load_sum
260 *
261 * cfs_rq:
262 *
263 * load_sum = \Sum se_weight(se) * se->avg.load_sum
264 * load_avg = \Sum se->avg.load_avg
265 *
266 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
267 * runnable_load_avg = \Sum se->avg.runnable_load_avg
268 */
269
270int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
271{
272 if (entity_is_task(se))
273 se->runnable_weight = se->load.weight;
274
275 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
276 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
277 return 1;
278 }
279
280 return 0;
281}
282
283int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
284{
285 if (entity_is_task(se))
286 se->runnable_weight = se->load.weight;
287
288 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
289 cfs_rq->curr == se)) {
290
291 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
292 cfs_se_util_change(&se->avg);
293 return 1;
294 }
295
296 return 0;
297}
298
299int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
300{
301 if (___update_load_sum(now, cpu, &cfs_rq->avg,
302 scale_load_down(cfs_rq->load.weight),
303 scale_load_down(cfs_rq->runnable_weight),
304 cfs_rq->curr != NULL)) {
305
306 ___update_load_avg(&cfs_rq->avg, 1, 1);
307 return 1;
308 }
309
310 return 0;
311}
312
313/*
314 * rt_rq:
315 *
316 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
317 * util_sum = cpu_scale * load_sum
318 * runnable_load_sum = load_sum
319 *
320 * load_avg and runnable_load_avg are not supported and meaningless.
321 *
322 */
323
324int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
325{
326 if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
327 running,
328 running,
329 running)) {
330
331 ___update_load_avg(&rq->avg_rt, 1, 1);
332 return 1;
333 }
334
335 return 0;
336}
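
A standalone sketch of what the rq-level RT signal converges to: since only a running/not-running flag is fed in, util_avg settles near (RT duty cycle) * CPU scale. The 25% duty cycle and the iteration count are invented:

#include <stdio.h>
#include <math.h>

/* Feed the PELT recurrence one fully busy 1024us period out of every four
 * and read back roughly a quarter of the 1024 scale. Build with -lm. */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	double util_sum = 0.0;

	for (int period = 0; period < 2000; period++) {
		int running = (period % 4 == 0);	/* 25% duty cycle */
		util_sum = util_sum * y + (running ? 1024.0 * 1024.0 : 0.0);
	}

	double divider = 1024.0 / (1.0 - y) - 1024.0;	/* period_contrib == 0 */
	printf("rt util_avg ~= %.0f (about 25%% of 1024)\n",
	       util_sum / divider);
	return 0;
}
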
337
338/*
339 * dl_rq:
340 *
341 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
342 * util_sum = cpu_scale * load_sum
343 * runnable_load_sum = load_sum
344 *
345 */
346
347int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
348{
349 if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
350 running,
351 running,
352 running)) {
353
354 ___update_load_avg(&rq->avg_dl, 1, 1);
355 return 1;
356 }
357
358 return 0;
359}
360
361#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
362/*
363 * irq:
364 *
365 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
366 * util_sum = cpu_scale * load_sum
367 * runnable_load_sum = load_sum
368 *
369 */
370
371int update_irq_load_avg(struct rq *rq, u64 running)
372{
373 int ret = 0;
374 /*
375 * We know the time that has been used by interrupt since last update
376 * but we don't know when. Let's be pessimistic and assume that the
377 * interrupt happened just before the update. This is not so far from
378 * reality because the interrupt will most probably wake up a task and
379 * trigger an update of rq clock during which the metric is updated.
380 * We start to decay with normal context time and then we add the
381 * interrupt context time.
382 * We can safely remove running from rq->clock because
383 * rq->clock += delta with delta >= running
384 */
385 ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
386 0,
387 0,
388 0);
389 ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
390 1,
391 1,
392 1);
393
394 if (ret)
395 ___update_load_avg(&rq->avg_irq, 1, 1);
396
397 return ret;
398}
399#endif
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
new file mode 100644
index 000000000000..d2894db28955
--- /dev/null
+++ b/kernel/sched/pelt.h
@@ -0,0 +1,72 @@
1#ifdef CONFIG_SMP
2
3int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
4int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
5int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
6int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
7int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
8
9#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
10int update_irq_load_avg(struct rq *rq, u64 running);
11#else
12static inline int
13update_irq_load_avg(struct rq *rq, u64 running)
14{
15 return 0;
16}
17#endif
18
19/*
20 * When a task is dequeued, its estimated utilization should not be updated if
21 * its util_avg has not been updated at least once.
22 * This flag is used to synchronize util_avg updates with util_est updates.
23 * We map this information into the LSB bit of the utilization saved at
24 * dequeue time (i.e. util_est.dequeued).
25 */
26#define UTIL_AVG_UNCHANGED 0x1
27
28static inline void cfs_se_util_change(struct sched_avg *avg)
29{
30 unsigned int enqueued;
31
32 if (!sched_feat(UTIL_EST))
33 return;
34
35 /* Avoid store if the flag has been already set */
36 enqueued = avg->util_est.enqueued;
37 if (!(enqueued & UTIL_AVG_UNCHANGED))
38 return;
39
40 /* Reset flag to report util_avg has been updated */
41 enqueued &= ~UTIL_AVG_UNCHANGED;
42 WRITE_ONCE(avg->util_est.enqueued, enqueued);
43}
44
45#else
46
47static inline int
48update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
49{
50 return 0;
51}
52
53static inline int
54update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
55{
56 return 0;
57}
58
59static inline int
60update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
61{
62 return 0;
63}
64
65static inline int
66update_irq_load_avg(struct rq *rq, u64 running)
67{
68 return 0;
69}
70#endif
71
72
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index eaaec8364f96..2e2955a8cf8f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include "sched.h" 6#include "sched.h"
7 7
8#include "pelt.h"
9
8int sched_rr_timeslice = RR_TIMESLICE; 10int sched_rr_timeslice = RR_TIMESLICE;
9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 11int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
10 12
@@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq)
973 curr->se.exec_start = now; 975 curr->se.exec_start = now;
974 cgroup_account_cputime(curr, delta_exec); 976 cgroup_account_cputime(curr, delta_exec);
975 977
976 sched_rt_avg_update(rq, delta_exec);
977
978 if (!rt_bandwidth_enabled()) 978 if (!rt_bandwidth_enabled())
979 return; 979 return;
980 980
@@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1578 1578
1579 rt_queue_push_tasks(rq); 1579 rt_queue_push_tasks(rq);
1580 1580
1581 /*
1582 * If prev task was rt, put_prev_task() has already updated the
1583 * utilization. We only care about the case where we start to schedule
1584 * an rt task
1585 */
1586 if (rq->curr->sched_class != &rt_sched_class)
1587 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
1588
1581 return p; 1589 return p;
1582} 1590}
1583 1591
@@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1585{ 1593{
1586 update_curr_rt(rq); 1594 update_curr_rt(rq);
1587 1595
1596 update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
1597
1588 /* 1598 /*
1589 * The previous task needs to be made eligible for pushing 1599 * The previous task needs to be made eligible for pushing
1590 * if it is still active 1600 * if it is still active
@@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2314 struct sched_rt_entity *rt_se = &p->rt; 2324 struct sched_rt_entity *rt_se = &p->rt;
2315 2325
2316 update_curr_rt(rq); 2326 update_curr_rt(rq);
2327 update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
2317 2328
2318 watchdog(rq, p); 2329 watchdog(rq, p);
2319 2330
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c7742dcc136c..4a2e8cae63c4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -594,6 +594,7 @@ struct rt_rq {
594 unsigned long rt_nr_total; 594 unsigned long rt_nr_total;
595 int overloaded; 595 int overloaded;
596 struct plist_head pushable_tasks; 596 struct plist_head pushable_tasks;
597
597#endif /* CONFIG_SMP */ 598#endif /* CONFIG_SMP */
598 int rt_queued; 599 int rt_queued;
599 600
@@ -673,7 +674,26 @@ struct dl_rq {
673 u64 bw_ratio; 674 u64 bw_ratio;
674}; 675};
675 676
677#ifdef CONFIG_FAIR_GROUP_SCHED
678/* An entity is a task if it doesn't "own" a runqueue */
679#define entity_is_task(se) (!se->my_q)
680#else
681#define entity_is_task(se) 1
682#endif
683
676#ifdef CONFIG_SMP 684#ifdef CONFIG_SMP
685/*
686 * XXX we want to get rid of these helpers and use the full load resolution.
687 */
688static inline long se_weight(struct sched_entity *se)
689{
690 return scale_load_down(se->load.weight);
691}
692
693static inline long se_runnable(struct sched_entity *se)
694{
695 return scale_load_down(se->runnable_weight);
696}
677 697
678static inline bool sched_asym_prefer(int a, int b) 698static inline bool sched_asym_prefer(int a, int b)
679{ 699{
@@ -833,8 +853,12 @@ struct rq {
833 853
834 struct list_head cfs_tasks; 854 struct list_head cfs_tasks;
835 855
836 u64 rt_avg; 856 struct sched_avg avg_rt;
837 u64 age_stamp; 857 struct sched_avg avg_dl;
858#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
859#define HAVE_SCHED_AVG_IRQ
860 struct sched_avg avg_irq;
861#endif
838 u64 idle_stamp; 862 u64 idle_stamp;
839 u64 avg_idle; 863 u64 avg_idle;
840 864
@@ -1075,7 +1099,8 @@ enum numa_faults_stats {
1075}; 1099};
1076extern void sched_setnuma(struct task_struct *p, int node); 1100extern void sched_setnuma(struct task_struct *p, int node);
1077extern int migrate_task_to(struct task_struct *p, int cpu); 1101extern int migrate_task_to(struct task_struct *p, int cpu);
1078extern int migrate_swap(struct task_struct *, struct task_struct *); 1102extern int migrate_swap(struct task_struct *p, struct task_struct *t,
1103 int cpu, int scpu);
1079extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); 1104extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
1080#else 1105#else
1081static inline void 1106static inline void
@@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1690 1715
1691extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 1716extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1692 1717
1693extern const_debug unsigned int sysctl_sched_time_avg;
1694extern const_debug unsigned int sysctl_sched_nr_migrate; 1718extern const_debug unsigned int sysctl_sched_nr_migrate;
1695extern const_debug unsigned int sysctl_sched_migration_cost; 1719extern const_debug unsigned int sysctl_sched_migration_cost;
1696 1720
1697static inline u64 sched_avg_period(void)
1698{
1699 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1700}
1701
1702#ifdef CONFIG_SCHED_HRTICK 1721#ifdef CONFIG_SCHED_HRTICK
1703 1722
1704/* 1723/*
@@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
1735#endif 1754#endif
1736 1755
1737#ifdef CONFIG_SMP 1756#ifdef CONFIG_SMP
1738extern void sched_avg_update(struct rq *rq);
1739
1740#ifndef arch_scale_cpu_capacity 1757#ifndef arch_scale_cpu_capacity
1741static __always_inline 1758static __always_inline
1742unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) 1759unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1747 return SCHED_CAPACITY_SCALE; 1764 return SCHED_CAPACITY_SCALE;
1748} 1765}
1749#endif 1766#endif
1750
1751static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1752{
1753 rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
1754 sched_avg_update(rq);
1755}
1756#else 1767#else
1757#ifndef arch_scale_cpu_capacity 1768#ifndef arch_scale_cpu_capacity
1758static __always_inline 1769static __always_inline
@@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
1761 return SCHED_CAPACITY_SCALE; 1772 return SCHED_CAPACITY_SCALE;
1762} 1773}
1763#endif 1774#endif
1764static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1765static inline void sched_avg_update(struct rq *rq) { }
1766#endif 1775#endif
1767 1776
1768struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 1777struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2177#endif 2186#endif
2178 2187
2179#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2188#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2180static inline unsigned long cpu_util_dl(struct rq *rq) 2189static inline unsigned long cpu_bw_dl(struct rq *rq)
2181{ 2190{
2182 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2191 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
2183} 2192}
2184 2193
2194static inline unsigned long cpu_util_dl(struct rq *rq)
2195{
2196 return READ_ONCE(rq->avg_dl.util_avg);
2197}
2198
2185static inline unsigned long cpu_util_cfs(struct rq *rq) 2199static inline unsigned long cpu_util_cfs(struct rq *rq)
2186{ 2200{
2187 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); 2201 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
@@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
2193 2207
2194 return util; 2208 return util;
2195} 2209}
2210
2211static inline unsigned long cpu_util_rt(struct rq *rq)
2212{
2213 return READ_ONCE(rq->avg_rt.util_avg);
2214}
2215#endif
2216
2217#ifdef HAVE_SCHED_AVG_IRQ
2218static inline unsigned long cpu_util_irq(struct rq *rq)
2219{
2220 return rq->avg_irq.util_avg;
2221}
2222
2223static inline
2224unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2225{
2226 util *= (max - irq);
2227 util /= max;
2228
2229 return util;
2230
2231}
2232#else
2233static inline unsigned long cpu_util_irq(struct rq *rq)
2234{
2235 return 0;
2236}
2237
2238static inline
2239unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2240{
2241 return util;
2242}
2196#endif 2243#endif
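
A minimal sketch of the scale_irq_capacity() helper added above, with assumed numbers; the remaining capacity is scaled by the fraction of time not spent in interrupt context:

#include <stdio.h>

/* util * (max - irq) / max: if IRQs ate 1/8 of the CPU, only 7/8 of the
 * leftover capacity is really usable by the scheduler classes. */
static unsigned long scale_irq_capacity(unsigned long util,
					unsigned long irq, unsigned long max)
{
	util *= (max - irq);
	util /= max;
	return util;
}

int main(void)
{
	unsigned long max = 1024;	/* assumed CPU scale */
	unsigned long free = 800;	/* capacity left after RT/DL */
	unsigned long irq = 128;	/* IRQ utilization: 1/8 of the time */

	printf("usable capacity: %lu\n", scale_irq_capacity(free, irq, max));
	return 0;	/* 800 * 896 / 1024 = 700 */
}
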
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index b6fb2c3b3ff7..66b59ac77c22 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
32} 32}
33EXPORT_SYMBOL(swake_up_locked); 33EXPORT_SYMBOL(swake_up_locked);
34 34
35void swake_up(struct swait_queue_head *q) 35void swake_up_one(struct swait_queue_head *q)
36{ 36{
37 unsigned long flags; 37 unsigned long flags;
38 38
@@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
40 swake_up_locked(q); 40 swake_up_locked(q);
41 raw_spin_unlock_irqrestore(&q->lock, flags); 41 raw_spin_unlock_irqrestore(&q->lock, flags);
42} 42}
43EXPORT_SYMBOL(swake_up); 43EXPORT_SYMBOL(swake_up_one);
44 44
45/* 45/*
46 * Does not allow usage from IRQ disabled, since we must be able to 46 * Does not allow usage from IRQ disabled, since we must be able to
@@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q)
69} 69}
70EXPORT_SYMBOL(swake_up_all); 70EXPORT_SYMBOL(swake_up_all);
71 71
72void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) 72static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
73{ 73{
74 wait->task = current; 74 wait->task = current;
75 if (list_empty(&wait->task_list)) 75 if (list_empty(&wait->task_list))
76 list_add(&wait->task_list, &q->task_list); 76 list_add_tail(&wait->task_list, &q->task_list);
77} 77}
78 78
79void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) 79void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
80{ 80{
81 unsigned long flags; 81 unsigned long flags;
82 82
@@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
85 set_current_state(state); 85 set_current_state(state);
86 raw_spin_unlock_irqrestore(&q->lock, flags); 86 raw_spin_unlock_irqrestore(&q->lock, flags);
87} 87}
88EXPORT_SYMBOL(prepare_to_swait); 88EXPORT_SYMBOL(prepare_to_swait_exclusive);
89 89
90long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) 90long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
91{ 91{
92 if (signal_pending_state(state, current)) 92 unsigned long flags;
93 return -ERESTARTSYS; 93 long ret = 0;
94 94
95 prepare_to_swait(q, wait, state); 95 raw_spin_lock_irqsave(&q->lock, flags);
96 if (unlikely(signal_pending_state(state, current))) {
97 /*
98 * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
99 * must not see us.
100 */
101 list_del_init(&wait->task_list);
102 ret = -ERESTARTSYS;
103 } else {
104 __prepare_to_swait(q, wait);
105 set_current_state(state);
106 }
107 raw_spin_unlock_irqrestore(&q->lock, flags);
96 108
97 return 0; 109 return ret;
98} 110}
99EXPORT_SYMBOL(prepare_to_swait_event); 111EXPORT_SYMBOL(prepare_to_swait_event);
100 112
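
At a call site, the renamed exclusive swait API reads as in the sketch below. This is a hedged illustration, not code from this series: the queue, flag and functions are hypothetical, but the waiter/waker pairing mirrors the KVM users converted further down in this diff.

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);       /* hypothetical queue */
static bool demo_ready;                         /* hypothetical condition */

/*
 * Waiter: sleep until demo_ready is set or a signal arrives.  The
 * _exclusive suffix documents that swait waiters are always enqueued as
 * exclusive (tail) waiters, matching the list_add_tail() change above.
 */
static int demo_wait(void)
{
        return swait_event_interruptible_exclusive(demo_wq, demo_ready);
}

/* Waker: publish the condition first, then wake exactly one waiter. */
static void demo_wake(void)
{
        demo_ready = true;
        swake_up_one(&demo_wq);
}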
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 5043e7433f4b..c230c2dd48e1 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu)
238 238
239 mutex_lock(&smpboot_threads_lock); 239 mutex_lock(&smpboot_threads_lock);
240 list_for_each_entry(cur, &hotplug_threads, list) 240 list_for_each_entry(cur, &hotplug_threads, list)
241 if (cpumask_test_cpu(cpu, cur->cpumask)) 241 smpboot_unpark_thread(cur, cpu);
242 smpboot_unpark_thread(cur, cpu);
243 mutex_unlock(&smpboot_threads_lock); 242 mutex_unlock(&smpboot_threads_lock);
244 return 0; 243 return 0;
245} 244}
@@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
280} 279}
281 280
282/** 281/**
283 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related 282 * smpboot_register_percpu_thread - Register a per_cpu thread related
284 * to hotplug 283 * to hotplug
285 * @plug_thread: Hotplug thread descriptor 284 * @plug_thread: Hotplug thread descriptor
286 * @cpumask: The cpumask where threads run
287 * 285 *
288 * Creates and starts the threads on all online cpus. 286 * Creates and starts the threads on all online cpus.
289 */ 287 */
290int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, 288int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
291 const struct cpumask *cpumask)
292{ 289{
293 unsigned int cpu; 290 unsigned int cpu;
294 int ret = 0; 291 int ret = 0;
295 292
296 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
297 return -ENOMEM;
298 cpumask_copy(plug_thread->cpumask, cpumask);
299
300 get_online_cpus(); 293 get_online_cpus();
301 mutex_lock(&smpboot_threads_lock); 294 mutex_lock(&smpboot_threads_lock);
302 for_each_online_cpu(cpu) { 295 for_each_online_cpu(cpu) {
303 ret = __smpboot_create_thread(plug_thread, cpu); 296 ret = __smpboot_create_thread(plug_thread, cpu);
304 if (ret) { 297 if (ret) {
305 smpboot_destroy_threads(plug_thread); 298 smpboot_destroy_threads(plug_thread);
306 free_cpumask_var(plug_thread->cpumask);
307 goto out; 299 goto out;
308 } 300 }
309 if (cpumask_test_cpu(cpu, cpumask)) 301 smpboot_unpark_thread(plug_thread, cpu);
310 smpboot_unpark_thread(plug_thread, cpu);
311 } 302 }
312 list_add(&plug_thread->list, &hotplug_threads); 303 list_add(&plug_thread->list, &hotplug_threads);
313out: 304out:
@@ -315,7 +306,7 @@ out:
315 put_online_cpus(); 306 put_online_cpus();
316 return ret; 307 return ret;
317} 308}
318EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); 309EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
319 310
320/** 311/**
321 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug 312 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
@@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
331 smpboot_destroy_threads(plug_thread); 322 smpboot_destroy_threads(plug_thread);
332 mutex_unlock(&smpboot_threads_lock); 323 mutex_unlock(&smpboot_threads_lock);
333 put_online_cpus(); 324 put_online_cpus();
334 free_cpumask_var(plug_thread->cpumask);
335} 325}
336EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 326EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
337 327
338/**
339 * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
340 * @plug_thread: Hotplug thread descriptor
341 * @new: Revised mask to use
342 *
343 * The cpumask field in the smp_hotplug_thread must not be updated directly
344 * by the client, but only by calling this function.
345 * This function can only be called on a registered smp_hotplug_thread.
346 */
347void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
348 const struct cpumask *new)
349{
350 struct cpumask *old = plug_thread->cpumask;
351 static struct cpumask tmp;
352 unsigned int cpu;
353
354 lockdep_assert_cpus_held();
355 mutex_lock(&smpboot_threads_lock);
356
357 /* Park threads that were exclusively enabled on the old mask. */
358 cpumask_andnot(&tmp, old, new);
359 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
360 smpboot_park_thread(plug_thread, cpu);
361
362 /* Unpark threads that are exclusively enabled on the new mask. */
363 cpumask_andnot(&tmp, new, old);
364 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
365 smpboot_unpark_thread(plug_thread, cpu);
366
367 cpumask_copy(old, new);
368
369 mutex_unlock(&smpboot_threads_lock);
370}
371
372static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); 328static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
373 329
374/* 330/*
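
With the cpumask variant gone, registering a per-CPU hotplug thread now looks like the sketch below. The struct fields match the watchdog_threads initializer removed from kernel/watchdog.c later in this diff; the callbacks themselves are hypothetical.

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);

static int demo_should_run(unsigned int cpu)
{
        return 0;       /* hypothetical: no pending work in this sketch */
}

static void demo_fn(unsigned int cpu)
{
        /* per-CPU work would run here, with the thread parked across hotplug */
}

static struct smp_hotplug_thread demo_threads = {
        .store             = &demo_task,
        .thread_should_run = demo_should_run,
        .thread_fn         = demo_fn,
        .thread_comm       = "demo/%u",
};

static int __init demo_init(void)
{
        /*
         * No cpumask argument any more: the threads are created and
         * unparked on all online CPUs.
         */
        return smpboot_register_percpu_thread(&demo_threads);
}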
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 69eb76daed34..067cb83f37ea 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
238 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); 238 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
239 DEFINE_WAKE_Q(wakeq); 239 DEFINE_WAKE_Q(wakeq);
240 int err; 240 int err;
241
241retry: 242retry:
243 /*
244 * The waking up of stopper threads has to happen in the same
245 * scheduling context as the queueing. Otherwise, there is a
246 * possibility of one of the above stoppers being woken up by another
247 * CPU, and preempting us. This will cause us to not wake up the other
248 * stopper forever.
249 */
250 preempt_disable();
242 raw_spin_lock_irq(&stopper1->lock); 251 raw_spin_lock_irq(&stopper1->lock);
243 raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); 252 raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
244 253
245 err = -ENOENT; 254 if (!stopper1->enabled || !stopper2->enabled) {
246 if (!stopper1->enabled || !stopper2->enabled) 255 err = -ENOENT;
247 goto unlock; 256 goto unlock;
257 }
258
248 /* 259 /*
249 * Ensure that if we race with __stop_cpus() the stoppers won't get 260 * Ensure that if we race with __stop_cpus() the stoppers won't get
250 * queued up in reverse order leading to system deadlock. 261 * queued up in reverse order leading to system deadlock.
@@ -255,36 +266,30 @@ retry:
255 * It can be falsely true but it is safe to spin until it is cleared, 266 * It can be falsely true but it is safe to spin until it is cleared,
256 * queue_stop_cpus_work() does everything under preempt_disable(). 267 * queue_stop_cpus_work() does everything under preempt_disable().
257 */ 268 */
258 err = -EDEADLK; 269 if (unlikely(stop_cpus_in_progress)) {
259 if (unlikely(stop_cpus_in_progress)) 270 err = -EDEADLK;
260 goto unlock; 271 goto unlock;
272 }
261 273
262 err = 0; 274 err = 0;
263 __cpu_stop_queue_work(stopper1, work1, &wakeq); 275 __cpu_stop_queue_work(stopper1, work1, &wakeq);
264 __cpu_stop_queue_work(stopper2, work2, &wakeq); 276 __cpu_stop_queue_work(stopper2, work2, &wakeq);
265 /* 277
266 * The waking up of stopper threads has to happen
267 * in the same scheduling context as the queueing.
268 * Otherwise, there is a possibility of one of the
269 * above stoppers being woken up by another CPU,
270 * and preempting us. This will cause us to n ot
271 * wake up the other stopper forever.
272 */
273 preempt_disable();
274unlock: 278unlock:
275 raw_spin_unlock(&stopper2->lock); 279 raw_spin_unlock(&stopper2->lock);
276 raw_spin_unlock_irq(&stopper1->lock); 280 raw_spin_unlock_irq(&stopper1->lock);
277 281
278 if (unlikely(err == -EDEADLK)) { 282 if (unlikely(err == -EDEADLK)) {
283 preempt_enable();
284
279 while (stop_cpus_in_progress) 285 while (stop_cpus_in_progress)
280 cpu_relax(); 286 cpu_relax();
287
281 goto retry; 288 goto retry;
282 } 289 }
283 290
284 if (!err) { 291 wake_up_q(&wakeq);
285 wake_up_q(&wakeq); 292 preempt_enable();
286 preempt_enable();
287 }
288 293
289 return err; 294 return err;
290} 295}
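
The constraint the moved preempt_disable() enforces, shown in isolation: record both wakeups and issue them within one non-preemptible region, so the first woken stopper cannot preempt us before the second has been woken. The following is a hedged sketch of that wake_q pattern with a hypothetical lock and tasks; the real code above collects the wakeups via __cpu_stop_queue_work().

#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

/* Hypothetical pair-wakeup helper illustrating the pattern above. */
static void queue_and_wake_pair(raw_spinlock_t *lock,
                                struct task_struct *t1,
                                struct task_struct *t2)
{
        DEFINE_WAKE_Q(wakeq);

        preempt_disable();              /* stay put until both wakeups are issued */
        raw_spin_lock_irq(lock);

        wake_q_add(&wakeq, t1);         /* record the wakeups under the lock ... */
        wake_q_add(&wakeq, t2);

        raw_spin_unlock_irq(lock);

        wake_up_q(&wakeq);              /* ... and issue them after dropping it */
        preempt_enable();
}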
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d9837c0aff4..f22f76b7a138 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
368 .mode = 0644, 368 .mode = 0644,
369 .proc_handler = proc_dointvec, 369 .proc_handler = proc_dointvec,
370 }, 370 },
371 {
372 .procname = "sched_time_avg_ms",
373 .data = &sysctl_sched_time_avg,
374 .maxlen = sizeof(unsigned int),
375 .mode = 0644,
376 .proc_handler = proc_dointvec_minmax,
377 .extra1 = &one,
378 },
379#ifdef CONFIG_SCHEDSTATS 371#ifdef CONFIG_SCHEDSTATS
380 { 372 {
381 .procname = "sched_schedstats", 373 .procname = "sched_schedstats",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 576d18045811..5470dce212c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -18,18 +18,14 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/smpboot.h>
22#include <linux/sched/rt.h>
23#include <uapi/linux/sched/types.h>
24#include <linux/tick.h> 21#include <linux/tick.h>
25#include <linux/workqueue.h>
26#include <linux/sched/clock.h> 22#include <linux/sched/clock.h>
27#include <linux/sched/debug.h> 23#include <linux/sched/debug.h>
28#include <linux/sched/isolation.h> 24#include <linux/sched/isolation.h>
25#include <linux/stop_machine.h>
29 26
30#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
31#include <linux/kvm_para.h> 28#include <linux/kvm_para.h>
32#include <linux/kthread.h>
33 29
34static DEFINE_MUTEX(watchdog_mutex); 30static DEFINE_MUTEX(watchdog_mutex);
35 31
@@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
169unsigned int __read_mostly softlockup_panic = 165unsigned int __read_mostly softlockup_panic =
170 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; 166 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
171 167
172static bool softlockup_threads_initialized __read_mostly; 168static bool softlockup_initialized __read_mostly;
173static u64 __read_mostly sample_period; 169static u64 __read_mostly sample_period;
174 170
175static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 171static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
176static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
177static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 172static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
178static DEFINE_PER_CPU(bool, softlockup_touch_sync); 173static DEFINE_PER_CPU(bool, softlockup_touch_sync);
179static DEFINE_PER_CPU(bool, soft_watchdog_warn); 174static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void)
335 __this_cpu_inc(hrtimer_interrupts); 330 __this_cpu_inc(hrtimer_interrupts);
336} 331}
337 332
333static DEFINE_PER_CPU(struct completion, softlockup_completion);
334static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
335
336/*
337 * The watchdog thread function - touches the timestamp.
338 *
339 * It only runs once every sample_period seconds (4 seconds by
340 * default) to reset the softlockup timestamp. If this gets delayed
341 * for more than 2*watchdog_thresh seconds then the debug-printout
342 * triggers in watchdog_timer_fn().
343 */
344static int softlockup_fn(void *data)
345{
346 __this_cpu_write(soft_lockup_hrtimer_cnt,
347 __this_cpu_read(hrtimer_interrupts));
348 __touch_watchdog();
349 complete(this_cpu_ptr(&softlockup_completion));
350
351 return 0;
352}
353
338/* watchdog kicker functions */ 354/* watchdog kicker functions */
339static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 355static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
340{ 356{
@@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
350 watchdog_interrupt_count(); 366 watchdog_interrupt_count();
351 367
352 /* kick the softlockup detector */ 368 /* kick the softlockup detector */
353 wake_up_process(__this_cpu_read(softlockup_watchdog)); 369 if (completion_done(this_cpu_ptr(&softlockup_completion))) {
370 reinit_completion(this_cpu_ptr(&softlockup_completion));
371 stop_one_cpu_nowait(smp_processor_id(),
372 softlockup_fn, NULL,
373 this_cpu_ptr(&softlockup_stop_work));
374 }
354 375
355 /* .. and repeat */ 376 /* .. and repeat */
356 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); 377 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
@@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
448 return HRTIMER_RESTART; 469 return HRTIMER_RESTART;
449} 470}
450 471
451static void watchdog_set_prio(unsigned int policy, unsigned int prio)
452{
453 struct sched_param param = { .sched_priority = prio };
454
455 sched_setscheduler(current, policy, &param);
456}
457
458static void watchdog_enable(unsigned int cpu) 472static void watchdog_enable(unsigned int cpu)
459{ 473{
460 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); 474 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
475 struct completion *done = this_cpu_ptr(&softlockup_completion);
476
477 WARN_ON_ONCE(cpu != smp_processor_id());
478
479 init_completion(done);
480 complete(done);
461 481
462 /* 482 /*
463 * Start the timer first to prevent the NMI watchdog triggering 483 * Start the timer first to prevent the NMI watchdog triggering
@@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu)
473 /* Enable the perf event */ 493 /* Enable the perf event */
474 if (watchdog_enabled & NMI_WATCHDOG_ENABLED) 494 if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
475 watchdog_nmi_enable(cpu); 495 watchdog_nmi_enable(cpu);
476
477 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
478} 496}
479 497
480static void watchdog_disable(unsigned int cpu) 498static void watchdog_disable(unsigned int cpu)
481{ 499{
482 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); 500 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
483 501
484 watchdog_set_prio(SCHED_NORMAL, 0); 502 WARN_ON_ONCE(cpu != smp_processor_id());
503
485 /* 504 /*
486 * Disable the perf event first. That prevents that a large delay 505 * Disable the perf event first. That prevents that a large delay
487 * between disabling the timer and disabling the perf event causes 506 * between disabling the timer and disabling the perf event causes
@@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu)
489 */ 508 */
490 watchdog_nmi_disable(cpu); 509 watchdog_nmi_disable(cpu);
491 hrtimer_cancel(hrtimer); 510 hrtimer_cancel(hrtimer);
511 wait_for_completion(this_cpu_ptr(&softlockup_completion));
492} 512}
493 513
494static void watchdog_cleanup(unsigned int cpu, bool online) 514static int softlockup_stop_fn(void *data)
495{ 515{
496 watchdog_disable(cpu); 516 watchdog_disable(smp_processor_id());
517 return 0;
497} 518}
498 519
499static int watchdog_should_run(unsigned int cpu) 520static void softlockup_stop_all(void)
500{ 521{
501 return __this_cpu_read(hrtimer_interrupts) != 522 int cpu;
502 __this_cpu_read(soft_lockup_hrtimer_cnt); 523
524 if (!softlockup_initialized)
525 return;
526
527 for_each_cpu(cpu, &watchdog_allowed_mask)
528 smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
529
530 cpumask_clear(&watchdog_allowed_mask);
503} 531}
504 532
505/* 533static int softlockup_start_fn(void *data)
506 * The watchdog thread function - touches the timestamp.
507 *
508 * It only runs once every sample_period seconds (4 seconds by
509 * default) to reset the softlockup timestamp. If this gets delayed
510 * for more than 2*watchdog_thresh seconds then the debug-printout
511 * triggers in watchdog_timer_fn().
512 */
513static void watchdog(unsigned int cpu)
514{ 534{
515 __this_cpu_write(soft_lockup_hrtimer_cnt, 535 watchdog_enable(smp_processor_id());
516 __this_cpu_read(hrtimer_interrupts)); 536 return 0;
517 __touch_watchdog();
518} 537}
519 538
520static struct smp_hotplug_thread watchdog_threads = { 539static void softlockup_start_all(void)
521 .store = &softlockup_watchdog,
522 .thread_should_run = watchdog_should_run,
523 .thread_fn = watchdog,
524 .thread_comm = "watchdog/%u",
525 .setup = watchdog_enable,
526 .cleanup = watchdog_cleanup,
527 .park = watchdog_disable,
528 .unpark = watchdog_enable,
529};
530
531static void softlockup_update_smpboot_threads(void)
532{ 540{
533 lockdep_assert_held(&watchdog_mutex); 541 int cpu;
534
535 if (!softlockup_threads_initialized)
536 return;
537 542
538 smpboot_update_cpumask_percpu_thread(&watchdog_threads, 543 cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
539 &watchdog_allowed_mask); 544 for_each_cpu(cpu, &watchdog_allowed_mask)
545 smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
540} 546}
541 547
542/* Temporarily park all watchdog threads */ 548int lockup_detector_online_cpu(unsigned int cpu)
543static void softlockup_park_all_threads(void)
544{ 549{
545 cpumask_clear(&watchdog_allowed_mask); 550 watchdog_enable(cpu);
546 softlockup_update_smpboot_threads(); 551 return 0;
547} 552}
548 553
549/* Unpark enabled threads */ 554int lockup_detector_offline_cpu(unsigned int cpu)
550static void softlockup_unpark_threads(void)
551{ 555{
552 cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); 556 watchdog_disable(cpu);
553 softlockup_update_smpboot_threads(); 557 return 0;
554} 558}
555 559
556static void lockup_detector_reconfigure(void) 560static void lockup_detector_reconfigure(void)
557{ 561{
558 cpus_read_lock(); 562 cpus_read_lock();
559 watchdog_nmi_stop(); 563 watchdog_nmi_stop();
560 softlockup_park_all_threads(); 564
565 softlockup_stop_all();
561 set_sample_period(); 566 set_sample_period();
562 lockup_detector_update_enable(); 567 lockup_detector_update_enable();
563 if (watchdog_enabled && watchdog_thresh) 568 if (watchdog_enabled && watchdog_thresh)
564 softlockup_unpark_threads(); 569 softlockup_start_all();
570
565 watchdog_nmi_start(); 571 watchdog_nmi_start();
566 cpus_read_unlock(); 572 cpus_read_unlock();
567 /* 573 /*
@@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void)
580 */ 586 */
581static __init void lockup_detector_setup(void) 587static __init void lockup_detector_setup(void)
582{ 588{
583 int ret;
584
585 /* 589 /*
586 * If sysctl is off and watchdog got disabled on the command line, 590 * If sysctl is off and watchdog got disabled on the command line,
587 * nothing to do here. 591 * nothing to do here.
@@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void)
592 !(watchdog_enabled && watchdog_thresh)) 596 !(watchdog_enabled && watchdog_thresh))
593 return; 597 return;
594 598
595 ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
596 &watchdog_allowed_mask);
597 if (ret) {
598 pr_err("Failed to initialize soft lockup detector threads\n");
599 return;
600 }
601
602 mutex_lock(&watchdog_mutex); 599 mutex_lock(&watchdog_mutex);
603 softlockup_threads_initialized = true;
604 lockup_detector_reconfigure(); 600 lockup_detector_reconfigure();
601 softlockup_initialized = true;
605 mutex_unlock(&watchdog_mutex); 602 mutex_unlock(&watchdog_mutex);
606} 603}
607 604
608#else /* CONFIG_SOFTLOCKUP_DETECTOR */ 605#else /* CONFIG_SOFTLOCKUP_DETECTOR */
609static inline int watchdog_park_threads(void) { return 0; }
610static inline void watchdog_unpark_threads(void) { }
611static inline int watchdog_enable_all_cpus(void) { return 0; }
612static inline void watchdog_disable_all_cpus(void) { }
613static void lockup_detector_reconfigure(void) 606static void lockup_detector_reconfigure(void)
614{ 607{
615 cpus_read_lock(); 608 cpus_read_lock();
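
The thread-less softlockup machinery above boils down to a small completion handshake between the hrtimer, the stopper callback and the disable path. Below is a hedged sketch with hypothetical names; the real code keeps one completion and one cpu_stop_work per CPU.

#include <linux/completion.h>
#include <linux/stop_machine.h>
#include <linux/smp.h>

static struct completion work_done;     /* per CPU in the real code */
static struct cpu_stop_work stop_work;

static int demo_fn(void *data)
{
        /* ... reset the softlockup timestamp ... */
        complete(&work_done);           /* signal the disable path */
        return 0;
}

static void demo_enable(void)
{
        init_completion(&work_done);
        complete(&work_done);           /* start out in the "done" state */
}

/* Timer path: only queue new work if the previous run has finished. */
static void demo_kick(void)
{
        if (completion_done(&work_done)) {
                reinit_completion(&work_done);
                stop_one_cpu_nowait(smp_processor_id(), demo_fn, NULL,
                                    &stop_work);
        }
}

/* Disable path: wait until any in-flight work has completed. */
static void demo_disable(void)
{
        wait_for_completion(&work_done);
}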
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index e449a23e9d59..1f7020d65d0a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void)
175 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, 175 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
176 watchdog_overflow_callback, NULL); 176 watchdog_overflow_callback, NULL);
177 if (IS_ERR(evt)) { 177 if (IS_ERR(evt)) {
178 pr_info("Perf event create on CPU %d failed with %ld\n", cpu, 178 pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
179 PTR_ERR(evt)); 179 PTR_ERR(evt));
180 return PTR_ERR(evt); 180 return PTR_ERR(evt);
181 } 181 }
182 this_cpu_write(watchdog_ev, evt); 182 this_cpu_write(watchdog_ev, evt);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 04e554cae3a2..108250e4d376 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm)
604 604
605 kvm_for_each_vcpu(i, vcpu, kvm) { 605 kvm_for_each_vcpu(i, vcpu, kvm) {
606 vcpu->arch.pause = false; 606 vcpu->arch.pause = false;
607 swake_up(kvm_arch_vcpu_wq(vcpu)); 607 swake_up_one(kvm_arch_vcpu_wq(vcpu));
608 } 608 }
609} 609}
610 610
@@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
612{ 612{
613 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); 613 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
614 614
615 swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && 615 swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
616 (!vcpu->arch.pause))); 616 (!vcpu->arch.pause)));
617 617
618 if (vcpu->arch.power_off || vcpu->arch.pause) { 618 if (vcpu->arch.power_off || vcpu->arch.pause) {
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index c95ab4c5a475..9b73d3ad918a 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
155 smp_mb(); /* Make sure the above is visible */ 155 smp_mb(); /* Make sure the above is visible */
156 156
157 wq = kvm_arch_vcpu_wq(vcpu); 157 wq = kvm_arch_vcpu_wq(vcpu);
158 swake_up(wq); 158 swake_up_one(wq);
159 159
160 return PSCI_RET_SUCCESS; 160 return PSCI_RET_SUCCESS;
161} 161}
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 57bcb27dcf30..23c2519c5b32 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work)
107 trace_kvm_async_pf_completed(addr, gva); 107 trace_kvm_async_pf_completed(addr, gva);
108 108
109 if (swq_has_sleeper(&vcpu->wq)) 109 if (swq_has_sleeper(&vcpu->wq))
110 swake_up(&vcpu->wq); 110 swake_up_one(&vcpu->wq);
111 111
112 mmput(mm); 112 mmput(mm);
113 kvm_put_kvm(vcpu->kvm); 113 kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..3d233ebfbee9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2172 kvm_arch_vcpu_blocking(vcpu); 2172 kvm_arch_vcpu_blocking(vcpu);
2173 2173
2174 for (;;) { 2174 for (;;) {
2175 prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2175 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2176 2176
2177 if (kvm_vcpu_check_block(vcpu) < 0) 2177 if (kvm_vcpu_check_block(vcpu) < 0)
2178 break; 2178 break;
@@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
2214 2214
2215 wqp = kvm_arch_vcpu_wq(vcpu); 2215 wqp = kvm_arch_vcpu_wq(vcpu);
2216 if (swq_has_sleeper(wqp)) { 2216 if (swq_has_sleeper(wqp)) {
2217 swake_up(wqp); 2217 swake_up_one(wqp);
2218 ++vcpu->stat.halt_wakeup; 2218 ++vcpu->stat.halt_wakeup;
2219 return true; 2219 return true;
2220 } 2220 }
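
For completeness, the open-coded wait loop that kvm_vcpu_block() builds around prepare_to_swait_exclusive() follows the usual prepare/check/schedule/finish shape. A hedged sketch with a hypothetical queue and predicate:

#include <linux/swait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>

/* Hypothetical wait loop mirroring the kvm_vcpu_block() hunk above. */
static void wait_for_event(struct swait_queue_head *wq, bool (*done)(void))
{
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
                prepare_to_swait_exclusive(wq, &wait, TASK_INTERRUPTIBLE);

                if (done() || signal_pending(current))
                        break;

                schedule();
        }

        finish_swait(wq, &wait);
}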