author     Linus Torvalds <torvalds@linux-foundation.org>  2018-08-13 14:25:07 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-08-13 14:25:07 -0400
commit     f7951c33f0fed14ee26651a70a46899a59a31e18 (patch)
tree       dff372035ceaa7b3a01e2f15c885ff0ff2510e68
parent     2406fb8d94fb17fee3ace0c09427c08825eacb16 (diff)
parent     1b6266ebe3da8198e9a02fbad77bbb56e2f7ce2e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner:
- Cleanup and improvement of NUMA balancing
- Refactoring and improvements to the PELT (Per Entity Load Tracking)
code
- Watchdog simplification and related cleanups
- The usual pile of small incremental fixes and improvements
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
watchdog: Reduce message verbosity
stop_machine: Reflow cpu_stop_queue_two_works()
sched/numa: Move task_numa_placement() closer to numa_migrate_preferred()
sched/numa: Use group_weights to identify if migration degrades locality
sched/numa: Update the scan period without holding the numa_group lock
sched/numa: Remove numa_has_capacity()
sched/numa: Modify migrate_swap() to accept additional parameters
sched/numa: Remove unused task_capacity from 'struct numa_stats'
sched/numa: Skip nodes that are at 'hoplimit'
sched/debug: Reverse the order of printing faults
sched/numa: Use task faults only if numa_group is not yet set up
sched/numa: Set preferred_node based on best_cpu
sched/numa: Simplify load_too_imbalanced()
sched/numa: Evaluate move once per node
sched/numa: Remove redundant field
sched/debug: Show the sum wait time of a task group
sched/fair: Remove #ifdefs from scale_rt_capacity()
sched/core: Remove get_cpu() from sched_fork()
sched/cpufreq: Clarify sugov_get_util()
sched/sysctl: Remove unused sched_time_avg_ms sysctl
...
38 files changed, 1009 insertions, 870 deletions
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7cd76f93a438..f7ea8e21656b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, | |||
515 | dvcpu->arch.wait = 0; | 515 | dvcpu->arch.wait = 0; |
516 | 516 | ||
517 | if (swq_has_sleeper(&dvcpu->wq)) | 517 | if (swq_has_sleeper(&dvcpu->wq)) |
518 | swake_up(&dvcpu->wq); | 518 | swake_up_one(&dvcpu->wq); |
519 | 519 | ||
520 | return 0; | 520 | return 0; |
521 | } | 521 | } |
@@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data) | |||
1204 | 1204 | ||
1205 | vcpu->arch.wait = 0; | 1205 | vcpu->arch.wait = 0; |
1206 | if (swq_has_sleeper(&vcpu->wq)) | 1206 | if (swq_has_sleeper(&vcpu->wq)) |
1207 | swake_up(&vcpu->wq); | 1207 | swake_up_one(&vcpu->wq); |
1208 | } | 1208 | } |
1209 | 1209 | ||
1210 | /* low level hrtimer wake routine */ | 1210 | /* low level hrtimer wake routine */ |
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de686b340f4a..ee4a8854985e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) | |||
216 | 216 | ||
217 | wqp = kvm_arch_vcpu_wq(vcpu); | 217 | wqp = kvm_arch_vcpu_wq(vcpu); |
218 | if (swq_has_sleeper(wqp)) { | 218 | if (swq_has_sleeper(wqp)) { |
219 | swake_up(wqp); | 219 | swake_up_one(wqp); |
220 | ++vcpu->stat.halt_wakeup; | 220 | ++vcpu->stat.halt_wakeup; |
221 | } | 221 | } |
222 | 222 | ||
@@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) | |||
3188 | } | 3188 | } |
3189 | } | 3189 | } |
3190 | 3190 | ||
3191 | prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); | 3191 | prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE); |
3192 | 3192 | ||
3193 | if (kvmppc_vcore_check_block(vc)) { | 3193 | if (kvmppc_vcore_check_block(vc)) { |
3194 | finish_swait(&vc->wq, &wait); | 3194 | finish_swait(&vc->wq, &wait); |
@@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) | |||
3311 | kvmppc_start_thread(vcpu, vc); | 3311 | kvmppc_start_thread(vcpu, vc); |
3312 | trace_kvm_guest_enter(vcpu); | 3312 | trace_kvm_guest_enter(vcpu); |
3313 | } else if (vc->vcore_state == VCORE_SLEEPING) { | 3313 | } else if (vc->vcore_state == VCORE_SLEEPING) { |
3314 | swake_up(&vc->wq); | 3314 | swake_up_one(&vc->wq); |
3315 | } | 3315 | } |
3316 | 3316 | ||
3317 | } | 3317 | } |
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index daa09f89ca2d..fcb55b02990e 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) | |||
1145 | * yield-candidate. | 1145 | * yield-candidate. |
1146 | */ | 1146 | */ |
1147 | vcpu->preempted = true; | 1147 | vcpu->preempted = true; |
1148 | swake_up(&vcpu->wq); | 1148 | swake_up_one(&vcpu->wq); |
1149 | vcpu->stat.halt_wakeup++; | 1149 | vcpu->stat.halt_wakeup++; |
1150 | } | 1150 | } |
1151 | /* | 1151 | /* |
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b818af..a37bda38d205 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) | |||
154 | 154 | ||
155 | for (;;) { | 155 | for (;;) { |
156 | if (!n.halted) | 156 | if (!n.halted) |
157 | prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); | 157 | prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); |
158 | if (hlist_unhashed(&n.link)) | 158 | if (hlist_unhashed(&n.link)) |
159 | break; | 159 | break; |
160 | 160 | ||
@@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) | |||
188 | if (n->halted) | 188 | if (n->halted) |
189 | smp_send_reschedule(n->cpu); | 189 | smp_send_reschedule(n->cpu); |
190 | else if (swq_has_sleeper(&n->wq)) | 190 | else if (swq_has_sleeper(&n->wq)) |
191 | swake_up(&n->wq); | 191 | swake_up_one(&n->wq); |
192 | } | 192 | } |
193 | 193 | ||
194 | static void apf_task_wake_all(void) | 194 | static void apf_task_wake_all(void) |
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b5cd8465d44f..d536d457517b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) | |||
1379 | * using swait_active() is safe. | 1379 | * using swait_active() is safe. |
1380 | */ | 1380 | */ |
1381 | if (swait_active(q)) | 1381 | if (swait_active(q)) |
1382 | swake_up(q); | 1382 | swake_up_one(q); |
1383 | 1383 | ||
1384 | if (apic_lvtt_tscdeadline(apic)) | 1384 | if (apic_lvtt_tscdeadline(apic)) |
1385 | ktimer->expired_tscdeadline = ktimer->tscdeadline; | 1385 | ktimer->expired_tscdeadline = ktimer->tscdeadline; |
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8796ba387152..4cf06a64bc02 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -164,6 +164,7 @@ enum cpuhp_state { | |||
164 | CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, | 164 | CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, |
165 | CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, | 165 | CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, |
166 | CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, | 166 | CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, |
167 | CPUHP_AP_WATCHDOG_ONLINE, | ||
167 | CPUHP_AP_WORKQUEUE_ONLINE, | 168 | CPUHP_AP_WORKQUEUE_ONLINE, |
168 | CPUHP_AP_RCUTREE_ONLINE, | 169 | CPUHP_AP_RCUTREE_ONLINE, |
169 | CPUHP_AP_ONLINE_DYN, | 170 | CPUHP_AP_ONLINE_DYN, |
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b8d868d23e79..08f9247e9827 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void); | |||
45 | extern void touch_softlockup_watchdog_sync(void); | 45 | extern void touch_softlockup_watchdog_sync(void); |
46 | extern void touch_all_softlockup_watchdogs(void); | 46 | extern void touch_all_softlockup_watchdogs(void); |
47 | extern unsigned int softlockup_panic; | 47 | extern unsigned int softlockup_panic; |
48 | #else | 48 | |
49 | extern int lockup_detector_online_cpu(unsigned int cpu); | ||
50 | extern int lockup_detector_offline_cpu(unsigned int cpu); | ||
51 | #else /* CONFIG_SOFTLOCKUP_DETECTOR */ | ||
49 | static inline void touch_softlockup_watchdog_sched(void) { } | 52 | static inline void touch_softlockup_watchdog_sched(void) { } |
50 | static inline void touch_softlockup_watchdog(void) { } | 53 | static inline void touch_softlockup_watchdog(void) { } |
51 | static inline void touch_softlockup_watchdog_sync(void) { } | 54 | static inline void touch_softlockup_watchdog_sync(void) { } |
52 | static inline void touch_all_softlockup_watchdogs(void) { } | 55 | static inline void touch_all_softlockup_watchdogs(void) { } |
53 | #endif | 56 | |
57 | #define lockup_detector_online_cpu NULL | ||
58 | #define lockup_detector_offline_cpu NULL | ||
59 | #endif /* CONFIG_SOFTLOCKUP_DETECTOR */ | ||
54 | 60 | ||
55 | #ifdef CONFIG_DETECT_HUNG_TASK | 61 | #ifdef CONFIG_DETECT_HUNG_TASK |
56 | void reset_hung_task_detector(void); | 62 | void reset_hung_task_detector(void); |
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..e0f4f56c9310 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1017,7 +1017,6 @@ struct task_struct { | |||
1017 | u64 last_sum_exec_runtime; | 1017 | u64 last_sum_exec_runtime; |
1018 | struct callback_head numa_work; | 1018 | struct callback_head numa_work; |
1019 | 1019 | ||
1020 | struct list_head numa_entry; | ||
1021 | struct numa_group *numa_group; | 1020 | struct numa_group *numa_group; |
1022 | 1021 | ||
1023 | /* | 1022 | /* |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 1c1a1512ec55..913488d828cb 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; | |||
40 | #ifdef CONFIG_SCHED_DEBUG | 40 | #ifdef CONFIG_SCHED_DEBUG |
41 | extern __read_mostly unsigned int sysctl_sched_migration_cost; | 41 | extern __read_mostly unsigned int sysctl_sched_migration_cost; |
42 | extern __read_mostly unsigned int sysctl_sched_nr_migrate; | 42 | extern __read_mostly unsigned int sysctl_sched_nr_migrate; |
43 | extern __read_mostly unsigned int sysctl_sched_time_avg; | ||
44 | 43 | ||
45 | int sched_proc_update_handler(struct ctl_table *table, int write, | 44 | int sched_proc_update_handler(struct ctl_table *table, int write, |
46 | void __user *buffer, size_t *length, | 45 | void __user *buffer, size_t *length, |
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index c174844cf663..d0884b525001 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -25,8 +25,6 @@ struct smpboot_thread_data; | |||
25 | * parked (cpu offline) | 25 | * parked (cpu offline) |
26 | * @unpark: Optional unpark function, called when the thread is | 26 | * @unpark: Optional unpark function, called when the thread is |
27 | * unparked (cpu online) | 27 | * unparked (cpu online) |
28 | * @cpumask: Internal state. To update which threads are unparked, | ||
29 | * call smpboot_update_cpumask_percpu_thread(). | ||
30 | * @selfparking: Thread is not parked by the park function. | 28 | * @selfparking: Thread is not parked by the park function. |
31 | * @thread_comm: The base name of the thread | 29 | * @thread_comm: The base name of the thread |
32 | */ | 30 | */ |
@@ -40,23 +38,12 @@ struct smp_hotplug_thread { | |||
40 | void (*cleanup)(unsigned int cpu, bool online); | 38 | void (*cleanup)(unsigned int cpu, bool online); |
41 | void (*park)(unsigned int cpu); | 39 | void (*park)(unsigned int cpu); |
42 | void (*unpark)(unsigned int cpu); | 40 | void (*unpark)(unsigned int cpu); |
43 | cpumask_var_t cpumask; | ||
44 | bool selfparking; | 41 | bool selfparking; |
45 | const char *thread_comm; | 42 | const char *thread_comm; |
46 | }; | 43 | }; |
47 | 44 | ||
48 | int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, | 45 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); |
49 | const struct cpumask *cpumask); | ||
50 | |||
51 | static inline int | ||
52 | smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | ||
53 | { | ||
54 | return smpboot_register_percpu_thread_cpumask(plug_thread, | ||
55 | cpu_possible_mask); | ||
56 | } | ||
57 | 46 | ||
58 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); | 47 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); |
59 | void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, | ||
60 | const struct cpumask *); | ||
61 | 48 | ||
62 | #endif | 49 | #endif |
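With the cpumask field and the *_cpumask() registration variant gone (the watchdog now hooks the CPU hotplug state added above instead), the remaining smpboot API reduces to the sketch below. This is an illustrative, hypothetical per-CPU thread (example_* names), not code from this series.

```c
/*
 * Hedged sketch of the simplified smpboot API: a per-CPU kthread is
 * created for every possible CPU and parked/unparked with hotplug.
 * setup/cleanup/park/unpark callbacks are optional and omitted here.
 */
#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(struct task_struct *, example_task);
static DEFINE_PER_CPU(unsigned int, example_events);

static int example_should_run(unsigned int cpu)
{
	return this_cpu_read(example_events) != 0;
}

static void example_thread_fn(unsigned int cpu)
{
	this_cpu_write(example_events, 0);
	/* do the per-CPU work here */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_task,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

static int __init example_init(void)
{
	return smpboot_register_percpu_thread(&example_threads);
}
```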
diff --git a/include/linux/swait.h b/include/linux/swait.h
index bf8cb0dee23c..73e06e9986d4 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -16,7 +16,7 @@ | |||
16 | * wait-queues, but the semantics are actually completely different, and | 16 | * wait-queues, but the semantics are actually completely different, and |
17 | * every single user we have ever had has been buggy (or pointless). | 17 | * every single user we have ever had has been buggy (or pointless). |
18 | * | 18 | * |
19 | * A "swake_up()" only wakes up _one_ waiter, which is not at all what | 19 | * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what |
20 | * "wake_up()" does, and has led to problems. In other cases, it has | 20 | * "wake_up()" does, and has led to problems. In other cases, it has |
21 | * been fine, because there's only ever one waiter (kvm), but in that | 21 | * been fine, because there's only ever one waiter (kvm), but in that |
22 | * case gthe whole "simple" wait-queue is just pointless to begin with, | 22 | * case gthe whole "simple" wait-queue is just pointless to begin with, |
@@ -38,8 +38,8 @@ | |||
38 | * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right | 38 | * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right |
39 | * sleeper state. | 39 | * sleeper state. |
40 | * | 40 | * |
41 | * - the exclusive mode; because this requires preserving the list order | 41 | * - the !exclusive mode; because that leads to O(n) wakeups, everything is |
42 | * and this is hard. | 42 | * exclusive. |
43 | * | 43 | * |
44 | * - custom wake callback functions; because you cannot give any guarantees | 44 | * - custom wake callback functions; because you cannot give any guarantees |
45 | * about random code. This also allows swait to be used in RT, such that | 45 | * about random code. This also allows swait to be used in RT, such that |
@@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name | |||
115 | * CPU0 - waker CPU1 - waiter | 115 | * CPU0 - waker CPU1 - waiter |
116 | * | 116 | * |
117 | * for (;;) { | 117 | * for (;;) { |
118 | * @cond = true; prepare_to_swait(&wq_head, &wait, state); | 118 | * @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state); |
119 | * smp_mb(); // smp_mb() from set_current_state() | 119 | * smp_mb(); // smp_mb() from set_current_state() |
120 | * if (swait_active(wq_head)) if (@cond) | 120 | * if (swait_active(wq_head)) if (@cond) |
121 | * wake_up(wq_head); break; | 121 | * wake_up(wq_head); break; |
@@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq) | |||
157 | return swait_active(wq); | 157 | return swait_active(wq); |
158 | } | 158 | } |
159 | 159 | ||
160 | extern void swake_up(struct swait_queue_head *q); | 160 | extern void swake_up_one(struct swait_queue_head *q); |
161 | extern void swake_up_all(struct swait_queue_head *q); | 161 | extern void swake_up_all(struct swait_queue_head *q); |
162 | extern void swake_up_locked(struct swait_queue_head *q); | 162 | extern void swake_up_locked(struct swait_queue_head *q); |
163 | 163 | ||
164 | extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); | 164 | extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state); |
165 | extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); | ||
166 | extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); | 165 | extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); |
167 | 166 | ||
168 | extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | 167 | extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); |
169 | extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | 168 | extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); |
170 | 169 | ||
171 | /* as per ___wait_event() but for swait, therefore "exclusive == 0" */ | 170 | /* as per ___wait_event() but for swait, therefore "exclusive == 1" */ |
172 | #define ___swait_event(wq, condition, state, ret, cmd) \ | 171 | #define ___swait_event(wq, condition, state, ret, cmd) \ |
173 | ({ \ | 172 | ({ \ |
173 | __label__ __out; \ | ||
174 | struct swait_queue __wait; \ | 174 | struct swait_queue __wait; \ |
175 | long __ret = ret; \ | 175 | long __ret = ret; \ |
176 | \ | 176 | \ |
@@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); | |||
183 | \ | 183 | \ |
184 | if (___wait_is_interruptible(state) && __int) { \ | 184 | if (___wait_is_interruptible(state) && __int) { \ |
185 | __ret = __int; \ | 185 | __ret = __int; \ |
186 | break; \ | 186 | goto __out; \ |
187 | } \ | 187 | } \ |
188 | \ | 188 | \ |
189 | cmd; \ | 189 | cmd; \ |
190 | } \ | 190 | } \ |
191 | finish_swait(&wq, &__wait); \ | 191 | finish_swait(&wq, &__wait); \ |
192 | __ret; \ | 192 | __out: __ret; \ |
193 | }) | 193 | }) |
194 | 194 | ||
195 | #define __swait_event(wq, condition) \ | 195 | #define __swait_event(wq, condition) \ |
196 | (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ | 196 | (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ |
197 | schedule()) | 197 | schedule()) |
198 | 198 | ||
199 | #define swait_event(wq, condition) \ | 199 | #define swait_event_exclusive(wq, condition) \ |
200 | do { \ | 200 | do { \ |
201 | if (condition) \ | 201 | if (condition) \ |
202 | break; \ | 202 | break; \ |
@@ -208,7 +208,7 @@ do { \ | |||
208 | TASK_UNINTERRUPTIBLE, timeout, \ | 208 | TASK_UNINTERRUPTIBLE, timeout, \ |
209 | __ret = schedule_timeout(__ret)) | 209 | __ret = schedule_timeout(__ret)) |
210 | 210 | ||
211 | #define swait_event_timeout(wq, condition, timeout) \ | 211 | #define swait_event_timeout_exclusive(wq, condition, timeout) \ |
212 | ({ \ | 212 | ({ \ |
213 | long __ret = timeout; \ | 213 | long __ret = timeout; \ |
214 | if (!___wait_cond_timeout(condition)) \ | 214 | if (!___wait_cond_timeout(condition)) \ |
@@ -220,7 +220,7 @@ do { \ | |||
220 | ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ | 220 | ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ |
221 | schedule()) | 221 | schedule()) |
222 | 222 | ||
223 | #define swait_event_interruptible(wq, condition) \ | 223 | #define swait_event_interruptible_exclusive(wq, condition) \ |
224 | ({ \ | 224 | ({ \ |
225 | int __ret = 0; \ | 225 | int __ret = 0; \ |
226 | if (!(condition)) \ | 226 | if (!(condition)) \ |
@@ -233,7 +233,7 @@ do { \ | |||
233 | TASK_INTERRUPTIBLE, timeout, \ | 233 | TASK_INTERRUPTIBLE, timeout, \ |
234 | __ret = schedule_timeout(__ret)) | 234 | __ret = schedule_timeout(__ret)) |
235 | 235 | ||
236 | #define swait_event_interruptible_timeout(wq, condition, timeout) \ | 236 | #define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\ |
237 | ({ \ | 237 | ({ \ |
238 | long __ret = timeout; \ | 238 | long __ret = timeout; \ |
239 | if (!___wait_cond_timeout(condition)) \ | 239 | if (!___wait_cond_timeout(condition)) \ |
@@ -246,7 +246,7 @@ do { \ | |||
246 | (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) | 246 | (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) |
247 | 247 | ||
248 | /** | 248 | /** |
249 | * swait_event_idle - wait without system load contribution | 249 | * swait_event_idle_exclusive - wait without system load contribution |
250 | * @wq: the waitqueue to wait on | 250 | * @wq: the waitqueue to wait on |
251 | * @condition: a C expression for the event to wait for | 251 | * @condition: a C expression for the event to wait for |
252 | * | 252 | * |
@@ -257,7 +257,7 @@ do { \ | |||
257 | * condition and doesn't want to contribute to system load. Signals are | 257 | * condition and doesn't want to contribute to system load. Signals are |
258 | * ignored. | 258 | * ignored. |
259 | */ | 259 | */ |
260 | #define swait_event_idle(wq, condition) \ | 260 | #define swait_event_idle_exclusive(wq, condition) \ |
261 | do { \ | 261 | do { \ |
262 | if (condition) \ | 262 | if (condition) \ |
263 | break; \ | 263 | break; \ |
@@ -270,7 +270,7 @@ do { \ | |||
270 | __ret = schedule_timeout(__ret)) | 270 | __ret = schedule_timeout(__ret)) |
271 | 271 | ||
272 | /** | 272 | /** |
273 | * swait_event_idle_timeout - wait up to timeout without load contribution | 273 | * swait_event_idle_timeout_exclusive - wait up to timeout without load contribution |
274 | * @wq: the waitqueue to wait on | 274 | * @wq: the waitqueue to wait on |
275 | * @condition: a C expression for the event to wait for | 275 | * @condition: a C expression for the event to wait for |
276 | * @timeout: timeout at which we'll give up in jiffies | 276 | * @timeout: timeout at which we'll give up in jiffies |
@@ -288,7 +288,7 @@ do { \ | |||
288 | * or the remaining jiffies (at least 1) if the @condition evaluated | 288 | * or the remaining jiffies (at least 1) if the @condition evaluated |
289 | * to %true before the @timeout elapsed. | 289 | * to %true before the @timeout elapsed. |
290 | */ | 290 | */ |
291 | #define swait_event_idle_timeout(wq, condition, timeout) \ | 291 | #define swait_event_idle_timeout_exclusive(wq, condition, timeout) \ |
292 | ({ \ | 292 | ({ \ |
293 | long __ret = timeout; \ | 293 | long __ret = timeout; \ |
294 | if (!___wait_cond_timeout(condition)) \ | 294 | if (!___wait_cond_timeout(condition)) \ |
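For reference, a minimal kernel-style sketch of how the renamed, exclusive-only API is meant to be used. The example_* names are hypothetical; the waker/waiter ordering follows the pattern documented in the header comment above.

```c
/*
 * Hedged sketch of the renamed swait API: swake_up_one() wakes exactly
 * one exclusive waiter, swake_up_all() wakes them all.
 */
#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
static bool example_ready;

/* waiter: sleep until example_ready is set, interruptible by signals */
static int example_wait(void)
{
	return swait_event_interruptible_exclusive(example_wq,
						   READ_ONCE(example_ready));
}

/* waker: publish the condition, then wake a single exclusive waiter */
static void example_wake(void)
{
	WRITE_ONCE(example_ready, true);
	/* pairs with the barrier from set_current_state() inside
	 * prepare_to_swait_exclusive(), as described above */
	smp_mb();
	if (swq_has_sleeper(&example_wq))
		swake_up_one(&example_wq);
}
```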
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2f8f338e77cf..15be70aae8ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { | |||
1344 | .startup.single = perf_event_init_cpu, | 1344 | .startup.single = perf_event_init_cpu, |
1345 | .teardown.single = perf_event_exit_cpu, | 1345 | .teardown.single = perf_event_exit_cpu, |
1346 | }, | 1346 | }, |
1347 | [CPUHP_AP_WATCHDOG_ONLINE] = { | ||
1348 | .name = "lockup_detector:online", | ||
1349 | .startup.single = lockup_detector_online_cpu, | ||
1350 | .teardown.single = lockup_detector_offline_cpu, | ||
1351 | }, | ||
1347 | [CPUHP_AP_WORKQUEUE_ONLINE] = { | 1352 | [CPUHP_AP_WORKQUEUE_ONLINE] = { |
1348 | .name = "workqueue:online", | 1353 | .name = "workqueue:online", |
1349 | .startup.single = workqueue_online_cpu, | 1354 | .startup.single = workqueue_online_cpu, |
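The new table entry wires lockup_detector_online_cpu()/lockup_detector_offline_cpu() into the hotplug state machine; the NULL stubs added to nmi.h keep the entry valid when CONFIG_SOFTLOCKUP_DETECTOR is off, since NULL callbacks are simply skipped. Code that cannot claim a fixed enum cpuhp_state slot would use a dynamically allocated online state instead; a hedged sketch with hypothetical example_* names:

```c
/*
 * Illustrative sketch (not from this patch): a dynamically allocated
 * hotplug state whose startup/teardown callbacks play the same role as
 * lockup_detector_online_cpu()/lockup_detector_offline_cpu() above.
 */
#include <linux/cpuhotplug.h>

static int example_cpu_online(unsigned int cpu)
{
	/* per-CPU setup when @cpu comes online */
	return 0;
}

static int example_cpu_offline(unsigned int cpu)
{
	/* per-CPU teardown before @cpu goes offline */
	return 0;
}

static int __init example_init(void)
{
	int ret;

	/* CPUHP_AP_ONLINE_DYN returns the allocated state (> 0) on success */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				example_cpu_online, example_cpu_offline);
	return ret < 0 ? ret : 0;
}
```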
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 486dedbd9af5..087d18d771b5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self) | |||
190 | if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) | 190 | if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) |
191 | break; | 191 | break; |
192 | 192 | ||
193 | complete_all(&self->parked); | 193 | complete(&self->parked); |
194 | schedule(); | 194 | schedule(); |
195 | } | 195 | } |
196 | __set_current_state(TASK_RUNNING); | 196 | __set_current_state(TASK_RUNNING); |
@@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k) | |||
471 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | 471 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) |
472 | __kthread_bind(k, kthread->cpu, TASK_PARKED); | 472 | __kthread_bind(k, kthread->cpu, TASK_PARKED); |
473 | 473 | ||
474 | reinit_completion(&kthread->parked); | ||
475 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 474 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
476 | /* | 475 | /* |
477 | * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. | 476 | * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. |
@@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k) | |||
499 | if (WARN_ON(k->flags & PF_EXITING)) | 498 | if (WARN_ON(k->flags & PF_EXITING)) |
500 | return -ENOSYS; | 499 | return -ENOSYS; |
501 | 500 | ||
501 | if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) | ||
502 | return -EBUSY; | ||
503 | |||
502 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 504 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
503 | if (k != current) { | 505 | if (k != current) { |
504 | wake_up_process(k); | 506 | wake_up_process(k); |
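A sketch of the canonical park-aware kthread loop these changes harden (hypothetical example_* names): after this series, kthread_park() on an already-parked thread warns and returns -EBUSY, and each park cycle consumes exactly one completion.

```c
/*
 * Hedged sketch of a park-aware kthread and its typical lifecycle.
 */
#include <linux/kthread.h>
#include <linux/delay.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park())
			kthread_parkme();	/* sleeps in TASK_PARKED until unparked */
		msleep(100);			/* real per-iteration work goes here */
	}
	return 0;
}

/* typical lifecycle, e.g. driven from hotplug or module teardown */
static void example_lifecycle(struct task_struct *tsk)
{
	kthread_park(tsk);	/* 0 on success; -EBUSY if already parked */
	kthread_unpark(tsk);	/* thread resumes running */
	kthread_stop(tsk);	/* final shutdown */
}
```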
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..70178f6ffdc4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -92,7 +92,7 @@ static void s2idle_enter(void) | |||
92 | /* Push all the CPUs into the idle loop. */ | 92 | /* Push all the CPUs into the idle loop. */ |
93 | wake_up_all_idle_cpus(); | 93 | wake_up_all_idle_cpus(); |
94 | /* Make the current CPU wait so it can enter the idle loop too. */ | 94 | /* Make the current CPU wait so it can enter the idle loop too. */ |
95 | swait_event(s2idle_wait_head, | 95 | swait_event_exclusive(s2idle_wait_head, |
96 | s2idle_state == S2IDLE_STATE_WAKE); | 96 | s2idle_state == S2IDLE_STATE_WAKE); |
97 | 97 | ||
98 | cpuidle_pause(); | 98 | cpuidle_pause(); |
@@ -160,7 +160,7 @@ void s2idle_wake(void) | |||
160 | raw_spin_lock_irqsave(&s2idle_lock, flags); | 160 | raw_spin_lock_irqsave(&s2idle_lock, flags); |
161 | if (s2idle_state > S2IDLE_STATE_NONE) { | 161 | if (s2idle_state > S2IDLE_STATE_NONE) { |
162 | s2idle_state = S2IDLE_STATE_WAKE; | 162 | s2idle_state = S2IDLE_STATE_WAKE; |
163 | swake_up(&s2idle_wait_head); | 163 | swake_up_one(&s2idle_wait_head); |
164 | } | 164 | } |
165 | raw_spin_unlock_irqrestore(&s2idle_lock, flags); | 165 | raw_spin_unlock_irqrestore(&s2idle_lock, flags); |
166 | } | 166 | } |
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 622792abe41a..04fc2ed71af8 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) | |||
110 | 110 | ||
111 | WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); | 111 | WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); |
112 | if (!newval && READ_ONCE(sp->srcu_gp_waiting)) | 112 | if (!newval && READ_ONCE(sp->srcu_gp_waiting)) |
113 | swake_up(&sp->srcu_wq); | 113 | swake_up_one(&sp->srcu_wq); |
114 | } | 114 | } |
115 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 115 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
116 | 116 | ||
@@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp) | |||
140 | idx = sp->srcu_idx; | 140 | idx = sp->srcu_idx; |
141 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); | 141 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); |
142 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ | 142 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ |
143 | swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); | 143 | swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); |
144 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ | 144 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ |
145 | 145 | ||
146 | /* Invoke the callbacks we removed above. */ | 146 | /* Invoke the callbacks we removed above. */ |
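For context, a hedged sketch of the reader/updater pattern whose grace period srcu_drive_gp() waits out above (now via swait_event_exclusive()); the example_* names are illustrative and a single updater is assumed.

```c
/*
 * Illustrative SRCU usage sketch, not code from this patch.
 */
#include <linux/srcu.h>
#include <linux/slab.h>

DEFINE_SRCU(example_srcu);
static int __rcu *example_ptr;

static int example_read(void)
{
	int idx, val = 0;
	int *p;

	idx = srcu_read_lock(&example_srcu);
	p = srcu_dereference(example_ptr, &example_srcu);
	if (p)
		val = *p;
	srcu_read_unlock(&example_srcu, idx);
	return val;
}

static void example_update(int *new)
{
	/* assumes the caller serializes updaters */
	int *old = rcu_dereference_protected(example_ptr, 1);

	rcu_assign_pointer(example_ptr, new);
	synchronize_srcu(&example_srcu);	/* waits for pre-existing readers */
	kfree(old);
}
```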
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6930934e8b9f..0b760c1369f7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) | |||
1701 | !READ_ONCE(rsp->gp_flags) || | 1701 | !READ_ONCE(rsp->gp_flags) || |
1702 | !rsp->gp_kthread) | 1702 | !rsp->gp_kthread) |
1703 | return; | 1703 | return; |
1704 | swake_up(&rsp->gp_wq); | 1704 | swake_up_one(&rsp->gp_wq); |
1705 | } | 1705 | } |
1706 | 1706 | ||
1707 | /* | 1707 | /* |
@@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) | |||
2015 | } | 2015 | } |
2016 | 2016 | ||
2017 | /* | 2017 | /* |
2018 | * Helper function for swait_event_idle() wakeup at force-quiescent-state | 2018 | * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state |
2019 | * time. | 2019 | * time. |
2020 | */ | 2020 | */ |
2021 | static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) | 2021 | static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) |
@@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2163 | READ_ONCE(rsp->gp_seq), | 2163 | READ_ONCE(rsp->gp_seq), |
2164 | TPS("reqwait")); | 2164 | TPS("reqwait")); |
2165 | rsp->gp_state = RCU_GP_WAIT_GPS; | 2165 | rsp->gp_state = RCU_GP_WAIT_GPS; |
2166 | swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & | 2166 | swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & |
2167 | RCU_GP_FLAG_INIT); | 2167 | RCU_GP_FLAG_INIT); |
2168 | rsp->gp_state = RCU_GP_DONE_GPS; | 2168 | rsp->gp_state = RCU_GP_DONE_GPS; |
2169 | /* Locking provides needed memory barrier. */ | 2169 | /* Locking provides needed memory barrier. */ |
@@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2191 | READ_ONCE(rsp->gp_seq), | 2191 | READ_ONCE(rsp->gp_seq), |
2192 | TPS("fqswait")); | 2192 | TPS("fqswait")); |
2193 | rsp->gp_state = RCU_GP_WAIT_FQS; | 2193 | rsp->gp_state = RCU_GP_WAIT_FQS; |
2194 | ret = swait_event_idle_timeout(rsp->gp_wq, | 2194 | ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, |
2195 | rcu_gp_fqs_check_wake(rsp, &gf), j); | 2195 | rcu_gp_fqs_check_wake(rsp, &gf), j); |
2196 | rsp->gp_state = RCU_GP_DOING_FQS; | 2196 | rsp->gp_state = RCU_GP_DOING_FQS; |
2197 | /* Locking provides needed memory barriers. */ | 2197 | /* Locking provides needed memory barriers. */ |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index b3df3b770afb..0b2c2ad69629 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
212 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 212 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
213 | if (wake) { | 213 | if (wake) { |
214 | smp_mb(); /* EGP done before wake_up(). */ | 214 | smp_mb(); /* EGP done before wake_up(). */ |
215 | swake_up(&rsp->expedited_wq); | 215 | swake_up_one(&rsp->expedited_wq); |
216 | } | 216 | } |
217 | break; | 217 | break; |
218 | } | 218 | } |
@@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
526 | jiffies_start = jiffies; | 526 | jiffies_start = jiffies; |
527 | 527 | ||
528 | for (;;) { | 528 | for (;;) { |
529 | ret = swait_event_timeout( | 529 | ret = swait_event_timeout_exclusive( |
530 | rsp->expedited_wq, | 530 | rsp->expedited_wq, |
531 | sync_rcu_preempt_exp_done_unlocked(rnp_root), | 531 | sync_rcu_preempt_exp_done_unlocked(rnp_root), |
532 | jiffies_stall); | 532 | jiffies_stall); |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c1b17f5b9361..a97c20ea9bce 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force, | |||
1926 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); | 1926 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); |
1927 | del_timer(&rdp->nocb_timer); | 1927 | del_timer(&rdp->nocb_timer); |
1928 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 1928 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); |
1929 | smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ | 1929 | smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ |
1930 | swake_up(&rdp_leader->nocb_wq); | 1930 | swake_up_one(&rdp_leader->nocb_wq); |
1931 | } else { | 1931 | } else { |
1932 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 1932 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); |
1933 | } | 1933 | } |
@@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2159 | */ | 2159 | */ |
2160 | trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); | 2160 | trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); |
2161 | for (;;) { | 2161 | for (;;) { |
2162 | swait_event_interruptible( | 2162 | swait_event_interruptible_exclusive( |
2163 | rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], | 2163 | rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], |
2164 | (d = rcu_seq_done(&rnp->gp_seq, c))); | 2164 | (d = rcu_seq_done(&rnp->gp_seq, c))); |
2165 | if (likely(d)) | 2165 | if (likely(d)) |
@@ -2188,7 +2188,7 @@ wait_again: | |||
2188 | /* Wait for callbacks to appear. */ | 2188 | /* Wait for callbacks to appear. */ |
2189 | if (!rcu_nocb_poll) { | 2189 | if (!rcu_nocb_poll) { |
2190 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); | 2190 | trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); |
2191 | swait_event_interruptible(my_rdp->nocb_wq, | 2191 | swait_event_interruptible_exclusive(my_rdp->nocb_wq, |
2192 | !READ_ONCE(my_rdp->nocb_leader_sleep)); | 2192 | !READ_ONCE(my_rdp->nocb_leader_sleep)); |
2193 | raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); | 2193 | raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); |
2194 | my_rdp->nocb_leader_sleep = true; | 2194 | my_rdp->nocb_leader_sleep = true; |
@@ -2253,7 +2253,7 @@ wait_again: | |||
2253 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 2253 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); |
2254 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | 2254 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { |
2255 | /* List was empty, so wake up the follower. */ | 2255 | /* List was empty, so wake up the follower. */ |
2256 | swake_up(&rdp->nocb_wq); | 2256 | swake_up_one(&rdp->nocb_wq); |
2257 | } | 2257 | } |
2258 | } | 2258 | } |
2259 | 2259 | ||
@@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) | |||
2270 | { | 2270 | { |
2271 | for (;;) { | 2271 | for (;;) { |
2272 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); | 2272 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); |
2273 | swait_event_interruptible(rdp->nocb_wq, | 2273 | swait_event_interruptible_exclusive(rdp->nocb_wq, |
2274 | READ_ONCE(rdp->nocb_follower_head)); | 2274 | READ_ONCE(rdp->nocb_follower_head)); |
2275 | if (smp_load_acquire(&rdp->nocb_follower_head)) { | 2275 | if (smp_load_acquire(&rdp->nocb_follower_head)) { |
2276 | /* ^^^ Ensure CB invocation follows _head test. */ | 2276 | /* ^^^ Ensure CB invocation follows _head test. */ |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index d9a02b318108..7fe183404c38 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o | |||
20 | obj-y += idle.o fair.o rt.o deadline.o | 20 | obj-y += idle.o fair.o rt.o deadline.o |
21 | obj-y += wait.o wait_bit.o swait.o completion.o | 21 | obj-y += wait.o wait_bit.o swait.o completion.o |
22 | 22 | ||
23 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o | 23 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o |
24 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | 24 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
25 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 25 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
26 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 26 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..deafa9fe602b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -17,6 +17,8 @@ | |||
17 | #include "../workqueue_internal.h" | 17 | #include "../workqueue_internal.h" |
18 | #include "../smpboot.h" | 18 | #include "../smpboot.h" |
19 | 19 | ||
20 | #include "pelt.h" | ||
21 | |||
20 | #define CREATE_TRACE_POINTS | 22 | #define CREATE_TRACE_POINTS |
21 | #include <trace/events/sched.h> | 23 | #include <trace/events/sched.h> |
22 | 24 | ||
@@ -45,14 +47,6 @@ const_debug unsigned int sysctl_sched_features = | |||
45 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 47 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
46 | 48 | ||
47 | /* | 49 | /* |
48 | * period over which we average the RT time consumption, measured | ||
49 | * in ms. | ||
50 | * | ||
51 | * default: 1s | ||
52 | */ | ||
53 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
54 | |||
55 | /* | ||
56 | * period over which we measure -rt task CPU usage in us. | 50 | * period over which we measure -rt task CPU usage in us. |
57 | * default: 1s | 51 | * default: 1s |
58 | */ | 52 | */ |
@@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
183 | 177 | ||
184 | rq->clock_task += delta; | 178 | rq->clock_task += delta; |
185 | 179 | ||
186 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | 180 | #ifdef HAVE_SCHED_AVG_IRQ |
187 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | 181 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) |
188 | sched_rt_avg_update(rq, irq_delta + steal); | 182 | update_irq_load_avg(rq, irq_delta + steal); |
189 | #endif | 183 | #endif |
190 | } | 184 | } |
191 | 185 | ||
@@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq) | |||
649 | return true; | 643 | return true; |
650 | } | 644 | } |
651 | #endif /* CONFIG_NO_HZ_FULL */ | 645 | #endif /* CONFIG_NO_HZ_FULL */ |
652 | |||
653 | void sched_avg_update(struct rq *rq) | ||
654 | { | ||
655 | s64 period = sched_avg_period(); | ||
656 | |||
657 | while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { | ||
658 | /* | ||
659 | * Inline assembly required to prevent the compiler | ||
660 | * optimising this loop into a divmod call. | ||
661 | * See __iter_div_u64_rem() for another example of this. | ||
662 | */ | ||
663 | asm("" : "+rm" (rq->age_stamp)); | ||
664 | rq->age_stamp += period; | ||
665 | rq->rt_avg /= 2; | ||
666 | } | ||
667 | } | ||
668 | |||
669 | #endif /* CONFIG_SMP */ | 646 | #endif /* CONFIG_SMP */ |
670 | 647 | ||
671 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 648 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
@@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1199 | __set_task_cpu(p, new_cpu); | 1176 | __set_task_cpu(p, new_cpu); |
1200 | } | 1177 | } |
1201 | 1178 | ||
1179 | #ifdef CONFIG_NUMA_BALANCING | ||
1202 | static void __migrate_swap_task(struct task_struct *p, int cpu) | 1180 | static void __migrate_swap_task(struct task_struct *p, int cpu) |
1203 | { | 1181 | { |
1204 | if (task_on_rq_queued(p)) { | 1182 | if (task_on_rq_queued(p)) { |
@@ -1280,16 +1258,17 @@ unlock: | |||
1280 | /* | 1258 | /* |
1281 | * Cross migrate two tasks | 1259 | * Cross migrate two tasks |
1282 | */ | 1260 | */ |
1283 | int migrate_swap(struct task_struct *cur, struct task_struct *p) | 1261 | int migrate_swap(struct task_struct *cur, struct task_struct *p, |
1262 | int target_cpu, int curr_cpu) | ||
1284 | { | 1263 | { |
1285 | struct migration_swap_arg arg; | 1264 | struct migration_swap_arg arg; |
1286 | int ret = -EINVAL; | 1265 | int ret = -EINVAL; |
1287 | 1266 | ||
1288 | arg = (struct migration_swap_arg){ | 1267 | arg = (struct migration_swap_arg){ |
1289 | .src_task = cur, | 1268 | .src_task = cur, |
1290 | .src_cpu = task_cpu(cur), | 1269 | .src_cpu = curr_cpu, |
1291 | .dst_task = p, | 1270 | .dst_task = p, |
1292 | .dst_cpu = task_cpu(p), | 1271 | .dst_cpu = target_cpu, |
1293 | }; | 1272 | }; |
1294 | 1273 | ||
1295 | if (arg.src_cpu == arg.dst_cpu) | 1274 | if (arg.src_cpu == arg.dst_cpu) |
@@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) | |||
1314 | out: | 1293 | out: |
1315 | return ret; | 1294 | return ret; |
1316 | } | 1295 | } |
1296 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1317 | 1297 | ||
1318 | /* | 1298 | /* |
1319 | * wait_task_inactive - wait for a thread to unschedule. | 1299 | * wait_task_inactive - wait for a thread to unschedule. |
@@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {} | |||
2317 | int sched_fork(unsigned long clone_flags, struct task_struct *p) | 2297 | int sched_fork(unsigned long clone_flags, struct task_struct *p) |
2318 | { | 2298 | { |
2319 | unsigned long flags; | 2299 | unsigned long flags; |
2320 | int cpu = get_cpu(); | ||
2321 | 2300 | ||
2322 | __sched_fork(clone_flags, p); | 2301 | __sched_fork(clone_flags, p); |
2323 | /* | 2302 | /* |
@@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2353 | p->sched_reset_on_fork = 0; | 2332 | p->sched_reset_on_fork = 0; |
2354 | } | 2333 | } |
2355 | 2334 | ||
2356 | if (dl_prio(p->prio)) { | 2335 | if (dl_prio(p->prio)) |
2357 | put_cpu(); | ||
2358 | return -EAGAIN; | 2336 | return -EAGAIN; |
2359 | } else if (rt_prio(p->prio)) { | 2337 | else if (rt_prio(p->prio)) |
2360 | p->sched_class = &rt_sched_class; | 2338 | p->sched_class = &rt_sched_class; |
2361 | } else { | 2339 | else |
2362 | p->sched_class = &fair_sched_class; | 2340 | p->sched_class = &fair_sched_class; |
2363 | } | ||
2364 | 2341 | ||
2365 | init_entity_runnable_average(&p->se); | 2342 | init_entity_runnable_average(&p->se); |
2366 | 2343 | ||
@@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2376 | * We're setting the CPU for the first time, we don't migrate, | 2353 | * We're setting the CPU for the first time, we don't migrate, |
2377 | * so use __set_task_cpu(). | 2354 | * so use __set_task_cpu(). |
2378 | */ | 2355 | */ |
2379 | __set_task_cpu(p, cpu); | 2356 | __set_task_cpu(p, smp_processor_id()); |
2380 | if (p->sched_class->task_fork) | 2357 | if (p->sched_class->task_fork) |
2381 | p->sched_class->task_fork(p); | 2358 | p->sched_class->task_fork(p); |
2382 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2359 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
@@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2393 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2370 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2394 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | 2371 | RB_CLEAR_NODE(&p->pushable_dl_tasks); |
2395 | #endif | 2372 | #endif |
2396 | |||
2397 | put_cpu(); | ||
2398 | return 0; | 2373 | return 0; |
2399 | } | 2374 | } |
2400 | 2375 | ||
@@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq) | |||
5714 | } | 5689 | } |
5715 | } | 5690 | } |
5716 | 5691 | ||
5717 | static void set_cpu_rq_start_time(unsigned int cpu) | ||
5718 | { | ||
5719 | struct rq *rq = cpu_rq(cpu); | ||
5720 | |||
5721 | rq->age_stamp = sched_clock_cpu(cpu); | ||
5722 | } | ||
5723 | |||
5724 | /* | 5692 | /* |
5725 | * used to mark begin/end of suspend/resume: | 5693 | * used to mark begin/end of suspend/resume: |
5726 | */ | 5694 | */ |
@@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) | |||
5838 | 5806 | ||
5839 | int sched_cpu_starting(unsigned int cpu) | 5807 | int sched_cpu_starting(unsigned int cpu) |
5840 | { | 5808 | { |
5841 | set_cpu_rq_start_time(cpu); | ||
5842 | sched_rq_cpu_starting(cpu); | 5809 | sched_rq_cpu_starting(cpu); |
5843 | sched_tick_start(cpu); | 5810 | sched_tick_start(cpu); |
5844 | return 0; | 5811 | return 0; |
@@ -6106,7 +6073,6 @@ void __init sched_init(void) | |||
6106 | 6073 | ||
6107 | #ifdef CONFIG_SMP | 6074 | #ifdef CONFIG_SMP |
6108 | idle_thread_set_boot_cpu(); | 6075 | idle_thread_set_boot_cpu(); |
6109 | set_cpu_rq_start_time(smp_processor_id()); | ||
6110 | #endif | 6076 | #endif |
6111 | init_sched_fair_class(); | 6077 | init_sched_fair_class(); |
6112 | 6078 | ||
@@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) | |||
6785 | seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); | 6751 | seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); |
6786 | seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); | 6752 | seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); |
6787 | 6753 | ||
6754 | if (schedstat_enabled() && tg != &root_task_group) { | ||
6755 | u64 ws = 0; | ||
6756 | int i; | ||
6757 | |||
6758 | for_each_possible_cpu(i) | ||
6759 | ws += schedstat_val(tg->se[i]->statistics.wait_sum); | ||
6760 | |||
6761 | seq_printf(sf, "wait_sum %llu\n", ws); | ||
6762 | } | ||
6763 | |||
6788 | return 0; | 6764 | return 0; |
6789 | } | 6765 | } |
6790 | #endif /* CONFIG_CFS_BANDWIDTH */ | 6766 | #endif /* CONFIG_CFS_BANDWIDTH */ |
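The new wait_sum value is emitted through the task group's cpu.stat file (cgroup v1) when schedstats are enabled. A small userspace sketch that reads it back; the mount point, group name, and nanosecond interpretation are assumptions for illustration.

```c
/*
 * Userspace sketch: read the "wait_sum" line that cpu_cfs_stat_show()
 * now prints for a task group.  Adjust the path for the local cgroup
 * setup and enable schedstats (sysctl kernel.sched_schedstats=1) first.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpu/mygroup/cpu.stat"; /* assumed path */
	char line[128];
	unsigned long long wait_sum;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "wait_sum %llu", &wait_sum) == 1)
			printf("aggregate wait time: %llu\n", wait_sum);
	}
	fclose(f);
	return 0;
}
```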
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c907fde01eaa..3fffad3bc8a8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,9 +53,7 @@ struct sugov_cpu { | |||
53 | unsigned int iowait_boost_max; | 53 | unsigned int iowait_boost_max; |
54 | u64 last_update; | 54 | u64 last_update; |
55 | 55 | ||
56 | /* The fields below are only needed when sharing a policy: */ | 56 | unsigned long bw_dl; |
57 | unsigned long util_cfs; | ||
58 | unsigned long util_dl; | ||
59 | unsigned long max; | 57 | unsigned long max; |
60 | 58 | ||
61 | /* The field below is for single-CPU policies only: */ | 59 | /* The field below is for single-CPU policies only: */ |
@@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, | |||
179 | return cpufreq_driver_resolve_freq(policy, freq); | 177 | return cpufreq_driver_resolve_freq(policy, freq); |
180 | } | 178 | } |
181 | 179 | ||
182 | static void sugov_get_util(struct sugov_cpu *sg_cpu) | 180 | /* |
181 | * This function computes an effective utilization for the given CPU, to be | ||
182 | * used for frequency selection given the linear relation: f = u * f_max. | ||
183 | * | ||
184 | * The scheduler tracks the following metrics: | ||
185 | * | ||
186 | * cpu_util_{cfs,rt,dl,irq}() | ||
187 | * cpu_bw_dl() | ||
188 | * | ||
189 | * Where the cfs,rt and dl util numbers are tracked with the same metric and | ||
190 | * synchronized windows and are thus directly comparable. | ||
191 | * | ||
192 | * The cfs,rt,dl utilization are the running times measured with rq->clock_task | ||
193 | * which excludes things like IRQ and steal-time. These latter are then accrued | ||
194 | * in the irq utilization. | ||
195 | * | ||
196 | * The DL bandwidth number otoh is not a measured metric but a value computed | ||
197 | * based on the task model parameters and gives the minimal utilization | ||
198 | * required to meet deadlines. | ||
199 | */ | ||
200 | static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) | ||
183 | { | 201 | { |
184 | struct rq *rq = cpu_rq(sg_cpu->cpu); | 202 | struct rq *rq = cpu_rq(sg_cpu->cpu); |
203 | unsigned long util, irq, max; | ||
185 | 204 | ||
186 | sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); | 205 | sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); |
187 | sg_cpu->util_cfs = cpu_util_cfs(rq); | 206 | sg_cpu->bw_dl = cpu_bw_dl(rq); |
188 | sg_cpu->util_dl = cpu_util_dl(rq); | ||
189 | } | ||
190 | |||
191 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) | ||
192 | { | ||
193 | struct rq *rq = cpu_rq(sg_cpu->cpu); | ||
194 | 207 | ||
195 | if (rt_rq_is_runnable(&rq->rt)) | 208 | if (rt_rq_is_runnable(&rq->rt)) |
196 | return sg_cpu->max; | 209 | return max; |
210 | |||
211 | /* | ||
212 | * Early check to see if IRQ/steal time saturates the CPU, can be | ||
213 | * because of inaccuracies in how we track these -- see | ||
214 | * update_irq_load_avg(). | ||
215 | */ | ||
216 | irq = cpu_util_irq(rq); | ||
217 | if (unlikely(irq >= max)) | ||
218 | return max; | ||
219 | |||
220 | /* | ||
221 | * Because the time spend on RT/DL tasks is visible as 'lost' time to | ||
222 | * CFS tasks and we use the same metric to track the effective | ||
223 | * utilization (PELT windows are synchronized) we can directly add them | ||
224 | * to obtain the CPU's actual utilization. | ||
225 | */ | ||
226 | util = cpu_util_cfs(rq); | ||
227 | util += cpu_util_rt(rq); | ||
228 | |||
229 | /* | ||
230 | * We do not make cpu_util_dl() a permanent part of this sum because we | ||
231 | * want to use cpu_bw_dl() later on, but we need to check if the | ||
232 | * CFS+RT+DL sum is saturated (ie. no idle time) such that we select | ||
233 | * f_max when there is no idle time. | ||
234 | * | ||
235 | * NOTE: numerical errors or stop class might cause us to not quite hit | ||
236 | * saturation when we should -- something for later. | ||
237 | */ | ||
238 | if ((util + cpu_util_dl(rq)) >= max) | ||
239 | return max; | ||
240 | |||
241 | /* | ||
242 | * There is still idle time; further improve the number by using the | ||
243 | * irq metric. Because IRQ/steal time is hidden from the task clock we | ||
244 | * need to scale the task numbers: | ||
245 | * | ||
246 | * 1 - irq | ||
247 | * U' = irq + ------- * U | ||
248 | * max | ||
249 | */ | ||
250 | util = scale_irq_capacity(util, irq, max); | ||
251 | util += irq; | ||
197 | 252 | ||
198 | /* | 253 | /* |
199 | * Utilization required by DEADLINE must always be granted while, for | 254 | * Bandwidth required by DEADLINE must always be granted while, for |
200 | * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to | 255 | * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism |
201 | * gracefully reduce the frequency when no tasks show up for longer | 256 | * to gracefully reduce the frequency when no tasks show up for longer |
202 | * periods of time. | 257 | * periods of time. |
203 | * | 258 | * |
204 | * Ideally we would like to set util_dl as min/guaranteed freq and | 259 | * Ideally we would like to set bw_dl as min/guaranteed freq and util + |
205 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet | 260 | * bw_dl as requested freq. However, cpufreq is not yet ready for such |
206 | * ready for such an interface. So, we only do the latter for now. | 261 | * an interface. So, we only do the latter for now. |
207 | */ | 262 | */ |
208 | return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); | 263 | return min(max, util + sg_cpu->bw_dl); |
209 | } | 264 | } |
210 | 265 | ||
211 | /** | 266 | /** |
@@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } | |||
360 | */ | 415 | */ |
361 | static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) | 416 | static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) |
362 | { | 417 | { |
363 | if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) | 418 | if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) |
364 | sg_policy->need_freq_update = true; | 419 | sg_policy->need_freq_update = true; |
365 | } | 420 | } |
366 | 421 | ||
@@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
383 | 438 | ||
384 | busy = sugov_cpu_is_busy(sg_cpu); | 439 | busy = sugov_cpu_is_busy(sg_cpu); |
385 | 440 | ||
386 | sugov_get_util(sg_cpu); | 441 | util = sugov_get_util(sg_cpu); |
387 | max = sg_cpu->max; | 442 | max = sg_cpu->max; |
388 | util = sugov_aggregate_util(sg_cpu); | ||
389 | sugov_iowait_apply(sg_cpu, time, &util, &max); | 443 | sugov_iowait_apply(sg_cpu, time, &util, &max); |
390 | next_f = get_next_freq(sg_policy, util, max); | 444 | next_f = get_next_freq(sg_policy, util, max); |
391 | /* | 445 | /* |
@@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
424 | struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); | 478 | struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); |
425 | unsigned long j_util, j_max; | 479 | unsigned long j_util, j_max; |
426 | 480 | ||
427 | sugov_get_util(j_sg_cpu); | 481 | j_util = sugov_get_util(j_sg_cpu); |
428 | j_max = j_sg_cpu->max; | 482 | j_max = j_sg_cpu->max; |
429 | j_util = sugov_aggregate_util(j_sg_cpu); | ||
430 | sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); | 483 | sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); |
431 | 484 | ||
432 | if (j_util * max > j_max * util) { | 485 | if (j_util * max > j_max * util) { |
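A runnable userspace model of the aggregation the new sugov_get_util() comment describes. All inputs are plain integers in the same 0..max capacity scale; this is an arithmetic illustration of the steps, not the kernel implementation.

```c
#include <stdio.h>

static unsigned long effective_util(unsigned long max, int rt_runnable,
				    unsigned long irq, unsigned long util_cfs,
				    unsigned long util_rt, unsigned long util_dl,
				    unsigned long bw_dl)
{
	unsigned long util;

	if (rt_runnable)		/* runnable RT demands f_max */
		return max;
	if (irq >= max)			/* IRQ/steal time saturates the CPU */
		return max;

	util = util_cfs + util_rt;	/* same PELT metric, directly addable */
	if (util + util_dl >= max)	/* no idle time left: select f_max */
		return max;

	/* U' = irq + (1 - irq/max) * U: scale task util into the non-IRQ part */
	util = util * (max - irq) / max + irq;

	/* DL contributes its model-based bandwidth, not its measured util */
	util += bw_dl;
	return util < max ? util : max;
}

int main(void)
{
	/* example: 1024-capacity CPU, 300 CFS + 100 RT, 64 IRQ, 50 DL bandwidth */
	printf("%lu\n", effective_util(1024, 0, 64, 300, 100, 0, 50));
	return 0;
}
```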
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5fbdde6afa9..997ea7b839fa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,7 @@ | |||
16 | * Fabio Checconi <fchecconi@gmail.com> | 16 | * Fabio Checconi <fchecconi@gmail.com> |
17 | */ | 17 | */ |
18 | #include "sched.h" | 18 | #include "sched.h" |
19 | #include "pelt.h" | ||
19 | 20 | ||
20 | struct dl_bandwidth def_dl_bandwidth; | 21 | struct dl_bandwidth def_dl_bandwidth; |
21 | 22 | ||
@@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq) | |||
1179 | curr->se.exec_start = now; | 1180 | curr->se.exec_start = now; |
1180 | cgroup_account_cputime(curr, delta_exec); | 1181 | cgroup_account_cputime(curr, delta_exec); |
1181 | 1182 | ||
1182 | sched_rt_avg_update(rq, delta_exec); | ||
1183 | |||
1184 | if (dl_entity_is_special(dl_se)) | 1183 | if (dl_entity_is_special(dl_se)) |
1185 | return; | 1184 | return; |
1186 | 1185 | ||
@@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1761 | 1760 | ||
1762 | deadline_queue_push_tasks(rq); | 1761 | deadline_queue_push_tasks(rq); |
1763 | 1762 | ||
1763 | if (rq->curr->sched_class != &dl_sched_class) | ||
1764 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); | ||
1765 | |||
1764 | return p; | 1766 | return p; |
1765 | } | 1767 | } |
1766 | 1768 | ||
@@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | |||
1768 | { | 1770 | { |
1769 | update_curr_dl(rq); | 1771 | update_curr_dl(rq); |
1770 | 1772 | ||
1773 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); | ||
1771 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) | 1774 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) |
1772 | enqueue_pushable_dl_task(rq, p); | 1775 | enqueue_pushable_dl_task(rq, p); |
1773 | } | 1776 | } |
@@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
1784 | { | 1787 | { |
1785 | update_curr_dl(rq); | 1788 | update_curr_dl(rq); |
1786 | 1789 | ||
1790 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); | ||
1787 | /* | 1791 | /* |
1788 | * Even when we have runtime, update_curr_dl() might have resulted in us | 1792 | * Even when we have runtime, update_curr_dl() might have resulted in us |
1789 | * not being the leftmost task anymore. In that case NEED_RESCHED will | 1793 | * not being the leftmost task anymore. In that case NEED_RESCHED will |
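The update_dl_rq_load_avg() calls feed deadline runtime into the same PELT geometric series used for CFS and RT. Below is a small numerical model of that series, assuming the standard PELT parameters (1024 us periods, half-life of 32 periods); it is an illustration only, not the kernel's fixed-point code.

```c
#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* per-period decay: y^32 == 0.5 */
	double contrib = 1024.0;		/* one fully busy 1024 us period */
	double sum = 0.0;
	int i;

	for (i = 0; i < 345; i++)		/* ~345 periods: effectively converged */
		sum = sum * y + contrib;

	printf("y = %.6f\n", y);
	printf("steady-state sum for an always-running entity ~ %.0f\n", sum);
	/* prints roughly 47760; the kernel's fixed-point version of the same
	 * series yields LOAD_AVG_MAX = 47742 */
	return 0;
}
```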
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e593b4118578..870d4f3da285 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp) | |||
111 | cmp += 3; | 111 | cmp += 3; |
112 | } | 112 | } |
113 | 113 | ||
114 | for (i = 0; i < __SCHED_FEAT_NR; i++) { | 114 | i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp); |
115 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | 115 | if (i < 0) |
116 | if (neg) { | 116 | return i; |
117 | sysctl_sched_features &= ~(1UL << i); | 117 | |
118 | sched_feat_disable(i); | 118 | if (neg) { |
119 | } else { | 119 | sysctl_sched_features &= ~(1UL << i); |
120 | sysctl_sched_features |= (1UL << i); | 120 | sched_feat_disable(i); |
121 | sched_feat_enable(i); | 121 | } else { |
122 | } | 122 | sysctl_sched_features |= (1UL << i); |
123 | break; | 123 | sched_feat_enable(i); |
124 | } | ||
125 | } | 124 | } |
126 | 125 | ||
127 | return i; | 126 | return 0; |
128 | } | 127 | } |
129 | 128 | ||
130 | static ssize_t | 129 | static ssize_t |
@@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
133 | { | 132 | { |
134 | char buf[64]; | 133 | char buf[64]; |
135 | char *cmp; | 134 | char *cmp; |
136 | int i; | 135 | int ret; |
137 | struct inode *inode; | 136 | struct inode *inode; |
138 | 137 | ||
139 | if (cnt > 63) | 138 | if (cnt > 63) |
@@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
148 | /* Ensure the static_key remains in a consistent state */ | 147 | /* Ensure the static_key remains in a consistent state */ |
149 | inode = file_inode(filp); | 148 | inode = file_inode(filp); |
150 | inode_lock(inode); | 149 | inode_lock(inode); |
151 | i = sched_feat_set(cmp); | 150 | ret = sched_feat_set(cmp); |
152 | inode_unlock(inode); | 151 | inode_unlock(inode); |
153 | if (i == __SCHED_FEAT_NR) | 152 | if (ret < 0) |
154 | return -EINVAL; | 153 | return ret; |
155 | 154 | ||
156 | *ppos += cnt; | 155 | *ppos += cnt; |
157 | 156 | ||
@@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, | |||
843 | unsigned long tpf, unsigned long gsf, unsigned long gpf) | 842 | unsigned long tpf, unsigned long gsf, unsigned long gpf) |
844 | { | 843 | { |
845 | SEQ_printf(m, "numa_faults node=%d ", node); | 844 | SEQ_printf(m, "numa_faults node=%d ", node); |
846 | SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); | 845 | SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf); |
847 | SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); | 846 | SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf); |
848 | } | 847 | } |
849 | #endif | 848 | #endif |
850 | 849 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2f0a0be4d344..309c93fcc604 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
255 | return cfs_rq->rq; | 255 | return cfs_rq->rq; |
256 | } | 256 | } |
257 | 257 | ||
258 | /* An entity is a task if it doesn't "own" a runqueue */ | ||
259 | #define entity_is_task(se) (!se->my_q) | ||
260 | |||
261 | static inline struct task_struct *task_of(struct sched_entity *se) | 258 | static inline struct task_struct *task_of(struct sched_entity *se) |
262 | { | 259 | { |
263 | SCHED_WARN_ON(!entity_is_task(se)); | 260 | SCHED_WARN_ON(!entity_is_task(se)); |
@@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | |||
419 | return container_of(cfs_rq, struct rq, cfs); | 416 | return container_of(cfs_rq, struct rq, cfs); |
420 | } | 417 | } |
421 | 418 | ||
422 | #define entity_is_task(se) 1 | ||
423 | 419 | ||
424 | #define for_each_sched_entity(se) \ | 420 | #define for_each_sched_entity(se) \ |
425 | for (; se; se = NULL) | 421 | for (; se; se = NULL) |
@@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
692 | } | 688 | } |
693 | 689 | ||
694 | #ifdef CONFIG_SMP | 690 | #ifdef CONFIG_SMP |
695 | 691 | #include "pelt.h" | |
696 | #include "sched-pelt.h" | 692 | #include "sched-pelt.h" |
697 | 693 | ||
698 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); | 694 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
@@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se); | |||
735 | * To solve this problem, we also cap the util_avg of successive tasks to | 731 | * To solve this problem, we also cap the util_avg of successive tasks to |
736 | * only 1/2 of the left utilization budget: | 732 | * only 1/2 of the left utilization budget: |
737 | * | 733 | * |
738 | * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n | 734 | * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n |
739 | * | 735 | * |
740 | * where n denotes the nth task. | 736 | * where n denotes the nth task and cpu_scale the CPU capacity. |
741 | * | 737 | * |
742 | * For example, a simplest series from the beginning would be like: | 738 | * For example, for a CPU with 1024 of capacity, a simplest series from |
739 | * the beginning would be like: | ||
743 | * | 740 | * |
744 | * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... | 741 | * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... |
745 | * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... | 742 | * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... |
@@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
751 | { | 748 | { |
752 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 749 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
753 | struct sched_avg *sa = &se->avg; | 750 | struct sched_avg *sa = &se->avg; |
754 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 751 | long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); |
752 | long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; | ||
755 | 753 | ||
756 | if (cap > 0) { | 754 | if (cap > 0) { |
757 | if (cfs_rq->avg.util_avg != 0) { | 755 | if (cfs_rq->avg.util_avg != 0) { |
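The capped series described in the comment above now scales with the CPU's own capacity rather than the fixed 1024. A rough user-space sketch that reproduces the halving series for an assumed little core with cpu_scale = 512, simplifying the kernel logic to "each new task is seeded with exactly the cap":

#include <stdio.h>

int main(void)
{
        long cpu_scale = 512;   /* stand-in for arch_scale_cpu_capacity() */
        long cfs_util = 0;      /* cfs_rq->avg.util_avg, initially idle   */
        int n;

        for (n = 0; n < 5; n++) {
                long cap = (cpu_scale - cfs_util) / 2;

                printf("task %d: util_avg seed = %ld, cfs_rq util_avg -> %ld\n",
                       n, cap, cfs_util + cap);
                cfs_util += cap;        /* 256, 384, 448, 480, 496, ... */
        }
        return 0;
}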
@@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, | |||
1314 | * of each group. Skip other nodes. | 1312 | * of each group. Skip other nodes. |
1315 | */ | 1313 | */ |
1316 | if (sched_numa_topology_type == NUMA_BACKPLANE && | 1314 | if (sched_numa_topology_type == NUMA_BACKPLANE && |
1317 | dist > maxdist) | 1315 | dist >= maxdist) |
1318 | continue; | 1316 | continue; |
1319 | 1317 | ||
1320 | /* Add up the faults from nearby nodes. */ | 1318 | /* Add up the faults from nearby nodes. */ |
@@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu); | |||
1452 | 1450 | ||
1453 | /* Cached statistics for all CPUs within a node */ | 1451 | /* Cached statistics for all CPUs within a node */ |
1454 | struct numa_stats { | 1452 | struct numa_stats { |
1455 | unsigned long nr_running; | ||
1456 | unsigned long load; | 1453 | unsigned long load; |
1457 | 1454 | ||
1458 | /* Total compute capacity of CPUs on a node */ | 1455 | /* Total compute capacity of CPUs on a node */ |
1459 | unsigned long compute_capacity; | 1456 | unsigned long compute_capacity; |
1460 | 1457 | ||
1461 | /* Approximate capacity in terms of runnable tasks on a node */ | 1458 | unsigned int nr_running; |
1462 | unsigned long task_capacity; | ||
1463 | int has_free_capacity; | ||
1464 | }; | 1459 | }; |
1465 | 1460 | ||
1466 | /* | 1461 | /* |
@@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1487 | * the @ns structure is NULL'ed and task_numa_compare() will | 1482 | * the @ns structure is NULL'ed and task_numa_compare() will |
1488 | * not find this node attractive. | 1483 | * not find this node attractive. |
1489 | * | 1484 | * |
1490 | * We'll either bail at !has_free_capacity, or we'll detect a huge | 1485 | * We'll detect a huge imbalance and bail there. |
1491 | * imbalance and bail there. | ||
1492 | */ | 1486 | */ |
1493 | if (!cpus) | 1487 | if (!cpus) |
1494 | return; | 1488 | return; |
@@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1497 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); | 1491 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); |
1498 | capacity = cpus / smt; /* cores */ | 1492 | capacity = cpus / smt; /* cores */ |
1499 | 1493 | ||
1500 | ns->task_capacity = min_t(unsigned, capacity, | 1494 | capacity = min_t(unsigned, capacity, |
1501 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); | 1495 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); |
1502 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | ||
1503 | } | 1496 | } |
1504 | 1497 | ||
1505 | struct task_numa_env { | 1498 | struct task_numa_env { |
@@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
1548 | src_capacity = env->src_stats.compute_capacity; | 1541 | src_capacity = env->src_stats.compute_capacity; |
1549 | dst_capacity = env->dst_stats.compute_capacity; | 1542 | dst_capacity = env->dst_stats.compute_capacity; |
1550 | 1543 | ||
1551 | /* We care about the slope of the imbalance, not the direction. */ | 1544 | imb = abs(dst_load * src_capacity - src_load * dst_capacity); |
1552 | if (dst_load < src_load) | ||
1553 | swap(dst_load, src_load); | ||
1554 | 1545 | ||
1555 | /* Is the difference below the threshold? */ | ||
1556 | imb = dst_load * src_capacity * 100 - | ||
1557 | src_load * dst_capacity * env->imbalance_pct; | ||
1558 | if (imb <= 0) | ||
1559 | return false; | ||
1560 | |||
1561 | /* | ||
1562 | * The imbalance is above the allowed threshold. | ||
1563 | * Compare it with the old imbalance. | ||
1564 | */ | ||
1565 | orig_src_load = env->src_stats.load; | 1546 | orig_src_load = env->src_stats.load; |
1566 | orig_dst_load = env->dst_stats.load; | 1547 | orig_dst_load = env->dst_stats.load; |
1567 | 1548 | ||
1568 | if (orig_dst_load < orig_src_load) | 1549 | old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity); |
1569 | swap(orig_dst_load, orig_src_load); | ||
1570 | |||
1571 | old_imb = orig_dst_load * src_capacity * 100 - | ||
1572 | orig_src_load * dst_capacity * env->imbalance_pct; | ||
1573 | 1550 | ||
1574 | /* Would this change make things worse? */ | 1551 | /* Would this change make things worse? */ |
1575 | return (imb > old_imb); | 1552 | return (imb > old_imb); |
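The simplified load_too_imbalanced() above drops the imbalance_pct threshold and simply asks whether the capacity-weighted imbalance after the proposed move would be larger than the one before it. A user-space restatement with made-up loads and capacities, not the kernel function itself:

#include <stdio.h>
#include <stdlib.h>

static int too_imbalanced(long src_load, long dst_load,
                          long src_cap, long dst_cap,
                          long orig_src_load, long orig_dst_load)
{
        long imb     = labs(dst_load * src_cap - src_load * dst_cap);
        long old_imb = labs(orig_dst_load * src_cap - orig_src_load * dst_cap);

        return imb > old_imb;   /* would this change make things worse? */
}

int main(void)
{
        /* source node at load 800, destination at 400, equal capacities */
        printf("%d\n", too_imbalanced(600, 600,  1024, 1024, 800, 400));  /* 0: equalizing move is fine    */
        printf("%d\n", too_imbalanced(200, 1000, 1024, 1024, 800, 400));  /* 1: overshooting move is worse */
        return 0;
}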
@@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
1582 | * be exchanged with the source task | 1559 | * be exchanged with the source task |
1583 | */ | 1560 | */ |
1584 | static void task_numa_compare(struct task_numa_env *env, | 1561 | static void task_numa_compare(struct task_numa_env *env, |
1585 | long taskimp, long groupimp) | 1562 | long taskimp, long groupimp, bool maymove) |
1586 | { | 1563 | { |
1587 | struct rq *src_rq = cpu_rq(env->src_cpu); | ||
1588 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | 1564 | struct rq *dst_rq = cpu_rq(env->dst_cpu); |
1589 | struct task_struct *cur; | 1565 | struct task_struct *cur; |
1590 | long src_load, dst_load; | 1566 | long src_load, dst_load; |
@@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1605 | if (cur == env->p) | 1581 | if (cur == env->p) |
1606 | goto unlock; | 1582 | goto unlock; |
1607 | 1583 | ||
1584 | if (!cur) { | ||
1585 | if (maymove || imp > env->best_imp) | ||
1586 | goto assign; | ||
1587 | else | ||
1588 | goto unlock; | ||
1589 | } | ||
1590 | |||
1608 | /* | 1591 | /* |
1609 | * "imp" is the fault differential for the source task between the | 1592 | * "imp" is the fault differential for the source task between the |
1610 | * source and destination node. Calculate the total differential for | 1593 | * source and destination node. Calculate the total differential for |
1611 | * the source task and potential destination task. The more negative | 1594 | * the source task and potential destination task. The more negative |
1612 | * the value is, the more rmeote accesses that would be expected to | 1595 | * the value is, the more remote accesses that would be expected to |
1613 | * be incurred if the tasks were swapped. | 1596 | * be incurred if the tasks were swapped. |
1614 | */ | 1597 | */ |
1615 | if (cur) { | 1598 | /* Skip this swap candidate if cannot move to the source cpu */ |
1616 | /* Skip this swap candidate if cannot move to the source CPU: */ | 1599 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) |
1617 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1600 | goto unlock; |
1618 | goto unlock; | ||
1619 | 1601 | ||
1602 | /* | ||
1603 | * If dst and source tasks are in the same NUMA group, or not | ||
1604 | * in any group then look only at task weights. | ||
1605 | */ | ||
1606 | if (cur->numa_group == env->p->numa_group) { | ||
1607 | imp = taskimp + task_weight(cur, env->src_nid, dist) - | ||
1608 | task_weight(cur, env->dst_nid, dist); | ||
1620 | /* | 1609 | /* |
1621 | * If dst and source tasks are in the same NUMA group, or not | 1610 | * Add some hysteresis to prevent swapping the |
1622 | * in any group then look only at task weights. | 1611 | * tasks within a group over tiny differences. |
1623 | */ | 1612 | */ |
1624 | if (cur->numa_group == env->p->numa_group) { | 1613 | if (cur->numa_group) |
1625 | imp = taskimp + task_weight(cur, env->src_nid, dist) - | 1614 | imp -= imp / 16; |
1626 | task_weight(cur, env->dst_nid, dist); | 1615 | } else { |
1627 | /* | 1616 | /* |
1628 | * Add some hysteresis to prevent swapping the | 1617 | * Compare the group weights. If a task is all by itself |
1629 | * tasks within a group over tiny differences. | 1618 | * (not part of a group), use the task weight instead. |
1630 | */ | 1619 | */ |
1631 | if (cur->numa_group) | 1620 | if (cur->numa_group && env->p->numa_group) |
1632 | imp -= imp/16; | 1621 | imp += group_weight(cur, env->src_nid, dist) - |
1633 | } else { | 1622 | group_weight(cur, env->dst_nid, dist); |
1634 | /* | 1623 | else |
1635 | * Compare the group weights. If a task is all by | 1624 | imp += task_weight(cur, env->src_nid, dist) - |
1636 | * itself (not part of a group), use the task weight | 1625 | task_weight(cur, env->dst_nid, dist); |
1637 | * instead. | ||
1638 | */ | ||
1639 | if (cur->numa_group) | ||
1640 | imp += group_weight(cur, env->src_nid, dist) - | ||
1641 | group_weight(cur, env->dst_nid, dist); | ||
1642 | else | ||
1643 | imp += task_weight(cur, env->src_nid, dist) - | ||
1644 | task_weight(cur, env->dst_nid, dist); | ||
1645 | } | ||
1646 | } | 1626 | } |
1647 | 1627 | ||
1648 | if (imp <= env->best_imp && moveimp <= env->best_imp) | 1628 | if (imp <= env->best_imp) |
1649 | goto unlock; | 1629 | goto unlock; |
1650 | 1630 | ||
1651 | if (!cur) { | 1631 | if (maymove && moveimp > imp && moveimp > env->best_imp) { |
1652 | /* Is there capacity at our destination? */ | 1632 | imp = moveimp - 1; |
1653 | if (env->src_stats.nr_running <= env->src_stats.task_capacity && | 1633 | cur = NULL; |
1654 | !env->dst_stats.has_free_capacity) | ||
1655 | goto unlock; | ||
1656 | |||
1657 | goto balance; | ||
1658 | } | ||
1659 | |||
1660 | /* Balance doesn't matter much if we're running a task per CPU: */ | ||
1661 | if (imp > env->best_imp && src_rq->nr_running == 1 && | ||
1662 | dst_rq->nr_running == 1) | ||
1663 | goto assign; | 1634 | goto assign; |
1635 | } | ||
1664 | 1636 | ||
1665 | /* | 1637 | /* |
1666 | * In the overloaded case, try and keep the load balanced. | 1638 | * In the overloaded case, try and keep the load balanced. |
1667 | */ | 1639 | */ |
1668 | balance: | 1640 | load = task_h_load(env->p) - task_h_load(cur); |
1669 | load = task_h_load(env->p); | 1641 | if (!load) |
1642 | goto assign; | ||
1643 | |||
1670 | dst_load = env->dst_stats.load + load; | 1644 | dst_load = env->dst_stats.load + load; |
1671 | src_load = env->src_stats.load - load; | 1645 | src_load = env->src_stats.load - load; |
1672 | 1646 | ||
1673 | if (moveimp > imp && moveimp > env->best_imp) { | ||
1674 | /* | ||
1675 | * If the improvement from just moving env->p direction is | ||
1676 | * better than swapping tasks around, check if a move is | ||
1677 | * possible. Store a slightly smaller score than moveimp, | ||
1678 | * so an actually idle CPU will win. | ||
1679 | */ | ||
1680 | if (!load_too_imbalanced(src_load, dst_load, env)) { | ||
1681 | imp = moveimp - 1; | ||
1682 | cur = NULL; | ||
1683 | goto assign; | ||
1684 | } | ||
1685 | } | ||
1686 | |||
1687 | if (imp <= env->best_imp) | ||
1688 | goto unlock; | ||
1689 | |||
1690 | if (cur) { | ||
1691 | load = task_h_load(cur); | ||
1692 | dst_load -= load; | ||
1693 | src_load += load; | ||
1694 | } | ||
1695 | |||
1696 | if (load_too_imbalanced(src_load, dst_load, env)) | 1647 | if (load_too_imbalanced(src_load, dst_load, env)) |
1697 | goto unlock; | 1648 | goto unlock; |
1698 | 1649 | ||
1650 | assign: | ||
1699 | /* | 1651 | /* |
1700 | * One idle CPU per node is evaluated for a task numa move. | 1652 | * One idle CPU per node is evaluated for a task numa move. |
1701 | * Call select_idle_sibling to maybe find a better one. | 1653 | * Call select_idle_sibling to maybe find a better one. |
@@ -1711,7 +1663,6 @@ balance: | |||
1711 | local_irq_enable(); | 1663 | local_irq_enable(); |
1712 | } | 1664 | } |
1713 | 1665 | ||
1714 | assign: | ||
1715 | task_numa_assign(env, cur, imp); | 1666 | task_numa_assign(env, cur, imp); |
1716 | unlock: | 1667 | unlock: |
1717 | rcu_read_unlock(); | 1668 | rcu_read_unlock(); |
@@ -1720,43 +1671,30 @@ unlock: | |||
1720 | static void task_numa_find_cpu(struct task_numa_env *env, | 1671 | static void task_numa_find_cpu(struct task_numa_env *env, |
1721 | long taskimp, long groupimp) | 1672 | long taskimp, long groupimp) |
1722 | { | 1673 | { |
1674 | long src_load, dst_load, load; | ||
1675 | bool maymove = false; | ||
1723 | int cpu; | 1676 | int cpu; |
1724 | 1677 | ||
1678 | load = task_h_load(env->p); | ||
1679 | dst_load = env->dst_stats.load + load; | ||
1680 | src_load = env->src_stats.load - load; | ||
1681 | |||
1682 | /* | ||
1683 | * If the improvement from just moving env->p direction is better | ||
1684 | * than swapping tasks around, check if a move is possible. | ||
1685 | */ | ||
1686 | maymove = !load_too_imbalanced(src_load, dst_load, env); | ||
1687 | |||
1725 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | 1688 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { |
1726 | /* Skip this CPU if the source task cannot migrate */ | 1689 | /* Skip this CPU if the source task cannot migrate */ |
1727 | if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) | 1690 | if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) |
1728 | continue; | 1691 | continue; |
1729 | 1692 | ||
1730 | env->dst_cpu = cpu; | 1693 | env->dst_cpu = cpu; |
1731 | task_numa_compare(env, taskimp, groupimp); | 1694 | task_numa_compare(env, taskimp, groupimp, maymove); |
1732 | } | 1695 | } |
1733 | } | 1696 | } |
1734 | 1697 | ||
1735 | /* Only move tasks to a NUMA node less busy than the current node. */ | ||
1736 | static bool numa_has_capacity(struct task_numa_env *env) | ||
1737 | { | ||
1738 | struct numa_stats *src = &env->src_stats; | ||
1739 | struct numa_stats *dst = &env->dst_stats; | ||
1740 | |||
1741 | if (src->has_free_capacity && !dst->has_free_capacity) | ||
1742 | return false; | ||
1743 | |||
1744 | /* | ||
1745 | * Only consider a task move if the source has a higher load | ||
1746 | * than the destination, corrected for CPU capacity on each node. | ||
1747 | * | ||
1748 | * src->load dst->load | ||
1749 | * --------------------- vs --------------------- | ||
1750 | * src->compute_capacity dst->compute_capacity | ||
1751 | */ | ||
1752 | if (src->load * dst->compute_capacity * env->imbalance_pct > | ||
1753 | |||
1754 | dst->load * src->compute_capacity * 100) | ||
1755 | return true; | ||
1756 | |||
1757 | return false; | ||
1758 | } | ||
1759 | |||
1760 | static int task_numa_migrate(struct task_struct *p) | 1698 | static int task_numa_migrate(struct task_struct *p) |
1761 | { | 1699 | { |
1762 | struct task_numa_env env = { | 1700 | struct task_numa_env env = { |
@@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1797 | * elsewhere, so there is no point in (re)trying. | 1735 | * elsewhere, so there is no point in (re)trying. |
1798 | */ | 1736 | */ |
1799 | if (unlikely(!sd)) { | 1737 | if (unlikely(!sd)) { |
1800 | p->numa_preferred_nid = task_node(p); | 1738 | sched_setnuma(p, task_node(p)); |
1801 | return -EINVAL; | 1739 | return -EINVAL; |
1802 | } | 1740 | } |
1803 | 1741 | ||
@@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1811 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1749 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1812 | 1750 | ||
1813 | /* Try to find a spot on the preferred nid. */ | 1751 | /* Try to find a spot on the preferred nid. */ |
1814 | if (numa_has_capacity(&env)) | 1752 | task_numa_find_cpu(&env, taskimp, groupimp); |
1815 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1816 | 1753 | ||
1817 | /* | 1754 | /* |
1818 | * Look at other nodes in these cases: | 1755 | * Look at other nodes in these cases: |
@@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1842 | env.dist = dist; | 1779 | env.dist = dist; |
1843 | env.dst_nid = nid; | 1780 | env.dst_nid = nid; |
1844 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1781 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1845 | if (numa_has_capacity(&env)) | 1782 | task_numa_find_cpu(&env, taskimp, groupimp); |
1846 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
1847 | } | 1783 | } |
1848 | } | 1784 | } |
1849 | 1785 | ||
@@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p) | |||
1856 | * trying for a better one later. Do not set the preferred node here. | 1792 | * trying for a better one later. Do not set the preferred node here. |
1857 | */ | 1793 | */ |
1858 | if (p->numa_group) { | 1794 | if (p->numa_group) { |
1859 | struct numa_group *ng = p->numa_group; | ||
1860 | |||
1861 | if (env.best_cpu == -1) | 1795 | if (env.best_cpu == -1) |
1862 | nid = env.src_nid; | 1796 | nid = env.src_nid; |
1863 | else | 1797 | else |
1864 | nid = env.dst_nid; | 1798 | nid = cpu_to_node(env.best_cpu); |
1865 | 1799 | ||
1866 | if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) | 1800 | if (nid != p->numa_preferred_nid) |
1867 | sched_setnuma(p, env.dst_nid); | 1801 | sched_setnuma(p, nid); |
1868 | } | 1802 | } |
1869 | 1803 | ||
1870 | /* No better CPU than the current one was found. */ | 1804 | /* No better CPU than the current one was found. */ |
@@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p) | |||
1884 | return ret; | 1818 | return ret; |
1885 | } | 1819 | } |
1886 | 1820 | ||
1887 | ret = migrate_swap(p, env.best_task); | 1821 | ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); |
1822 | |||
1888 | if (ret != 0) | 1823 | if (ret != 0) |
1889 | trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); | 1824 | trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); |
1890 | put_task_struct(env.best_task); | 1825 | put_task_struct(env.best_task); |
@@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
2144 | 2079 | ||
2145 | static void task_numa_placement(struct task_struct *p) | 2080 | static void task_numa_placement(struct task_struct *p) |
2146 | { | 2081 | { |
2147 | int seq, nid, max_nid = -1, max_group_nid = -1; | 2082 | int seq, nid, max_nid = -1; |
2148 | unsigned long max_faults = 0, max_group_faults = 0; | 2083 | unsigned long max_faults = 0; |
2149 | unsigned long fault_types[2] = { 0, 0 }; | 2084 | unsigned long fault_types[2] = { 0, 0 }; |
2150 | unsigned long total_faults; | 2085 | unsigned long total_faults; |
2151 | u64 runtime, period; | 2086 | u64 runtime, period; |
@@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p) | |||
2224 | } | 2159 | } |
2225 | } | 2160 | } |
2226 | 2161 | ||
2227 | if (faults > max_faults) { | 2162 | if (!p->numa_group) { |
2228 | max_faults = faults; | 2163 | if (faults > max_faults) { |
2164 | max_faults = faults; | ||
2165 | max_nid = nid; | ||
2166 | } | ||
2167 | } else if (group_faults > max_faults) { | ||
2168 | max_faults = group_faults; | ||
2229 | max_nid = nid; | 2169 | max_nid = nid; |
2230 | } | 2170 | } |
2231 | |||
2232 | if (group_faults > max_group_faults) { | ||
2233 | max_group_faults = group_faults; | ||
2234 | max_group_nid = nid; | ||
2235 | } | ||
2236 | } | 2171 | } |
2237 | 2172 | ||
2238 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
2239 | |||
2240 | if (p->numa_group) { | 2173 | if (p->numa_group) { |
2241 | numa_group_count_active_nodes(p->numa_group); | 2174 | numa_group_count_active_nodes(p->numa_group); |
2242 | spin_unlock_irq(group_lock); | 2175 | spin_unlock_irq(group_lock); |
2243 | max_nid = preferred_group_nid(p, max_group_nid); | 2176 | max_nid = preferred_group_nid(p, max_nid); |
2244 | } | 2177 | } |
2245 | 2178 | ||
2246 | if (max_faults) { | 2179 | if (max_faults) { |
2247 | /* Set the new preferred node */ | 2180 | /* Set the new preferred node */ |
2248 | if (max_nid != p->numa_preferred_nid) | 2181 | if (max_nid != p->numa_preferred_nid) |
2249 | sched_setnuma(p, max_nid); | 2182 | sched_setnuma(p, max_nid); |
2250 | |||
2251 | if (task_node(p) != p->numa_preferred_nid) | ||
2252 | numa_migrate_preferred(p); | ||
2253 | } | 2183 | } |
2184 | |||
2185 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
2254 | } | 2186 | } |
2255 | 2187 | ||
2256 | static inline int get_numa_group(struct numa_group *grp) | 2188 | static inline int get_numa_group(struct numa_group *grp) |
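With the hunk above, task_numa_placement() tracks a single max_nid: when the task belongs to a numa_group, the group's fault counts decide the preferred node, otherwise the task's own do. A toy illustration with invented fault counts for a two-node system:

#include <stdio.h>

int main(void)
{
        unsigned long task_faults[2]  = { 500, 100 };   /* invented, per node */
        unsigned long group_faults[2] = { 200, 900 };
        int has_group = 1;
        unsigned long max_faults = 0;
        int nid, max_nid = -1;

        for (nid = 0; nid < 2; nid++) {
                unsigned long faults = has_group ? group_faults[nid] : task_faults[nid];

                if (faults > max_faults) {
                        max_faults = faults;
                        max_nid = nid;
                }
        }
        printf("preferred nid = %d\n", max_nid);        /* 1: the group's view wins */
        return 0;
}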
@@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2450 | numa_is_active_node(mem_node, ng)) | 2382 | numa_is_active_node(mem_node, ng)) |
2451 | local = 1; | 2383 | local = 1; |
2452 | 2384 | ||
2453 | task_numa_placement(p); | ||
2454 | |||
2455 | /* | 2385 | /* |
2456 | * Retry task to preferred node migration periodically, in case it | 2386 | * Retry task to preferred node migration periodically, in case it |
2457 | * case it previously failed, or the scheduler moved us. | 2387 | * case it previously failed, or the scheduler moved us. |
2458 | */ | 2388 | */ |
2459 | if (time_after(jiffies, p->numa_migrate_retry)) | 2389 | if (time_after(jiffies, p->numa_migrate_retry)) { |
2390 | task_numa_placement(p); | ||
2460 | numa_migrate_preferred(p); | 2391 | numa_migrate_preferred(p); |
2392 | } | ||
2461 | 2393 | ||
2462 | if (migrated) | 2394 | if (migrated) |
2463 | p->numa_pages_migrated += pages; | 2395 | p->numa_pages_migrated += pages; |
@@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2749 | } while (0) | 2681 | } while (0) |
2750 | 2682 | ||
2751 | #ifdef CONFIG_SMP | 2683 | #ifdef CONFIG_SMP |
2752 | /* | ||
2753 | * XXX we want to get rid of these helpers and use the full load resolution. | ||
2754 | */ | ||
2755 | static inline long se_weight(struct sched_entity *se) | ||
2756 | { | ||
2757 | return scale_load_down(se->load.weight); | ||
2758 | } | ||
2759 | |||
2760 | static inline long se_runnable(struct sched_entity *se) | ||
2761 | { | ||
2762 | return scale_load_down(se->runnable_weight); | ||
2763 | } | ||
2764 | |||
2765 | static inline void | 2684 | static inline void |
2766 | enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2685 | enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2767 | { | 2686 | { |
@@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) | |||
3062 | } | 2981 | } |
3063 | 2982 | ||
3064 | #ifdef CONFIG_SMP | 2983 | #ifdef CONFIG_SMP |
3065 | /* | ||
3066 | * Approximate: | ||
3067 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | ||
3068 | */ | ||
3069 | static u64 decay_load(u64 val, u64 n) | ||
3070 | { | ||
3071 | unsigned int local_n; | ||
3072 | |||
3073 | if (unlikely(n > LOAD_AVG_PERIOD * 63)) | ||
3074 | return 0; | ||
3075 | |||
3076 | /* after bounds checking we can collapse to 32-bit */ | ||
3077 | local_n = n; | ||
3078 | |||
3079 | /* | ||
3080 | * As y^PERIOD = 1/2, we can combine | ||
3081 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) | ||
3082 | * With a look-up table which covers y^n (n<PERIOD) | ||
3083 | * | ||
3084 | * To achieve constant time decay_load. | ||
3085 | */ | ||
3086 | if (unlikely(local_n >= LOAD_AVG_PERIOD)) { | ||
3087 | val >>= local_n / LOAD_AVG_PERIOD; | ||
3088 | local_n %= LOAD_AVG_PERIOD; | ||
3089 | } | ||
3090 | |||
3091 | val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); | ||
3092 | return val; | ||
3093 | } | ||
3094 | |||
3095 | static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) | ||
3096 | { | ||
3097 | u32 c1, c2, c3 = d3; /* y^0 == 1 */ | ||
3098 | |||
3099 | /* | ||
3100 | * c1 = d1 y^p | ||
3101 | */ | ||
3102 | c1 = decay_load((u64)d1, periods); | ||
3103 | |||
3104 | /* | ||
3105 | * p-1 | ||
3106 | * c2 = 1024 \Sum y^n | ||
3107 | * n=1 | ||
3108 | * | ||
3109 | * inf inf | ||
3110 | * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) | ||
3111 | * n=0 n=p | ||
3112 | */ | ||
3113 | c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; | ||
3114 | |||
3115 | return c1 + c2 + c3; | ||
3116 | } | ||
3117 | |||
3118 | /* | ||
3119 | * Accumulate the three separate parts of the sum; d1 the remainder | ||
3120 | * of the last (incomplete) period, d2 the span of full periods and d3 | ||
3121 | * the remainder of the (incomplete) current period. | ||
3122 | * | ||
3123 | * d1 d2 d3 | ||
3124 | * ^ ^ ^ | ||
3125 | * | | | | ||
3126 | * |<->|<----------------->|<--->| | ||
3127 | * ... |---x---|------| ... |------|-----x (now) | ||
3128 | * | ||
3129 | * p-1 | ||
3130 | * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 | ||
3131 | * n=1 | ||
3132 | * | ||
3133 | * = u y^p + (Step 1) | ||
3134 | * | ||
3135 | * p-1 | ||
3136 | * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) | ||
3137 | * n=1 | ||
3138 | */ | ||
3139 | static __always_inline u32 | ||
3140 | accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | ||
3141 | unsigned long load, unsigned long runnable, int running) | ||
3142 | { | ||
3143 | unsigned long scale_freq, scale_cpu; | ||
3144 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ | ||
3145 | u64 periods; | ||
3146 | |||
3147 | scale_freq = arch_scale_freq_capacity(cpu); | ||
3148 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
3149 | |||
3150 | delta += sa->period_contrib; | ||
3151 | periods = delta / 1024; /* A period is 1024us (~1ms) */ | ||
3152 | |||
3153 | /* | ||
3154 | * Step 1: decay old *_sum if we crossed period boundaries. | ||
3155 | */ | ||
3156 | if (periods) { | ||
3157 | sa->load_sum = decay_load(sa->load_sum, periods); | ||
3158 | sa->runnable_load_sum = | ||
3159 | decay_load(sa->runnable_load_sum, periods); | ||
3160 | sa->util_sum = decay_load((u64)(sa->util_sum), periods); | ||
3161 | |||
3162 | /* | ||
3163 | * Step 2 | ||
3164 | */ | ||
3165 | delta %= 1024; | ||
3166 | contrib = __accumulate_pelt_segments(periods, | ||
3167 | 1024 - sa->period_contrib, delta); | ||
3168 | } | ||
3169 | sa->period_contrib = delta; | ||
3170 | |||
3171 | contrib = cap_scale(contrib, scale_freq); | ||
3172 | if (load) | ||
3173 | sa->load_sum += load * contrib; | ||
3174 | if (runnable) | ||
3175 | sa->runnable_load_sum += runnable * contrib; | ||
3176 | if (running) | ||
3177 | sa->util_sum += contrib * scale_cpu; | ||
3178 | |||
3179 | return periods; | ||
3180 | } | ||
3181 | |||
3182 | /* | ||
3183 | * We can represent the historical contribution to runnable average as the | ||
3184 | * coefficients of a geometric series. To do this we sub-divide our runnable | ||
3185 | * history into segments of approximately 1ms (1024us); label the segment that | ||
3186 | * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. | ||
3187 | * | ||
3188 | * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... | ||
3189 | * p0 p1 p2 | ||
3190 | * (now) (~1ms ago) (~2ms ago) | ||
3191 | * | ||
3192 | * Let u_i denote the fraction of p_i that the entity was runnable. | ||
3193 | * | ||
3194 | * We then designate the fractions u_i as our co-efficients, yielding the | ||
3195 | * following representation of historical load: | ||
3196 | * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... | ||
3197 | * | ||
3198 | * We choose y based on the with of a reasonably scheduling period, fixing: | ||
3199 | * y^32 = 0.5 | ||
3200 | * | ||
3201 | * This means that the contribution to load ~32ms ago (u_32) will be weighted | ||
3202 | * approximately half as much as the contribution to load within the last ms | ||
3203 | * (u_0). | ||
3204 | * | ||
3205 | * When a period "rolls over" and we have new u_0`, multiplying the previous | ||
3206 | * sum again by y is sufficient to update: | ||
3207 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | ||
3208 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | ||
3209 | */ | ||
3210 | static __always_inline int | ||
3211 | ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, | ||
3212 | unsigned long load, unsigned long runnable, int running) | ||
3213 | { | ||
3214 | u64 delta; | ||
3215 | |||
3216 | delta = now - sa->last_update_time; | ||
3217 | /* | ||
3218 | * This should only happen when time goes backwards, which it | ||
3219 | * unfortunately does during sched clock init when we swap over to TSC. | ||
3220 | */ | ||
3221 | if ((s64)delta < 0) { | ||
3222 | sa->last_update_time = now; | ||
3223 | return 0; | ||
3224 | } | ||
3225 | |||
3226 | /* | ||
3227 | * Use 1024ns as the unit of measurement since it's a reasonable | ||
3228 | * approximation of 1us and fast to compute. | ||
3229 | */ | ||
3230 | delta >>= 10; | ||
3231 | if (!delta) | ||
3232 | return 0; | ||
3233 | |||
3234 | sa->last_update_time += delta << 10; | ||
3235 | |||
3236 | /* | ||
3237 | * running is a subset of runnable (weight) so running can't be set if | ||
3238 | * runnable is clear. But there are some corner cases where the current | ||
3239 | * se has been already dequeued but cfs_rq->curr still points to it. | ||
3240 | * This means that weight will be 0 but not running for a sched_entity | ||
3241 | * but also for a cfs_rq if the latter becomes idle. As an example, | ||
3242 | * this happens during idle_balance() which calls | ||
3243 | * update_blocked_averages() | ||
3244 | */ | ||
3245 | if (!load) | ||
3246 | runnable = running = 0; | ||
3247 | |||
3248 | /* | ||
3249 | * Now we know we crossed measurement unit boundaries. The *_avg | ||
3250 | * accrues by two steps: | ||
3251 | * | ||
3252 | * Step 1: accumulate *_sum since last_update_time. If we haven't | ||
3253 | * crossed period boundaries, finish. | ||
3254 | */ | ||
3255 | if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) | ||
3256 | return 0; | ||
3257 | |||
3258 | return 1; | ||
3259 | } | ||
3260 | |||
3261 | static __always_inline void | ||
3262 | ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) | ||
3263 | { | ||
3264 | u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; | ||
3265 | |||
3266 | /* | ||
3267 | * Step 2: update *_avg. | ||
3268 | */ | ||
3269 | sa->load_avg = div_u64(load * sa->load_sum, divider); | ||
3270 | sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); | ||
3271 | sa->util_avg = sa->util_sum / divider; | ||
3272 | } | ||
3273 | |||
3274 | /* | ||
3275 | * When a task is dequeued, its estimated utilization should not be update if | ||
3276 | * its util_avg has not been updated at least once. | ||
3277 | * This flag is used to synchronize util_avg updates with util_est updates. | ||
3278 | * We map this information into the LSB bit of the utilization saved at | ||
3279 | * dequeue time (i.e. util_est.dequeued). | ||
3280 | */ | ||
3281 | #define UTIL_AVG_UNCHANGED 0x1 | ||
3282 | |||
3283 | static inline void cfs_se_util_change(struct sched_avg *avg) | ||
3284 | { | ||
3285 | unsigned int enqueued; | ||
3286 | |||
3287 | if (!sched_feat(UTIL_EST)) | ||
3288 | return; | ||
3289 | |||
3290 | /* Avoid store if the flag has been already set */ | ||
3291 | enqueued = avg->util_est.enqueued; | ||
3292 | if (!(enqueued & UTIL_AVG_UNCHANGED)) | ||
3293 | return; | ||
3294 | |||
3295 | /* Reset flag to report util_avg has been updated */ | ||
3296 | enqueued &= ~UTIL_AVG_UNCHANGED; | ||
3297 | WRITE_ONCE(avg->util_est.enqueued, enqueued); | ||
3298 | } | ||
3299 | |||
3300 | /* | ||
3301 | * sched_entity: | ||
3302 | * | ||
3303 | * task: | ||
3304 | * se_runnable() == se_weight() | ||
3305 | * | ||
3306 | * group: [ see update_cfs_group() ] | ||
3307 | * se_weight() = tg->weight * grq->load_avg / tg->load_avg | ||
3308 | * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg | ||
3309 | * | ||
3310 | * load_sum := runnable_sum | ||
3311 | * load_avg = se_weight(se) * runnable_avg | ||
3312 | * | ||
3313 | * runnable_load_sum := runnable_sum | ||
3314 | * runnable_load_avg = se_runnable(se) * runnable_avg | ||
3315 | * | ||
3316 | * XXX collapse load_sum and runnable_load_sum | ||
3317 | * | ||
3318 | * cfq_rs: | ||
3319 | * | ||
3320 | * load_sum = \Sum se_weight(se) * se->avg.load_sum | ||
3321 | * load_avg = \Sum se->avg.load_avg | ||
3322 | * | ||
3323 | * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum | ||
3324 | * runnable_load_avg = \Sum se->avg.runable_load_avg | ||
3325 | */ | ||
3326 | |||
3327 | static int | ||
3328 | __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) | ||
3329 | { | ||
3330 | if (entity_is_task(se)) | ||
3331 | se->runnable_weight = se->load.weight; | ||
3332 | |||
3333 | if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { | ||
3334 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | ||
3335 | return 1; | ||
3336 | } | ||
3337 | |||
3338 | return 0; | ||
3339 | } | ||
3340 | |||
3341 | static int | ||
3342 | __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3343 | { | ||
3344 | if (entity_is_task(se)) | ||
3345 | se->runnable_weight = se->load.weight; | ||
3346 | |||
3347 | if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, | ||
3348 | cfs_rq->curr == se)) { | ||
3349 | |||
3350 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | ||
3351 | cfs_se_util_change(&se->avg); | ||
3352 | return 1; | ||
3353 | } | ||
3354 | |||
3355 | return 0; | ||
3356 | } | ||
3357 | |||
3358 | static int | ||
3359 | __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | ||
3360 | { | ||
3361 | if (___update_load_sum(now, cpu, &cfs_rq->avg, | ||
3362 | scale_load_down(cfs_rq->load.weight), | ||
3363 | scale_load_down(cfs_rq->runnable_weight), | ||
3364 | cfs_rq->curr != NULL)) { | ||
3365 | |||
3366 | ___update_load_avg(&cfs_rq->avg, 1, 1); | ||
3367 | return 1; | ||
3368 | } | ||
3369 | |||
3370 | return 0; | ||
3371 | } | ||
3372 | |||
3373 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2984 | #ifdef CONFIG_FAIR_GROUP_SCHED |
3374 | /** | 2985 | /** |
3375 | * update_tg_load_avg - update the tg's load avg | 2986 | * update_tg_load_avg - update the tg's load avg |
@@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | |||
4037 | 3648 | ||
4038 | #else /* CONFIG_SMP */ | 3649 | #else /* CONFIG_SMP */ |
4039 | 3650 | ||
4040 | static inline int | ||
4041 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | ||
4042 | { | ||
4043 | return 0; | ||
4044 | } | ||
4045 | |||
4046 | #define UPDATE_TG 0x0 | 3651 | #define UPDATE_TG 0x0 |
4047 | #define SKIP_AGE_LOAD 0x0 | 3652 | #define SKIP_AGE_LOAD 0x0 |
4048 | #define DO_ATTACH 0x0 | 3653 | #define DO_ATTACH 0x0 |
@@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg, | |||
4726 | throttled_hierarchy(dest_cfs_rq); | 4331 | throttled_hierarchy(dest_cfs_rq); |
4727 | } | 4332 | } |
4728 | 4333 | ||
4729 | /* updated child weight may affect parent so we have to do this bottom up */ | ||
4730 | static int tg_unthrottle_up(struct task_group *tg, void *data) | 4334 | static int tg_unthrottle_up(struct task_group *tg, void *data) |
4731 | { | 4335 | { |
4732 | struct rq *rq = data; | 4336 | struct rq *rq = data; |
@@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load, | |||
5653 | 5257 | ||
5654 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | 5258 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; |
5655 | } | 5259 | } |
5656 | |||
5657 | sched_avg_update(this_rq); | ||
5658 | } | 5260 | } |
5659 | 5261 | ||
5660 | /* Used instead of source_load when we know the type == 0 */ | 5262 | /* Used instead of source_load when we know the type == 0 */ |
@@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) | |||
7294 | static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | 6896 | static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) |
7295 | { | 6897 | { |
7296 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | 6898 | struct numa_group *numa_group = rcu_dereference(p->numa_group); |
7297 | unsigned long src_faults, dst_faults; | 6899 | unsigned long src_weight, dst_weight; |
7298 | int src_nid, dst_nid; | 6900 | int src_nid, dst_nid, dist; |
7299 | 6901 | ||
7300 | if (!static_branch_likely(&sched_numa_balancing)) | 6902 | if (!static_branch_likely(&sched_numa_balancing)) |
7301 | return -1; | 6903 | return -1; |
@@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
7322 | return 0; | 6924 | return 0; |
7323 | 6925 | ||
7324 | /* Leaving a core idle is often worse than degrading locality. */ | 6926 | /* Leaving a core idle is often worse than degrading locality. */ |
7325 | if (env->idle != CPU_NOT_IDLE) | 6927 | if (env->idle == CPU_IDLE) |
7326 | return -1; | 6928 | return -1; |
7327 | 6929 | ||
6930 | dist = node_distance(src_nid, dst_nid); | ||
7328 | if (numa_group) { | 6931 | if (numa_group) { |
7329 | src_faults = group_faults(p, src_nid); | 6932 | src_weight = group_weight(p, src_nid, dist); |
7330 | dst_faults = group_faults(p, dst_nid); | 6933 | dst_weight = group_weight(p, dst_nid, dist); |
7331 | } else { | 6934 | } else { |
7332 | src_faults = task_faults(p, src_nid); | 6935 | src_weight = task_weight(p, src_nid, dist); |
7333 | dst_faults = task_faults(p, dst_nid); | 6936 | dst_weight = task_weight(p, dst_nid, dist); |
7334 | } | 6937 | } |
7335 | 6938 | ||
7336 | return dst_faults < src_faults; | 6939 | return dst_weight < src_weight; |
7337 | } | 6940 | } |
7338 | 6941 | ||
7339 | #else | 6942 | #else |
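The reworked migrate_degrades_locality() above compares distance-scaled NUMA weights instead of raw fault counts, and only a fully idle destination (CPU_IDLE) skips the check. A sketch of that decision with made-up weights standing in for the task_weight()/group_weight() scores at node_distance(src, dst):

#include <stdio.h>

static int degrades_locality(int dst_is_idle,
                             unsigned long src_weight, unsigned long dst_weight)
{
        if (dst_is_idle)
                return -1;      /* leaving a core idle is worse than degrading locality */
        return dst_weight < src_weight;
}

int main(void)
{
        printf("%d\n", degrades_locality(1, 700, 300)); /* -1: locality does not matter  */
        printf("%d\n", degrades_locality(0, 700, 300)); /*  1: migration degrades locality */
        printf("%d\n", degrades_locality(0, 300, 700)); /*  0: migration improves locality */
        return 0;
}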
@@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) | |||
7620 | return false; | 7223 | return false; |
7621 | } | 7224 | } |
7622 | 7225 | ||
7226 | static inline bool others_have_blocked(struct rq *rq) | ||
7227 | { | ||
7228 | if (READ_ONCE(rq->avg_rt.util_avg)) | ||
7229 | return true; | ||
7230 | |||
7231 | if (READ_ONCE(rq->avg_dl.util_avg)) | ||
7232 | return true; | ||
7233 | |||
7234 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
7235 | if (READ_ONCE(rq->avg_irq.util_avg)) | ||
7236 | return true; | ||
7237 | #endif | ||
7238 | |||
7239 | return false; | ||
7240 | } | ||
7241 | |||
7623 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7242 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7624 | 7243 | ||
7625 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | 7244 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) |
@@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu) | |||
7679 | if (cfs_rq_has_blocked(cfs_rq)) | 7298 | if (cfs_rq_has_blocked(cfs_rq)) |
7680 | done = false; | 7299 | done = false; |
7681 | } | 7300 | } |
7301 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); | ||
7302 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); | ||
7303 | update_irq_load_avg(rq, 0); | ||
7304 | /* Don't need periodic decay once load/util_avg are null */ | ||
7305 | if (others_have_blocked(rq)) | ||
7306 | done = false; | ||
7682 | 7307 | ||
7683 | #ifdef CONFIG_NO_HZ_COMMON | 7308 | #ifdef CONFIG_NO_HZ_COMMON |
7684 | rq->last_blocked_load_update_tick = jiffies; | 7309 | rq->last_blocked_load_update_tick = jiffies; |
@@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu) | |||
7744 | rq_lock_irqsave(rq, &rf); | 7369 | rq_lock_irqsave(rq, &rf); |
7745 | update_rq_clock(rq); | 7370 | update_rq_clock(rq); |
7746 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); | 7371 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); |
7372 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); | ||
7373 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); | ||
7374 | update_irq_load_avg(rq, 0); | ||
7747 | #ifdef CONFIG_NO_HZ_COMMON | 7375 | #ifdef CONFIG_NO_HZ_COMMON |
7748 | rq->last_blocked_load_update_tick = jiffies; | 7376 | rq->last_blocked_load_update_tick = jiffies; |
7749 | if (!cfs_rq_has_blocked(cfs_rq)) | 7377 | if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) |
7750 | rq->has_blocked_load = 0; | 7378 | rq->has_blocked_load = 0; |
7751 | #endif | 7379 | #endif |
7752 | rq_unlock_irqrestore(rq, &rf); | 7380 | rq_unlock_irqrestore(rq, &rf); |
@@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
7856 | static unsigned long scale_rt_capacity(int cpu) | 7484 | static unsigned long scale_rt_capacity(int cpu) |
7857 | { | 7485 | { |
7858 | struct rq *rq = cpu_rq(cpu); | 7486 | struct rq *rq = cpu_rq(cpu); |
7859 | u64 total, used, age_stamp, avg; | 7487 | unsigned long max = arch_scale_cpu_capacity(NULL, cpu); |
7860 | s64 delta; | 7488 | unsigned long used, free; |
7489 | unsigned long irq; | ||
7861 | 7490 | ||
7862 | /* | 7491 | irq = cpu_util_irq(rq); |
7863 | * Since we're reading these variables without serialization make sure | ||
7864 | * we read them once before doing sanity checks on them. | ||
7865 | */ | ||
7866 | age_stamp = READ_ONCE(rq->age_stamp); | ||
7867 | avg = READ_ONCE(rq->rt_avg); | ||
7868 | delta = __rq_clock_broken(rq) - age_stamp; | ||
7869 | 7492 | ||
7870 | if (unlikely(delta < 0)) | 7493 | if (unlikely(irq >= max)) |
7871 | delta = 0; | 7494 | return 1; |
7872 | 7495 | ||
7873 | total = sched_avg_period() + delta; | 7496 | used = READ_ONCE(rq->avg_rt.util_avg); |
7497 | used += READ_ONCE(rq->avg_dl.util_avg); | ||
7874 | 7498 | ||
7875 | used = div_u64(avg, total); | 7499 | if (unlikely(used >= max)) |
7500 | return 1; | ||
7876 | 7501 | ||
7877 | if (likely(used < SCHED_CAPACITY_SCALE)) | 7502 | free = max - used; |
7878 | return SCHED_CAPACITY_SCALE - used; | ||
7879 | 7503 | ||
7880 | return 1; | 7504 | return scale_irq_capacity(free, irq, max); |
7881 | } | 7505 | } |
7882 | 7506 | ||
7883 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 7507 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
7884 | { | 7508 | { |
7885 | unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); | 7509 | unsigned long capacity = scale_rt_capacity(cpu); |
7886 | struct sched_group *sdg = sd->groups; | 7510 | struct sched_group *sdg = sd->groups; |
7887 | 7511 | ||
7888 | cpu_rq(cpu)->cpu_capacity_orig = capacity; | 7512 | cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); |
7889 | |||
7890 | capacity *= scale_rt_capacity(cpu); | ||
7891 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
7892 | 7513 | ||
7893 | if (!capacity) | 7514 | if (!capacity) |
7894 | capacity = 1; | 7515 | capacity = 1; |
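The rewritten scale_rt_capacity() above derives the capacity left for CFS from the new rq-level PELT signals: subtract RT and DL utilization from the CPU capacity, then shrink the remainder in proportion to IRQ time. A rough user-space model with illustrative values standing in for rq->avg_rt.util_avg, rq->avg_dl.util_avg and rq->avg_irq.util_avg; the final line approximates what scale_irq_capacity() does when IRQ time accounting is enabled:

#include <stdio.h>

int main(void)
{
        unsigned long max = 1024;       /* arch_scale_cpu_capacity()       */
        unsigned long rt_util = 100;    /* rq->avg_rt.util_avg (invented)  */
        unsigned long dl_util = 50;     /* rq->avg_dl.util_avg (invented)  */
        unsigned long irq = 64;         /* rq->avg_irq.util_avg (invented) */
        unsigned long used, free, capacity;

        used = rt_util + dl_util;
        if (irq >= max || used >= max) {
                printf("capacity = 1\n");
                return 0;
        }
        free = max - used;
        capacity = free * (max - irq) / max;    /* ~ scale_irq_capacity(free, irq, max) */
        printf("capacity left for CFS: %lu of %lu\n", capacity, max);   /* 819 of 1024 */
        return 0;
}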
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c new file mode 100644 index 000000000000..35475c0c5419 --- /dev/null +++ b/kernel/sched/pelt.c | |||
@@ -0,0 +1,399 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Per Entity Load Tracking | ||
4 | * | ||
5 | * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
6 | * | ||
7 | * Interactivity improvements by Mike Galbraith | ||
8 | * (C) 2007 Mike Galbraith <efault@gmx.de> | ||
9 | * | ||
10 | * Various enhancements by Dmitry Adamushko. | ||
11 | * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> | ||
12 | * | ||
13 | * Group scheduling enhancements by Srivatsa Vaddagiri | ||
14 | * Copyright IBM Corporation, 2007 | ||
15 | * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> | ||
16 | * | ||
17 | * Scaled math optimizations by Thomas Gleixner | ||
18 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> | ||
19 | * | ||
20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | ||
21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | ||
22 | * | ||
23 | * Move PELT related code from fair.c into this pelt.c file | ||
24 | * Author: Vincent Guittot <vincent.guittot@linaro.org> | ||
25 | */ | ||
26 | |||
27 | #include <linux/sched.h> | ||
28 | #include "sched.h" | ||
29 | #include "sched-pelt.h" | ||
30 | #include "pelt.h" | ||
31 | |||
32 | /* | ||
33 | * Approximate: | ||
34 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | ||
35 | */ | ||
36 | static u64 decay_load(u64 val, u64 n) | ||
37 | { | ||
38 | unsigned int local_n; | ||
39 | |||
40 | if (unlikely(n > LOAD_AVG_PERIOD * 63)) | ||
41 | return 0; | ||
42 | |||
43 | /* after bounds checking we can collapse to 32-bit */ | ||
44 | local_n = n; | ||
45 | |||
46 | /* | ||
47 | * As y^PERIOD = 1/2, we can combine | ||
48 | * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) | ||
49 | * With a look-up table which covers y^n (n<PERIOD) | ||
50 | * | ||
51 | * To achieve constant time decay_load. | ||
52 | */ | ||
53 | if (unlikely(local_n >= LOAD_AVG_PERIOD)) { | ||
54 | val >>= local_n / LOAD_AVG_PERIOD; | ||
55 | local_n %= LOAD_AVG_PERIOD; | ||
56 | } | ||
57 | |||
58 | val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32); | ||
59 | return val; | ||
60 | } | ||
61 | |||
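A floating-point reference for decay_load() above, handy for checking the shift-plus-table decomposition: with y^32 = 0.5, y^n splits into a right shift for the full 32-period chunks and a factor y^(n%32) for the remainder, which the kernel takes from the fixed-point runnable_avg_yN_inv[] table. Compile with -lm; illustration only:

#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);  /* ~0.97857 */
        unsigned long val = 1024;
        unsigned int n = 70;                    /* 2 full halvings plus 6 periods */

        double exact = val * pow(y, n);
        double split = (double)(val >> (n / 32)) * pow(y, n % 32);

        printf("exact: %.1f  split: %.1f\n", exact, split);     /* both ~224.8 */
        return 0;
}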
62 | static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) | ||
63 | { | ||
64 | u32 c1, c2, c3 = d3; /* y^0 == 1 */ | ||
65 | |||
66 | /* | ||
67 | * c1 = d1 y^p | ||
68 | */ | ||
69 | c1 = decay_load((u64)d1, periods); | ||
70 | |||
71 | /* | ||
72 | * p-1 | ||
73 | * c2 = 1024 \Sum y^n | ||
74 | * n=1 | ||
75 | * | ||
76 | * inf inf | ||
77 | * = 1024 ( \Sum y^n - \Sum y^n - y^0 ) | ||
78 | * n=0 n=p | ||
79 | */ | ||
80 | c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; | ||
81 | |||
82 | return c1 + c2 + c3; | ||
83 | } | ||
84 | |||
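A floating-point check of the c1 + c2 + c3 decomposition in __accumulate_pelt_segments() above, with made-up d1/d3 microsecond amounts and p = 4 full periods. LOAD_AVG_MAX is 1024 * \Sum y^n; the kernel's fixed-point value is 47742, the float sum here comes out slightly larger. Compile with -lm:

#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        const double load_avg_max = 1024.0 / (1.0 - y);
        double d1 = 400.0, d3 = 200.0;          /* partial periods, in us */
        unsigned int p = 4;                     /* full periods crossed   */

        double c1 = d1 * pow(y, p);
        double c2 = load_avg_max - load_avg_max * pow(y, p) - 1024.0;
        double c3 = d3;

        /* c2 equals 1024 * (y + y^2 + ... + y^(p-1)) */
        printf("contrib = %.0f + %.0f + %.0f = %.0f\n", c1, c2, c3, c1 + c2 + c3);
        return 0;
}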
85 | #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) | ||
86 | |||
87 | /* | ||
88 | * Accumulate the three separate parts of the sum; d1 the remainder | ||
89 | * of the last (incomplete) period, d2 the span of full periods and d3 | ||
90 | * the remainder of the (incomplete) current period. | ||
91 | * | ||
92 | * d1 d2 d3 | ||
93 | * ^ ^ ^ | ||
94 | * | | | | ||
95 | * |<->|<----------------->|<--->| | ||
96 | * ... |---x---|------| ... |------|-----x (now) | ||
97 | * | ||
98 | * p-1 | ||
99 | * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0 | ||
100 | * n=1 | ||
101 | * | ||
102 | * = u y^p + (Step 1) | ||
103 | * | ||
104 | * p-1 | ||
105 | * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2) | ||
106 | * n=1 | ||
107 | */ | ||
108 | static __always_inline u32 | ||
109 | accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | ||
110 | unsigned long load, unsigned long runnable, int running) | ||
111 | { | ||
112 | unsigned long scale_freq, scale_cpu; | ||
113 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ | ||
114 | u64 periods; | ||
115 | |||
116 | scale_freq = arch_scale_freq_capacity(cpu); | ||
117 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
118 | |||
119 | delta += sa->period_contrib; | ||
120 | periods = delta / 1024; /* A period is 1024us (~1ms) */ | ||
121 | |||
122 | /* | ||
123 | * Step 1: decay old *_sum if we crossed period boundaries. | ||
124 | */ | ||
125 | if (periods) { | ||
126 | sa->load_sum = decay_load(sa->load_sum, periods); | ||
127 | sa->runnable_load_sum = | ||
128 | decay_load(sa->runnable_load_sum, periods); | ||
129 | sa->util_sum = decay_load((u64)(sa->util_sum), periods); | ||
130 | |||
131 | /* | ||
132 | * Step 2 | ||
133 | */ | ||
134 | delta %= 1024; | ||
135 | contrib = __accumulate_pelt_segments(periods, | ||
136 | 1024 - sa->period_contrib, delta); | ||
137 | } | ||
138 | sa->period_contrib = delta; | ||
139 | |||
140 | contrib = cap_scale(contrib, scale_freq); | ||
141 | if (load) | ||
142 | sa->load_sum += load * contrib; | ||
143 | if (runnable) | ||
144 | sa->runnable_load_sum += runnable * contrib; | ||
145 | if (running) | ||
146 | sa->util_sum += contrib * scale_cpu; | ||
147 | |||
148 | return periods; | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * We can represent the historical contribution to runnable average as the | ||
153 | * coefficients of a geometric series. To do this we sub-divide our runnable | ||
154 | * history into segments of approximately 1ms (1024us); label the segment that | ||
155 | * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. | ||
156 | * | ||
157 | * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... | ||
158 | * p0 p1 p2 | ||
159 | * (now) (~1ms ago) (~2ms ago) | ||
160 | * | ||
161 | * Let u_i denote the fraction of p_i that the entity was runnable. | ||
162 | * | ||
163 | * We then designate the fractions u_i as our co-efficients, yielding the | ||
164 | * following representation of historical load: | ||
165 | * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... | ||
166 | * | ||
167 | * We choose y based on the width of a reasonable scheduling period, fixing: | ||
168 | * y^32 = 0.5 | ||
169 | * | ||
170 | * This means that the contribution to load ~32ms ago (u_32) will be weighted | ||
171 | * approximately half as much as the contribution to load within the last ms | ||
172 | * (u_0). | ||
173 | * | ||
174 | * When a period "rolls over" and we have new u_0`, multiplying the previous | ||
175 | * sum again by y is sufficient to update: | ||
176 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | ||
177 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | ||
178 | */ | ||
179 | static __always_inline int | ||
180 | ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, | ||
181 | unsigned long load, unsigned long runnable, int running) | ||
182 | { | ||
183 | u64 delta; | ||
184 | |||
185 | delta = now - sa->last_update_time; | ||
186 | /* | ||
187 | * This should only happen when time goes backwards, which it | ||
188 | * unfortunately does during sched clock init when we swap over to TSC. | ||
189 | */ | ||
190 | if ((s64)delta < 0) { | ||
191 | sa->last_update_time = now; | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Use 1024ns as the unit of measurement since it's a reasonable | ||
197 | * approximation of 1us and fast to compute. | ||
198 | */ | ||
199 | delta >>= 10; | ||
200 | if (!delta) | ||
201 | return 0; | ||
202 | |||
203 | sa->last_update_time += delta << 10; | ||
204 | |||
205 | /* | ||
206 | * running is a subset of runnable (weight) so running can't be set if | ||
207 | * runnable is clear. But there are some corner cases where the current | ||
208 | * se has been already dequeued but cfs_rq->curr still points to it. | ||
209 | * This means that weight will be 0 but not running for a sched_entity | ||
210 | * but also for a cfs_rq if the latter becomes idle. As an example, | ||
211 | * this happens during idle_balance() which calls | ||
212 | * update_blocked_averages() | ||
213 | */ | ||
214 | if (!load) | ||
215 | runnable = running = 0; | ||
216 | |||
217 | /* | ||
218 | * Now we know we crossed measurement unit boundaries. The *_avg | ||
219 | * accrues by two steps: | ||
220 | * | ||
221 | * Step 1: accumulate *_sum since last_update_time. If we haven't | ||
222 | * crossed period boundaries, finish. | ||
223 | */ | ||
224 | if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) | ||
225 | return 0; | ||
226 | |||
227 | return 1; | ||
228 | } | ||
229 | |||
230 | static __always_inline void | ||
231 | ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) | ||
232 | { | ||
233 | u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; | ||
234 | |||
235 | /* | ||
236 | * Step 2: update *_avg. | ||
237 | */ | ||
238 | sa->load_avg = div_u64(load * sa->load_sum, divider); | ||
239 | sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); | ||
240 | WRITE_ONCE(sa->util_avg, sa->util_sum / divider); | ||
241 | } | ||
242 | |||
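A toy convergence check for the sum/avg split above: a CPU that is busy in every period accumulates util_sum toward cpu_scale * LOAD_AVG_MAX, and the division in ___update_load_avg() then yields a util_avg of about 1024. This float model ignores period_contrib and fixed-point rounding and is only meant to show the bound. Compile with -lm:

#include <stdio.h>
#include <math.h>

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0);
        const double max_sum = 1024.0 / (1.0 - y);      /* ~LOAD_AVG_MAX */
        double util_sum = 0.0;
        int period;

        for (period = 0; period < 400; period++)        /* 100% running  */
                util_sum = util_sum * y + 1024.0 * 1024.0;      /* contrib(us) * cpu_scale */

        printf("util_avg ~= %.0f\n", util_sum / max_sum);       /* -> 1024 */
        return 0;
}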
243 | /* | ||
244 | * sched_entity: | ||
245 | * | ||
246 | * task: | ||
247 | * se_runnable() == se_weight() | ||
248 | * | ||
249 | * group: [ see update_cfs_group() ] | ||
250 | * se_weight() = tg->weight * grq->load_avg / tg->load_avg | ||
251 | * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg | ||
252 | * | ||
253 | * load_sum := runnable_sum | ||
254 | * load_avg = se_weight(se) * runnable_avg | ||
255 | * | ||
256 | * runnable_load_sum := runnable_sum | ||
257 | * runnable_load_avg = se_runnable(se) * runnable_avg | ||
258 | * | ||
259 | * XXX collapse load_sum and runnable_load_sum | ||
260 | * | ||
261 | * cfs_rq: | ||
262 | * | ||
263 | * load_sum = \Sum se_weight(se) * se->avg.load_sum | ||
264 | * load_avg = \Sum se->avg.load_avg | ||
265 | * | ||
266 | * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum | ||
267 | * runnable_load_avg = \Sum se->avg.runable_load_avg | ||
268 | */ | ||
269 | |||
270 | int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) | ||
271 | { | ||
272 | if (entity_is_task(se)) | ||
273 | se->runnable_weight = se->load.weight; | ||
274 | |||
275 | if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { | ||
276 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | ||
277 | return 1; | ||
278 | } | ||
279 | |||
280 | return 0; | ||
281 | } | ||
282 | |||
283 | int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
284 | { | ||
285 | if (entity_is_task(se)) | ||
286 | se->runnable_weight = se->load.weight; | ||
287 | |||
288 | if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, | ||
289 | cfs_rq->curr == se)) { | ||
290 | |||
291 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | ||
292 | cfs_se_util_change(&se->avg); | ||
293 | return 1; | ||
294 | } | ||
295 | |||
296 | return 0; | ||
297 | } | ||
298 | |||
299 | int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | ||
300 | { | ||
301 | if (___update_load_sum(now, cpu, &cfs_rq->avg, | ||
302 | scale_load_down(cfs_rq->load.weight), | ||
303 | scale_load_down(cfs_rq->runnable_weight), | ||
304 | cfs_rq->curr != NULL)) { | ||
305 | |||
306 | ___update_load_avg(&cfs_rq->avg, 1, 1); | ||
307 | return 1; | ||
308 | } | ||
309 | |||
310 | return 0; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * rt_rq: | ||
315 | * | ||
316 | * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked | ||
317 | * util_sum = cpu_scale * load_sum | ||
318 | * runnable_load_sum = load_sum | ||
319 | * | ||
320 | * load_avg and runnable_load_avg are not supported and meaningless. | ||
321 | * | ||
322 | */ | ||
323 | |||
324 | int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) | ||
325 | { | ||
326 | if (___update_load_sum(now, rq->cpu, &rq->avg_rt, | ||
327 | running, | ||
328 | running, | ||
329 | running)) { | ||
330 | |||
331 | ___update_load_avg(&rq->avg_rt, 1, 1); | ||
332 | return 1; | ||
333 | } | ||
334 | |||
335 | return 0; | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * dl_rq: | ||
340 | * | ||
341 | * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked | ||
342 | * util_sum = cpu_scale * load_sum | ||
343 | * runnable_load_sum = load_sum | ||
344 | * | ||
345 | */ | ||
346 | |||
347 | int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) | ||
348 | { | ||
349 | if (___update_load_sum(now, rq->cpu, &rq->avg_dl, | ||
350 | running, | ||
351 | running, | ||
352 | running)) { | ||
353 | |||
354 | ___update_load_avg(&rq->avg_dl, 1, 1); | ||
355 | return 1; | ||
356 | } | ||
357 | |||
358 | return 0; | ||
359 | } | ||
360 | |||
361 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
362 | /* | ||
363 | * irq: | ||
364 | * | ||
365 | * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked | ||
366 | * util_sum = cpu_scale * load_sum | ||
367 | * runnable_load_sum = load_sum | ||
368 | * | ||
369 | */ | ||
370 | |||
371 | int update_irq_load_avg(struct rq *rq, u64 running) | ||
372 | { | ||
373 | int ret = 0; | ||
374 | /* | ||
375 | * We know how much time has been used by interrupts since the last | ||
376 | * update, but not when. Be pessimistic and assume the interrupt has | ||
377 | * happened just before this update. This is not far from reality, | ||
378 | * because an interrupt will most probably wake up a task and trigger | ||
379 | * an update of the rq clock, during which the metric is updated. | ||
380 | * We first decay over the normal context time and then add the | ||
381 | * interrupt context time. | ||
382 | * We can safely remove running from rq->clock because | ||
383 | * rq->clock += delta with delta >= running. | ||
384 | */ | ||
385 | ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, | ||
386 | 0, | ||
387 | 0, | ||
388 | 0); | ||
389 | ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, | ||
390 | 1, | ||
391 | 1, | ||
392 | 1); | ||
393 | |||
394 | if (ret) | ||
395 | ___update_load_avg(&rq->avg_irq, 1, 1); | ||
396 | |||
397 | return ret; | ||
398 | } | ||
399 | #endif | ||
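With the comment above in mind, the two ___update_load_sum() calls split the elapsed window into a decay-only part (everything up to rq->clock - running, contributing nothing) and an accrual part (the last 'running' nanoseconds, contributing fully). A toy calculation with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long last_update = 1000000;	/* avg_irq's last stamp */
		unsigned long long clock_now   = 1050000;	/* current rq->clock    */
		unsigned long long running     = 8000;		/* irq time since then  */

		/* first call: decay only, over everything before the irq window */
		unsigned long long decay_span  = (clock_now - running) - last_update;
		/* second call: full contribution, over the irq window itself */
		unsigned long long accrue_span = running;

		printf("decay %llu ns, then accrue %llu ns\n",
		       decay_span, accrue_span);			/* 42000, 8000 */
		return 0;
	}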
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h new file mode 100644 index 000000000000..d2894db28955 --- /dev/null +++ b/kernel/sched/pelt.h | |||
@@ -0,0 +1,72 @@ | |||
1 | #ifdef CONFIG_SMP | ||
2 | |||
3 | int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); | ||
4 | int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); | ||
5 | int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); | ||
6 | int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); | ||
7 | int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); | ||
8 | |||
9 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
10 | int update_irq_load_avg(struct rq *rq, u64 running); | ||
11 | #else | ||
12 | static inline int | ||
13 | update_irq_load_avg(struct rq *rq, u64 running) | ||
14 | { | ||
15 | return 0; | ||
16 | } | ||
17 | #endif | ||
18 | |||
19 | /* | ||
20 | * When a task is dequeued, its estimated utilization should not be updated if | ||
21 | * its util_avg has not been updated at least once. | ||
22 | * This flag is used to synchronize util_avg updates with util_est updates. | ||
23 | * We map this information into the LSB of the utilization saved at | ||
24 | * dequeue time (i.e. util_est.dequeued). | ||
25 | */ | ||
26 | #define UTIL_AVG_UNCHANGED 0x1 | ||
27 | |||
28 | static inline void cfs_se_util_change(struct sched_avg *avg) | ||
29 | { | ||
30 | unsigned int enqueued; | ||
31 | |||
32 | if (!sched_feat(UTIL_EST)) | ||
33 | return; | ||
34 | |||
35 | /* Avoid store if the flag has been already set */ | ||
36 | enqueued = avg->util_est.enqueued; | ||
37 | if (!(enqueued & UTIL_AVG_UNCHANGED)) | ||
38 | return; | ||
39 | |||
40 | /* Reset flag to report util_avg has been updated */ | ||
41 | enqueued &= ~UTIL_AVG_UNCHANGED; | ||
42 | WRITE_ONCE(avg->util_est.enqueued, enqueued); | ||
43 | } | ||
44 | |||
45 | #else | ||
46 | |||
47 | static inline int | ||
48 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | ||
49 | { | ||
50 | return 0; | ||
51 | } | ||
52 | |||
53 | static inline int | ||
54 | update_rt_rq_load_avg(u64 now, struct rq *rq, int running) | ||
55 | { | ||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | static inline int | ||
60 | update_dl_rq_load_avg(u64 now, struct rq *rq, int running) | ||
61 | { | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | static inline int | ||
66 | update_irq_load_avg(struct rq *rq, u64 running) | ||
67 | { | ||
68 | return 0; | ||
69 | } | ||
70 | #endif | ||
71 | |||
72 | |||
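The UTIL_AVG_UNCHANGED convention above keeps the flag and the estimate in a single word: the flag is ORed into the LSB when the value is saved at dequeue time, and cfs_se_util_change() clears it once util_avg has actually moved. A tiny stand-alone sketch of that encoding (the value 424 is arbitrary):

	#include <stdio.h>

	#define UTIL_AVG_UNCHANGED 0x1

	int main(void)
	{
		unsigned int enqueued = 424;		/* some saved utilization */

		/* dequeue path: mark "util_avg not updated since this snapshot" */
		enqueued |= UTIL_AVG_UNCHANGED;		/* 425 */

		/* cfs_se_util_change() equivalent: util_avg changed, clear flag */
		if (enqueued & UTIL_AVG_UNCHANGED)
			enqueued &= ~UTIL_AVG_UNCHANGED;	/* back to 424 */

		printf("value=%u flag=%u\n", enqueued,
		       enqueued & UTIL_AVG_UNCHANGED);
		return 0;
	}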
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index eaaec8364f96..2e2955a8cf8f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -5,6 +5,8 @@ | |||
5 | */ | 5 | */ |
6 | #include "sched.h" | 6 | #include "sched.h" |
7 | 7 | ||
8 | #include "pelt.h" | ||
9 | |||
8 | int sched_rr_timeslice = RR_TIMESLICE; | 10 | int sched_rr_timeslice = RR_TIMESLICE; |
9 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | 11 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; |
10 | 12 | ||
@@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq) | |||
973 | curr->se.exec_start = now; | 975 | curr->se.exec_start = now; |
974 | cgroup_account_cputime(curr, delta_exec); | 976 | cgroup_account_cputime(curr, delta_exec); |
975 | 977 | ||
976 | sched_rt_avg_update(rq, delta_exec); | ||
977 | |||
978 | if (!rt_bandwidth_enabled()) | 978 | if (!rt_bandwidth_enabled()) |
979 | return; | 979 | return; |
980 | 980 | ||
@@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1578 | 1578 | ||
1579 | rt_queue_push_tasks(rq); | 1579 | rt_queue_push_tasks(rq); |
1580 | 1580 | ||
1581 | /* | ||
1582 | * If prev task was rt, put_prev_task() has already updated the | ||
1583 | * utilization. We only care about the case where we start to schedule | ||
1584 | * an rt task. | ||
1585 | */ | ||
1586 | if (rq->curr->sched_class != &rt_sched_class) | ||
1587 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); | ||
1588 | |||
1581 | return p; | 1589 | return p; |
1582 | } | 1590 | } |
1583 | 1591 | ||
@@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1585 | { | 1593 | { |
1586 | update_curr_rt(rq); | 1594 | update_curr_rt(rq); |
1587 | 1595 | ||
1596 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); | ||
1597 | |||
1588 | /* | 1598 | /* |
1589 | * The previous task needs to be made eligible for pushing | 1599 | * The previous task needs to be made eligible for pushing |
1590 | * if it is still active | 1600 | * if it is still active |
@@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
2314 | struct sched_rt_entity *rt_se = &p->rt; | 2324 | struct sched_rt_entity *rt_se = &p->rt; |
2315 | 2325 | ||
2316 | update_curr_rt(rq); | 2326 | update_curr_rt(rq); |
2327 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); | ||
2317 | 2328 | ||
2318 | watchdog(rq, p); | 2329 | watchdog(rq, p); |
2319 | 2330 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c7742dcc136c..4a2e8cae63c4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -594,6 +594,7 @@ struct rt_rq { | |||
594 | unsigned long rt_nr_total; | 594 | unsigned long rt_nr_total; |
595 | int overloaded; | 595 | int overloaded; |
596 | struct plist_head pushable_tasks; | 596 | struct plist_head pushable_tasks; |
597 | |||
597 | #endif /* CONFIG_SMP */ | 598 | #endif /* CONFIG_SMP */ |
598 | int rt_queued; | 599 | int rt_queued; |
599 | 600 | ||
@@ -673,7 +674,26 @@ struct dl_rq { | |||
673 | u64 bw_ratio; | 674 | u64 bw_ratio; |
674 | }; | 675 | }; |
675 | 676 | ||
677 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
678 | /* An entity is a task if it doesn't "own" a runqueue */ | ||
679 | #define entity_is_task(se) (!se->my_q) | ||
680 | #else | ||
681 | #define entity_is_task(se) 1 | ||
682 | #endif | ||
683 | |||
676 | #ifdef CONFIG_SMP | 684 | #ifdef CONFIG_SMP |
685 | /* | ||
686 | * XXX we want to get rid of these helpers and use the full load resolution. | ||
687 | */ | ||
688 | static inline long se_weight(struct sched_entity *se) | ||
689 | { | ||
690 | return scale_load_down(se->load.weight); | ||
691 | } | ||
692 | |||
693 | static inline long se_runnable(struct sched_entity *se) | ||
694 | { | ||
695 | return scale_load_down(se->runnable_weight); | ||
696 | } | ||
677 | 697 | ||
678 | static inline bool sched_asym_prefer(int a, int b) | 698 | static inline bool sched_asym_prefer(int a, int b) |
679 | { | 699 | { |
@@ -833,8 +853,12 @@ struct rq { | |||
833 | 853 | ||
834 | struct list_head cfs_tasks; | 854 | struct list_head cfs_tasks; |
835 | 855 | ||
836 | u64 rt_avg; | 856 | struct sched_avg avg_rt; |
837 | u64 age_stamp; | 857 | struct sched_avg avg_dl; |
858 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
859 | #define HAVE_SCHED_AVG_IRQ | ||
860 | struct sched_avg avg_irq; | ||
861 | #endif | ||
838 | u64 idle_stamp; | 862 | u64 idle_stamp; |
839 | u64 avg_idle; | 863 | u64 avg_idle; |
840 | 864 | ||
@@ -1075,7 +1099,8 @@ enum numa_faults_stats { | |||
1075 | }; | 1099 | }; |
1076 | extern void sched_setnuma(struct task_struct *p, int node); | 1100 | extern void sched_setnuma(struct task_struct *p, int node); |
1077 | extern int migrate_task_to(struct task_struct *p, int cpu); | 1101 | extern int migrate_task_to(struct task_struct *p, int cpu); |
1078 | extern int migrate_swap(struct task_struct *, struct task_struct *); | 1102 | extern int migrate_swap(struct task_struct *p, struct task_struct *t, |
1103 | int cpu, int scpu); | ||
1079 | extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); | 1104 | extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); |
1080 | #else | 1105 | #else |
1081 | static inline void | 1106 | static inline void |
@@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); | |||
1690 | 1715 | ||
1691 | extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | 1716 | extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
1692 | 1717 | ||
1693 | extern const_debug unsigned int sysctl_sched_time_avg; | ||
1694 | extern const_debug unsigned int sysctl_sched_nr_migrate; | 1718 | extern const_debug unsigned int sysctl_sched_nr_migrate; |
1695 | extern const_debug unsigned int sysctl_sched_migration_cost; | 1719 | extern const_debug unsigned int sysctl_sched_migration_cost; |
1696 | 1720 | ||
1697 | static inline u64 sched_avg_period(void) | ||
1698 | { | ||
1699 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1700 | } | ||
1701 | |||
1702 | #ifdef CONFIG_SCHED_HRTICK | 1721 | #ifdef CONFIG_SCHED_HRTICK |
1703 | 1722 | ||
1704 | /* | 1723 | /* |
@@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu) | |||
1735 | #endif | 1754 | #endif |
1736 | 1755 | ||
1737 | #ifdef CONFIG_SMP | 1756 | #ifdef CONFIG_SMP |
1738 | extern void sched_avg_update(struct rq *rq); | ||
1739 | |||
1740 | #ifndef arch_scale_cpu_capacity | 1757 | #ifndef arch_scale_cpu_capacity |
1741 | static __always_inline | 1758 | static __always_inline |
1742 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 1759 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) |
@@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
1747 | return SCHED_CAPACITY_SCALE; | 1764 | return SCHED_CAPACITY_SCALE; |
1748 | } | 1765 | } |
1749 | #endif | 1766 | #endif |
1750 | |||
1751 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1752 | { | ||
1753 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); | ||
1754 | sched_avg_update(rq); | ||
1755 | } | ||
1756 | #else | 1767 | #else |
1757 | #ifndef arch_scale_cpu_capacity | 1768 | #ifndef arch_scale_cpu_capacity |
1758 | static __always_inline | 1769 | static __always_inline |
@@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) | |||
1761 | return SCHED_CAPACITY_SCALE; | 1772 | return SCHED_CAPACITY_SCALE; |
1762 | } | 1773 | } |
1763 | #endif | 1774 | #endif |
1764 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } | ||
1765 | static inline void sched_avg_update(struct rq *rq) { } | ||
1766 | #endif | 1775 | #endif |
1767 | 1776 | ||
1768 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1777 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
@@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | |||
2177 | #endif | 2186 | #endif |
2178 | 2187 | ||
2179 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | 2188 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
2180 | static inline unsigned long cpu_util_dl(struct rq *rq) | 2189 | static inline unsigned long cpu_bw_dl(struct rq *rq) |
2181 | { | 2190 | { |
2182 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; | 2191 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; |
2183 | } | 2192 | } |
2184 | 2193 | ||
2194 | static inline unsigned long cpu_util_dl(struct rq *rq) | ||
2195 | { | ||
2196 | return READ_ONCE(rq->avg_dl.util_avg); | ||
2197 | } | ||
2198 | |||
2185 | static inline unsigned long cpu_util_cfs(struct rq *rq) | 2199 | static inline unsigned long cpu_util_cfs(struct rq *rq) |
2186 | { | 2200 | { |
2187 | unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); | 2201 | unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); |
@@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) | |||
2193 | 2207 | ||
2194 | return util; | 2208 | return util; |
2195 | } | 2209 | } |
2210 | |||
2211 | static inline unsigned long cpu_util_rt(struct rq *rq) | ||
2212 | { | ||
2213 | return READ_ONCE(rq->avg_rt.util_avg); | ||
2214 | } | ||
2215 | #endif | ||
2216 | |||
2217 | #ifdef HAVE_SCHED_AVG_IRQ | ||
2218 | static inline unsigned long cpu_util_irq(struct rq *rq) | ||
2219 | { | ||
2220 | return rq->avg_irq.util_avg; | ||
2221 | } | ||
2222 | |||
2223 | static inline | ||
2224 | unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) | ||
2225 | { | ||
2226 | util *= (max - irq); | ||
2227 | util /= max; | ||
2228 | |||
2229 | return util; | ||
2230 | |||
2231 | } | ||
2232 | #else | ||
2233 | static inline unsigned long cpu_util_irq(struct rq *rq) | ||
2234 | { | ||
2235 | return 0; | ||
2236 | } | ||
2237 | |||
2238 | static inline | ||
2239 | unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) | ||
2240 | { | ||
2241 | return util; | ||
2242 | } | ||
2196 | #endif | 2243 | #endif |
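scale_irq_capacity() above simply removes the fraction of the CPU consumed in irq/steal context from a utilization value. A quick numeric check, assuming max = 1024 (SCHED_CAPACITY_SCALE):

	#include <stdio.h>

	static unsigned long scale_irq_capacity(unsigned long util,
						unsigned long irq, unsigned long max)
	{
		util *= (max - irq);
		util /= max;
		return util;
	}

	int main(void)
	{
		/* 1/8 of the CPU in irq context shrinks a util of 512 to 448 */
		printf("%lu\n", scale_irq_capacity(512, 128, 1024));
		return 0;
	}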
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index b6fb2c3b3ff7..66b59ac77c22 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c | |||
@@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q) | |||
32 | } | 32 | } |
33 | EXPORT_SYMBOL(swake_up_locked); | 33 | EXPORT_SYMBOL(swake_up_locked); |
34 | 34 | ||
35 | void swake_up(struct swait_queue_head *q) | 35 | void swake_up_one(struct swait_queue_head *q) |
36 | { | 36 | { |
37 | unsigned long flags; | 37 | unsigned long flags; |
38 | 38 | ||
@@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q) | |||
40 | swake_up_locked(q); | 40 | swake_up_locked(q); |
41 | raw_spin_unlock_irqrestore(&q->lock, flags); | 41 | raw_spin_unlock_irqrestore(&q->lock, flags); |
42 | } | 42 | } |
43 | EXPORT_SYMBOL(swake_up); | 43 | EXPORT_SYMBOL(swake_up_one); |
44 | 44 | ||
45 | /* | 45 | /* |
46 | * Does not allow usage from IRQ disabled, since we must be able to | 46 | * Does not allow usage from IRQ disabled, since we must be able to |
@@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q) | |||
69 | } | 69 | } |
70 | EXPORT_SYMBOL(swake_up_all); | 70 | EXPORT_SYMBOL(swake_up_all); |
71 | 71 | ||
72 | void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) | 72 | static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) |
73 | { | 73 | { |
74 | wait->task = current; | 74 | wait->task = current; |
75 | if (list_empty(&wait->task_list)) | 75 | if (list_empty(&wait->task_list)) |
76 | list_add(&wait->task_list, &q->task_list); | 76 | list_add_tail(&wait->task_list, &q->task_list); |
77 | } | 77 | } |
78 | 78 | ||
79 | void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) | 79 | void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state) |
80 | { | 80 | { |
81 | unsigned long flags; | 81 | unsigned long flags; |
82 | 82 | ||
@@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int | |||
85 | set_current_state(state); | 85 | set_current_state(state); |
86 | raw_spin_unlock_irqrestore(&q->lock, flags); | 86 | raw_spin_unlock_irqrestore(&q->lock, flags); |
87 | } | 87 | } |
88 | EXPORT_SYMBOL(prepare_to_swait); | 88 | EXPORT_SYMBOL(prepare_to_swait_exclusive); |
89 | 89 | ||
90 | long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) | 90 | long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) |
91 | { | 91 | { |
92 | if (signal_pending_state(state, current)) | 92 | unsigned long flags; |
93 | return -ERESTARTSYS; | 93 | long ret = 0; |
94 | 94 | ||
95 | prepare_to_swait(q, wait, state); | 95 | raw_spin_lock_irqsave(&q->lock, flags); |
96 | if (unlikely(signal_pending_state(state, current))) { | ||
97 | /* | ||
98 | * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() | ||
99 | * must not see us. | ||
100 | */ | ||
101 | list_del_init(&wait->task_list); | ||
102 | ret = -ERESTARTSYS; | ||
103 | } else { | ||
104 | __prepare_to_swait(q, wait); | ||
105 | set_current_state(state); | ||
106 | } | ||
107 | raw_spin_unlock_irqrestore(&q->lock, flags); | ||
96 | 108 | ||
97 | return 0; | 109 | return ret; |
98 | } | 110 | } |
99 | EXPORT_SYMBOL(prepare_to_swait_event); | 111 | EXPORT_SYMBOL(prepare_to_swait_event); |
100 | 112 | ||
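For reference, a hedged sketch of how the renamed simple-wait API reads for callers after this series: the waiter queues exclusively and the waker wakes exactly one of them. The my_wq/my_cond names are placeholders, not anything from this patch set.

	#include <linux/swait.h>

	static DECLARE_SWAIT_QUEUE_HEAD(my_wq);
	static bool my_cond;

	static int my_waiter(void *unused)
	{
		/* sleeps until my_cond is true, queued as an exclusive waiter */
		return swait_event_interruptible_exclusive(my_wq, my_cond);
	}

	static void my_waker(void)
	{
		my_cond = true;
		/* swq_has_sleeper() implies a full barrier before the check */
		if (swq_has_sleeper(&my_wq))
			swake_up_one(&my_wq);	/* wake a single exclusive waiter */
	}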
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 5043e7433f4b..c230c2dd48e1 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu) | |||
238 | 238 | ||
239 | mutex_lock(&smpboot_threads_lock); | 239 | mutex_lock(&smpboot_threads_lock); |
240 | list_for_each_entry(cur, &hotplug_threads, list) | 240 | list_for_each_entry(cur, &hotplug_threads, list) |
241 | if (cpumask_test_cpu(cpu, cur->cpumask)) | 241 | smpboot_unpark_thread(cur, cpu); |
242 | smpboot_unpark_thread(cur, cpu); | ||
243 | mutex_unlock(&smpboot_threads_lock); | 242 | mutex_unlock(&smpboot_threads_lock); |
244 | return 0; | 243 | return 0; |
245 | } | 244 | } |
@@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | |||
280 | } | 279 | } |
281 | 280 | ||
282 | /** | 281 | /** |
283 | * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related | 282 | * smpboot_register_percpu_thread - Register a per_cpu thread related |
284 | * to hotplug | 283 | * to hotplug |
285 | * @plug_thread: Hotplug thread descriptor | 284 | * @plug_thread: Hotplug thread descriptor |
286 | * @cpumask: The cpumask where threads run | ||
287 | * | 285 | * |
288 | * Creates and starts the threads on all online cpus. | 286 | * Creates and starts the threads on all online cpus. |
289 | */ | 287 | */ |
290 | int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, | 288 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) |
291 | const struct cpumask *cpumask) | ||
292 | { | 289 | { |
293 | unsigned int cpu; | 290 | unsigned int cpu; |
294 | int ret = 0; | 291 | int ret = 0; |
295 | 292 | ||
296 | if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) | ||
297 | return -ENOMEM; | ||
298 | cpumask_copy(plug_thread->cpumask, cpumask); | ||
299 | |||
300 | get_online_cpus(); | 293 | get_online_cpus(); |
301 | mutex_lock(&smpboot_threads_lock); | 294 | mutex_lock(&smpboot_threads_lock); |
302 | for_each_online_cpu(cpu) { | 295 | for_each_online_cpu(cpu) { |
303 | ret = __smpboot_create_thread(plug_thread, cpu); | 296 | ret = __smpboot_create_thread(plug_thread, cpu); |
304 | if (ret) { | 297 | if (ret) { |
305 | smpboot_destroy_threads(plug_thread); | 298 | smpboot_destroy_threads(plug_thread); |
306 | free_cpumask_var(plug_thread->cpumask); | ||
307 | goto out; | 299 | goto out; |
308 | } | 300 | } |
309 | if (cpumask_test_cpu(cpu, cpumask)) | 301 | smpboot_unpark_thread(plug_thread, cpu); |
310 | smpboot_unpark_thread(plug_thread, cpu); | ||
311 | } | 302 | } |
312 | list_add(&plug_thread->list, &hotplug_threads); | 303 | list_add(&plug_thread->list, &hotplug_threads); |
313 | out: | 304 | out: |
@@ -315,7 +306,7 @@ out: | |||
315 | put_online_cpus(); | 306 | put_online_cpus(); |
316 | return ret; | 307 | return ret; |
317 | } | 308 | } |
318 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); | 309 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); |
319 | 310 | ||
320 | /** | 311 | /** |
321 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug | 312 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug |
@@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
331 | smpboot_destroy_threads(plug_thread); | 322 | smpboot_destroy_threads(plug_thread); |
332 | mutex_unlock(&smpboot_threads_lock); | 323 | mutex_unlock(&smpboot_threads_lock); |
333 | put_online_cpus(); | 324 | put_online_cpus(); |
334 | free_cpumask_var(plug_thread->cpumask); | ||
335 | } | 325 | } |
336 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 326 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); |
337 | 327 | ||
338 | /** | ||
339 | * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked | ||
340 | * @plug_thread: Hotplug thread descriptor | ||
341 | * @new: Revised mask to use | ||
342 | * | ||
343 | * The cpumask field in the smp_hotplug_thread must not be updated directly | ||
344 | * by the client, but only by calling this function. | ||
345 | * This function can only be called on a registered smp_hotplug_thread. | ||
346 | */ | ||
347 | void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, | ||
348 | const struct cpumask *new) | ||
349 | { | ||
350 | struct cpumask *old = plug_thread->cpumask; | ||
351 | static struct cpumask tmp; | ||
352 | unsigned int cpu; | ||
353 | |||
354 | lockdep_assert_cpus_held(); | ||
355 | mutex_lock(&smpboot_threads_lock); | ||
356 | |||
357 | /* Park threads that were exclusively enabled on the old mask. */ | ||
358 | cpumask_andnot(&tmp, old, new); | ||
359 | for_each_cpu_and(cpu, &tmp, cpu_online_mask) | ||
360 | smpboot_park_thread(plug_thread, cpu); | ||
361 | |||
362 | /* Unpark threads that are exclusively enabled on the new mask. */ | ||
363 | cpumask_andnot(&tmp, new, old); | ||
364 | for_each_cpu_and(cpu, &tmp, cpu_online_mask) | ||
365 | smpboot_unpark_thread(plug_thread, cpu); | ||
366 | |||
367 | cpumask_copy(old, new); | ||
368 | |||
369 | mutex_unlock(&smpboot_threads_lock); | ||
370 | } | ||
371 | |||
372 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); | 328 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); |
373 | 329 | ||
374 | /* | 330 | /* |
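With the cpumask variant gone, registering a per-CPU hotplug thread reduces to filling in the callbacks and calling smpboot_register_percpu_thread(). A hedged sketch with placeholder example_* names, relying only on the struct fields that appear elsewhere in this diff:

	#include <linux/init.h>
	#include <linux/percpu.h>
	#include <linux/smpboot.h>

	static DEFINE_PER_CPU(struct task_struct *, example_task);
	static DEFINE_PER_CPU(unsigned int, example_pending);

	static int example_should_run(unsigned int cpu)
	{
		return this_cpu_read(example_pending);
	}

	static void example_fn(unsigned int cpu)
	{
		this_cpu_write(example_pending, 0);
		/* per-CPU work would go here */
	}

	static struct smp_hotplug_thread example_threads = {
		.store			= &example_task,
		.thread_should_run	= example_should_run,
		.thread_fn		= example_fn,
		.thread_comm		= "example/%u",
	};

	static int __init example_init(void)
	{
		/* creates and unparks one "example/N" thread per online CPU */
		return smpboot_register_percpu_thread(&example_threads);
	}
	device_initcall(example_init);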
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 69eb76daed34..067cb83f37ea 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | |||
238 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); | 238 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); |
239 | DEFINE_WAKE_Q(wakeq); | 239 | DEFINE_WAKE_Q(wakeq); |
240 | int err; | 240 | int err; |
241 | |||
241 | retry: | 242 | retry: |
243 | /* | ||
244 | * The waking up of stopper threads has to happen in the same | ||
245 | * scheduling context as the queueing. Otherwise, there is a | ||
246 | * possibility of one of the above stoppers being woken up by another | ||
247 | * CPU, and preempting us. In that case we would never wake up the | ||
248 | * other stopper. | ||
249 | */ | ||
250 | preempt_disable(); | ||
242 | raw_spin_lock_irq(&stopper1->lock); | 251 | raw_spin_lock_irq(&stopper1->lock); |
243 | raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); | 252 | raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); |
244 | 253 | ||
245 | err = -ENOENT; | 254 | if (!stopper1->enabled || !stopper2->enabled) { |
246 | if (!stopper1->enabled || !stopper2->enabled) | 255 | err = -ENOENT; |
247 | goto unlock; | 256 | goto unlock; |
257 | } | ||
258 | |||
248 | /* | 259 | /* |
249 | * Ensure that if we race with __stop_cpus() the stoppers won't get | 260 | * Ensure that if we race with __stop_cpus() the stoppers won't get |
250 | * queued up in reverse order leading to system deadlock. | 261 | * queued up in reverse order leading to system deadlock. |
@@ -255,36 +266,30 @@ retry: | |||
255 | * It can be falsely true but it is safe to spin until it is cleared, | 266 | * It can be falsely true but it is safe to spin until it is cleared, |
256 | * queue_stop_cpus_work() does everything under preempt_disable(). | 267 | * queue_stop_cpus_work() does everything under preempt_disable(). |
257 | */ | 268 | */ |
258 | err = -EDEADLK; | 269 | if (unlikely(stop_cpus_in_progress)) { |
259 | if (unlikely(stop_cpus_in_progress)) | 270 | err = -EDEADLK; |
260 | goto unlock; | 271 | goto unlock; |
272 | } | ||
261 | 273 | ||
262 | err = 0; | 274 | err = 0; |
263 | __cpu_stop_queue_work(stopper1, work1, &wakeq); | 275 | __cpu_stop_queue_work(stopper1, work1, &wakeq); |
264 | __cpu_stop_queue_work(stopper2, work2, &wakeq); | 276 | __cpu_stop_queue_work(stopper2, work2, &wakeq); |
265 | /* | 277 | |
266 | * The waking up of stopper threads has to happen | ||
267 | * in the same scheduling context as the queueing. | ||
268 | * Otherwise, there is a possibility of one of the | ||
269 | * above stoppers being woken up by another CPU, | ||
270 | * and preempting us. This will cause us to n ot | ||
271 | * wake up the other stopper forever. | ||
272 | */ | ||
273 | preempt_disable(); | ||
274 | unlock: | 278 | unlock: |
275 | raw_spin_unlock(&stopper2->lock); | 279 | raw_spin_unlock(&stopper2->lock); |
276 | raw_spin_unlock_irq(&stopper1->lock); | 280 | raw_spin_unlock_irq(&stopper1->lock); |
277 | 281 | ||
278 | if (unlikely(err == -EDEADLK)) { | 282 | if (unlikely(err == -EDEADLK)) { |
283 | preempt_enable(); | ||
284 | |||
279 | while (stop_cpus_in_progress) | 285 | while (stop_cpus_in_progress) |
280 | cpu_relax(); | 286 | cpu_relax(); |
287 | |||
281 | goto retry; | 288 | goto retry; |
282 | } | 289 | } |
283 | 290 | ||
284 | if (!err) { | 291 | wake_up_q(&wakeq); |
285 | wake_up_q(&wakeq); | 292 | preempt_enable(); |
286 | preempt_enable(); | ||
287 | } | ||
288 | 293 | ||
289 | return err; | 294 | return err; |
290 | } | 295 | } |
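The reflowed function above pins the pattern down to: disable preemption, queue both works under the nested raw locks, drop the locks, then perform the wakeups before re-enabling preemption. Stripped of the stopper-specific details, the shape is roughly as follows (lock and task names are placeholders, and wake_q_add() stands in for __cpu_stop_queue_work()):

	#include <linux/preempt.h>
	#include <linux/sched/wake_q.h>
	#include <linux/spinlock.h>

	static void queue_two_and_wake(raw_spinlock_t *a, raw_spinlock_t *b,
				       struct task_struct *t1, struct task_struct *t2)
	{
		DEFINE_WAKE_Q(wakeq);

		preempt_disable();
		raw_spin_lock_irq(a);
		raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);

		wake_q_add(&wakeq, t1);		/* queue work for both targets */
		wake_q_add(&wakeq, t2);

		raw_spin_unlock(b);
		raw_spin_unlock_irq(a);

		wake_up_q(&wakeq);		/* still non-preemptible here */
		preempt_enable();
	}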
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d9837c0aff4..f22f76b7a138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = { | |||
368 | .mode = 0644, | 368 | .mode = 0644, |
369 | .proc_handler = proc_dointvec, | 369 | .proc_handler = proc_dointvec, |
370 | }, | 370 | }, |
371 | { | ||
372 | .procname = "sched_time_avg_ms", | ||
373 | .data = &sysctl_sched_time_avg, | ||
374 | .maxlen = sizeof(unsigned int), | ||
375 | .mode = 0644, | ||
376 | .proc_handler = proc_dointvec_minmax, | ||
377 | .extra1 = &one, | ||
378 | }, | ||
379 | #ifdef CONFIG_SCHEDSTATS | 371 | #ifdef CONFIG_SCHEDSTATS |
380 | { | 372 | { |
381 | .procname = "sched_schedstats", | 373 | .procname = "sched_schedstats", |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 576d18045811..5470dce212c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -18,18 +18,14 @@ | |||
18 | #include <linux/init.h> | 18 | #include <linux/init.h> |
19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
20 | #include <linux/sysctl.h> | 20 | #include <linux/sysctl.h> |
21 | #include <linux/smpboot.h> | ||
22 | #include <linux/sched/rt.h> | ||
23 | #include <uapi/linux/sched/types.h> | ||
24 | #include <linux/tick.h> | 21 | #include <linux/tick.h> |
25 | #include <linux/workqueue.h> | ||
26 | #include <linux/sched/clock.h> | 22 | #include <linux/sched/clock.h> |
27 | #include <linux/sched/debug.h> | 23 | #include <linux/sched/debug.h> |
28 | #include <linux/sched/isolation.h> | 24 | #include <linux/sched/isolation.h> |
25 | #include <linux/stop_machine.h> | ||
29 | 26 | ||
30 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
31 | #include <linux/kvm_para.h> | 28 | #include <linux/kvm_para.h> |
32 | #include <linux/kthread.h> | ||
33 | 29 | ||
34 | static DEFINE_MUTEX(watchdog_mutex); | 30 | static DEFINE_MUTEX(watchdog_mutex); |
35 | 31 | ||
@@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void) | |||
169 | unsigned int __read_mostly softlockup_panic = | 165 | unsigned int __read_mostly softlockup_panic = |
170 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | 166 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; |
171 | 167 | ||
172 | static bool softlockup_threads_initialized __read_mostly; | 168 | static bool softlockup_initialized __read_mostly; |
173 | static u64 __read_mostly sample_period; | 169 | static u64 __read_mostly sample_period; |
174 | 170 | ||
175 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 171 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
176 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
177 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); | 172 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); |
178 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); | 173 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); |
179 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 174 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
@@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void) | |||
335 | __this_cpu_inc(hrtimer_interrupts); | 330 | __this_cpu_inc(hrtimer_interrupts); |
336 | } | 331 | } |
337 | 332 | ||
333 | static DEFINE_PER_CPU(struct completion, softlockup_completion); | ||
334 | static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); | ||
335 | |||
336 | /* | ||
337 | * The watchdog thread function - touches the timestamp. | ||
338 | * | ||
339 | * It only runs once every sample_period seconds (4 seconds by | ||
340 | * default) to reset the softlockup timestamp. If this gets delayed | ||
341 | * for more than 2*watchdog_thresh seconds then the debug-printout | ||
342 | * triggers in watchdog_timer_fn(). | ||
343 | */ | ||
344 | static int softlockup_fn(void *data) | ||
345 | { | ||
346 | __this_cpu_write(soft_lockup_hrtimer_cnt, | ||
347 | __this_cpu_read(hrtimer_interrupts)); | ||
348 | __touch_watchdog(); | ||
349 | complete(this_cpu_ptr(&softlockup_completion)); | ||
350 | |||
351 | return 0; | ||
352 | } | ||
353 | |||
338 | /* watchdog kicker functions */ | 354 | /* watchdog kicker functions */ |
339 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 355 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
340 | { | 356 | { |
@@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
350 | watchdog_interrupt_count(); | 366 | watchdog_interrupt_count(); |
351 | 367 | ||
352 | /* kick the softlockup detector */ | 368 | /* kick the softlockup detector */ |
353 | wake_up_process(__this_cpu_read(softlockup_watchdog)); | 369 | if (completion_done(this_cpu_ptr(&softlockup_completion))) { |
370 | reinit_completion(this_cpu_ptr(&softlockup_completion)); | ||
371 | stop_one_cpu_nowait(smp_processor_id(), | ||
372 | softlockup_fn, NULL, | ||
373 | this_cpu_ptr(&softlockup_stop_work)); | ||
374 | } | ||
354 | 375 | ||
355 | /* .. and repeat */ | 376 | /* .. and repeat */ |
356 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); | 377 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); |
@@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
448 | return HRTIMER_RESTART; | 469 | return HRTIMER_RESTART; |
449 | } | 470 | } |
450 | 471 | ||
451 | static void watchdog_set_prio(unsigned int policy, unsigned int prio) | ||
452 | { | ||
453 | struct sched_param param = { .sched_priority = prio }; | ||
454 | |||
455 | sched_setscheduler(current, policy, ¶m); | ||
456 | } | ||
457 | |||
458 | static void watchdog_enable(unsigned int cpu) | 472 | static void watchdog_enable(unsigned int cpu) |
459 | { | 473 | { |
460 | struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); | 474 | struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); |
475 | struct completion *done = this_cpu_ptr(&softlockup_completion); | ||
476 | |||
477 | WARN_ON_ONCE(cpu != smp_processor_id()); | ||
478 | |||
479 | init_completion(done); | ||
480 | complete(done); | ||
461 | 481 | ||
462 | /* | 482 | /* |
463 | * Start the timer first to prevent the NMI watchdog triggering | 483 | * Start the timer first to prevent the NMI watchdog triggering |
@@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu) | |||
473 | /* Enable the perf event */ | 493 | /* Enable the perf event */ |
474 | if (watchdog_enabled & NMI_WATCHDOG_ENABLED) | 494 | if (watchdog_enabled & NMI_WATCHDOG_ENABLED) |
475 | watchdog_nmi_enable(cpu); | 495 | watchdog_nmi_enable(cpu); |
476 | |||
477 | watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); | ||
478 | } | 496 | } |
479 | 497 | ||
480 | static void watchdog_disable(unsigned int cpu) | 498 | static void watchdog_disable(unsigned int cpu) |
481 | { | 499 | { |
482 | struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); | 500 | struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); |
483 | 501 | ||
484 | watchdog_set_prio(SCHED_NORMAL, 0); | 502 | WARN_ON_ONCE(cpu != smp_processor_id()); |
503 | |||
485 | /* | 504 | /* |
486 | * Disable the perf event first. That prevents that a large delay | 505 | * Disable the perf event first. That prevents that a large delay |
487 | * between disabling the timer and disabling the perf event causes | 506 | * between disabling the timer and disabling the perf event causes |
@@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu) | |||
489 | */ | 508 | */ |
490 | watchdog_nmi_disable(cpu); | 509 | watchdog_nmi_disable(cpu); |
491 | hrtimer_cancel(hrtimer); | 510 | hrtimer_cancel(hrtimer); |
511 | wait_for_completion(this_cpu_ptr(&softlockup_completion)); | ||
492 | } | 512 | } |
493 | 513 | ||
494 | static void watchdog_cleanup(unsigned int cpu, bool online) | 514 | static int softlockup_stop_fn(void *data) |
495 | { | 515 | { |
496 | watchdog_disable(cpu); | 516 | watchdog_disable(smp_processor_id()); |
517 | return 0; | ||
497 | } | 518 | } |
498 | 519 | ||
499 | static int watchdog_should_run(unsigned int cpu) | 520 | static void softlockup_stop_all(void) |
500 | { | 521 | { |
501 | return __this_cpu_read(hrtimer_interrupts) != | 522 | int cpu; |
502 | __this_cpu_read(soft_lockup_hrtimer_cnt); | 523 | |
524 | if (!softlockup_initialized) | ||
525 | return; | ||
526 | |||
527 | for_each_cpu(cpu, &watchdog_allowed_mask) | ||
528 | smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); | ||
529 | |||
530 | cpumask_clear(&watchdog_allowed_mask); | ||
503 | } | 531 | } |
504 | 532 | ||
505 | /* | 533 | static int softlockup_start_fn(void *data) |
506 | * The watchdog thread function - touches the timestamp. | ||
507 | * | ||
508 | * It only runs once every sample_period seconds (4 seconds by | ||
509 | * default) to reset the softlockup timestamp. If this gets delayed | ||
510 | * for more than 2*watchdog_thresh seconds then the debug-printout | ||
511 | * triggers in watchdog_timer_fn(). | ||
512 | */ | ||
513 | static void watchdog(unsigned int cpu) | ||
514 | { | 534 | { |
515 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 535 | watchdog_enable(smp_processor_id()); |
516 | __this_cpu_read(hrtimer_interrupts)); | 536 | return 0; |
517 | __touch_watchdog(); | ||
518 | } | 537 | } |
519 | 538 | ||
520 | static struct smp_hotplug_thread watchdog_threads = { | 539 | static void softlockup_start_all(void) |
521 | .store = &softlockup_watchdog, | ||
522 | .thread_should_run = watchdog_should_run, | ||
523 | .thread_fn = watchdog, | ||
524 | .thread_comm = "watchdog/%u", | ||
525 | .setup = watchdog_enable, | ||
526 | .cleanup = watchdog_cleanup, | ||
527 | .park = watchdog_disable, | ||
528 | .unpark = watchdog_enable, | ||
529 | }; | ||
530 | |||
531 | static void softlockup_update_smpboot_threads(void) | ||
532 | { | 540 | { |
533 | lockdep_assert_held(&watchdog_mutex); | 541 | int cpu; |
534 | |||
535 | if (!softlockup_threads_initialized) | ||
536 | return; | ||
537 | 542 | ||
538 | smpboot_update_cpumask_percpu_thread(&watchdog_threads, | 543 | cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); |
539 | &watchdog_allowed_mask); | 544 | for_each_cpu(cpu, &watchdog_allowed_mask) |
545 | smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); | ||
540 | } | 546 | } |
541 | 547 | ||
542 | /* Temporarily park all watchdog threads */ | 548 | int lockup_detector_online_cpu(unsigned int cpu) |
543 | static void softlockup_park_all_threads(void) | ||
544 | { | 549 | { |
545 | cpumask_clear(&watchdog_allowed_mask); | 550 | watchdog_enable(cpu); |
546 | softlockup_update_smpboot_threads(); | 551 | return 0; |
547 | } | 552 | } |
548 | 553 | ||
549 | /* Unpark enabled threads */ | 554 | int lockup_detector_offline_cpu(unsigned int cpu) |
550 | static void softlockup_unpark_threads(void) | ||
551 | { | 555 | { |
552 | cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); | 556 | watchdog_disable(cpu); |
553 | softlockup_update_smpboot_threads(); | 557 | return 0; |
554 | } | 558 | } |
555 | 559 | ||
556 | static void lockup_detector_reconfigure(void) | 560 | static void lockup_detector_reconfigure(void) |
557 | { | 561 | { |
558 | cpus_read_lock(); | 562 | cpus_read_lock(); |
559 | watchdog_nmi_stop(); | 563 | watchdog_nmi_stop(); |
560 | softlockup_park_all_threads(); | 564 | |
565 | softlockup_stop_all(); | ||
561 | set_sample_period(); | 566 | set_sample_period(); |
562 | lockup_detector_update_enable(); | 567 | lockup_detector_update_enable(); |
563 | if (watchdog_enabled && watchdog_thresh) | 568 | if (watchdog_enabled && watchdog_thresh) |
564 | softlockup_unpark_threads(); | 569 | softlockup_start_all(); |
570 | |||
565 | watchdog_nmi_start(); | 571 | watchdog_nmi_start(); |
566 | cpus_read_unlock(); | 572 | cpus_read_unlock(); |
567 | /* | 573 | /* |
@@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void) | |||
580 | */ | 586 | */ |
581 | static __init void lockup_detector_setup(void) | 587 | static __init void lockup_detector_setup(void) |
582 | { | 588 | { |
583 | int ret; | ||
584 | |||
585 | /* | 589 | /* |
586 | * If sysctl is off and watchdog got disabled on the command line, | 590 | * If sysctl is off and watchdog got disabled on the command line, |
587 | * nothing to do here. | 591 | * nothing to do here. |
@@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void) | |||
592 | !(watchdog_enabled && watchdog_thresh)) | 596 | !(watchdog_enabled && watchdog_thresh)) |
593 | return; | 597 | return; |
594 | 598 | ||
595 | ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, | ||
596 | &watchdog_allowed_mask); | ||
597 | if (ret) { | ||
598 | pr_err("Failed to initialize soft lockup detector threads\n"); | ||
599 | return; | ||
600 | } | ||
601 | |||
602 | mutex_lock(&watchdog_mutex); | 599 | mutex_lock(&watchdog_mutex); |
603 | softlockup_threads_initialized = true; | ||
604 | lockup_detector_reconfigure(); | 600 | lockup_detector_reconfigure(); |
601 | softlockup_initialized = true; | ||
605 | mutex_unlock(&watchdog_mutex); | 602 | mutex_unlock(&watchdog_mutex); |
606 | } | 603 | } |
607 | 604 | ||
608 | #else /* CONFIG_SOFTLOCKUP_DETECTOR */ | 605 | #else /* CONFIG_SOFTLOCKUP_DETECTOR */ |
609 | static inline int watchdog_park_threads(void) { return 0; } | ||
610 | static inline void watchdog_unpark_threads(void) { } | ||
611 | static inline int watchdog_enable_all_cpus(void) { return 0; } | ||
612 | static inline void watchdog_disable_all_cpus(void) { } | ||
613 | static void lockup_detector_reconfigure(void) | 606 | static void lockup_detector_reconfigure(void) |
614 | { | 607 | { |
615 | cpus_read_lock(); | 608 | cpus_read_lock(); |
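The net effect of the watchdog rework is visible above: instead of parking and unparking dedicated smpboot threads, each CPU's hrtimer now fires a one-shot stopper callback, and a per-CPU completion throttles re-arming. A hedged, stripped-down sketch of that pattern with placeholder example_* names:

	#include <linux/completion.h>
	#include <linux/percpu.h>
	#include <linux/smp.h>
	#include <linux/stop_machine.h>

	static DEFINE_PER_CPU(struct completion, example_done);
	static DEFINE_PER_CPU(struct cpu_stop_work, example_work);

	static void example_init_cpu(void)	/* once per CPU, like watchdog_enable() */
	{
		init_completion(this_cpu_ptr(&example_done));
		complete(this_cpu_ptr(&example_done));
	}

	static int example_fn(void *data)
	{
		/* runs in stopper context, ahead of any RT or CFS task */
		complete(this_cpu_ptr(&example_done));
		return 0;
	}

	static void example_kick(void)		/* e.g. from a per-CPU hrtimer */
	{
		/* skip the kick while the previous callback is still in flight */
		if (completion_done(this_cpu_ptr(&example_done))) {
			reinit_completion(this_cpu_ptr(&example_done));
			stop_one_cpu_nowait(smp_processor_id(), example_fn, NULL,
					    this_cpu_ptr(&example_work));
		}
	}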
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index e449a23e9d59..1f7020d65d0a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c | |||
@@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void) | |||
175 | evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, | 175 | evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, |
176 | watchdog_overflow_callback, NULL); | 176 | watchdog_overflow_callback, NULL); |
177 | if (IS_ERR(evt)) { | 177 | if (IS_ERR(evt)) { |
178 | pr_info("Perf event create on CPU %d failed with %ld\n", cpu, | 178 | pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, |
179 | PTR_ERR(evt)); | 179 | PTR_ERR(evt)); |
180 | return PTR_ERR(evt); | 180 | return PTR_ERR(evt); |
181 | } | 181 | } |
182 | this_cpu_write(watchdog_ev, evt); | 182 | this_cpu_write(watchdog_ev, evt); |
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 04e554cae3a2..108250e4d376 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c | |||
@@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm) | |||
604 | 604 | ||
605 | kvm_for_each_vcpu(i, vcpu, kvm) { | 605 | kvm_for_each_vcpu(i, vcpu, kvm) { |
606 | vcpu->arch.pause = false; | 606 | vcpu->arch.pause = false; |
607 | swake_up(kvm_arch_vcpu_wq(vcpu)); | 607 | swake_up_one(kvm_arch_vcpu_wq(vcpu)); |
608 | } | 608 | } |
609 | } | 609 | } |
610 | 610 | ||
@@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu) | |||
612 | { | 612 | { |
613 | struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); | 613 | struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); |
614 | 614 | ||
615 | swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && | 615 | swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) && |
616 | (!vcpu->arch.pause))); | 616 | (!vcpu->arch.pause))); |
617 | 617 | ||
618 | if (vcpu->arch.power_off || vcpu->arch.pause) { | 618 | if (vcpu->arch.power_off || vcpu->arch.pause) { |
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index c95ab4c5a475..9b73d3ad918a 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c | |||
@@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) | |||
155 | smp_mb(); /* Make sure the above is visible */ | 155 | smp_mb(); /* Make sure the above is visible */ |
156 | 156 | ||
157 | wq = kvm_arch_vcpu_wq(vcpu); | 157 | wq = kvm_arch_vcpu_wq(vcpu); |
158 | swake_up(wq); | 158 | swake_up_one(wq); |
159 | 159 | ||
160 | return PSCI_RET_SUCCESS; | 160 | return PSCI_RET_SUCCESS; |
161 | } | 161 | } |
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 57bcb27dcf30..23c2519c5b32 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
@@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work) | |||
107 | trace_kvm_async_pf_completed(addr, gva); | 107 | trace_kvm_async_pf_completed(addr, gva); |
108 | 108 | ||
109 | if (swq_has_sleeper(&vcpu->wq)) | 109 | if (swq_has_sleeper(&vcpu->wq)) |
110 | swake_up(&vcpu->wq); | 110 | swake_up_one(&vcpu->wq); |
111 | 111 | ||
112 | mmput(mm); | 112 | mmput(mm); |
113 | kvm_put_kvm(vcpu->kvm); | 113 | kvm_put_kvm(vcpu->kvm); |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8b47507faab5..3d233ebfbee9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) | |||
2172 | kvm_arch_vcpu_blocking(vcpu); | 2172 | kvm_arch_vcpu_blocking(vcpu); |
2173 | 2173 | ||
2174 | for (;;) { | 2174 | for (;;) { |
2175 | prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); | 2175 | prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); |
2176 | 2176 | ||
2177 | if (kvm_vcpu_check_block(vcpu) < 0) | 2177 | if (kvm_vcpu_check_block(vcpu) < 0) |
2178 | break; | 2178 | break; |
@@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) | |||
2214 | 2214 | ||
2215 | wqp = kvm_arch_vcpu_wq(vcpu); | 2215 | wqp = kvm_arch_vcpu_wq(vcpu); |
2216 | if (swq_has_sleeper(wqp)) { | 2216 | if (swq_has_sleeper(wqp)) { |
2217 | swake_up(wqp); | 2217 | swake_up_one(wqp); |
2218 | ++vcpu->stat.halt_wakeup; | 2218 | ++vcpu->stat.halt_wakeup; |
2219 | return true; | 2219 | return true; |
2220 | } | 2220 | } |