path: root/kernel/sched
author    Ingo Molnar <mingo@kernel.org>  2018-04-05 03:20:34 -0400
committer Ingo Molnar <mingo@kernel.org>  2018-04-05 03:20:34 -0400
commit    ea2a6af517714c52a1209795a03e863e96b460bb (patch)
tree      3bd443bc9b23ceeaf3743eaf2d6d35ec63c620c9 /kernel/sched
parent    1b5d43cfb69759d8ef8d30469cea31d0c037aed5 (diff)
parent    642e7fd23353e22290e3d51719fcb658dc252342 (diff)
Merge branch 'linus' into sched/urgent, to pick up fixes and updates
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile             |    5
-rw-r--r--  kernel/sched/autogroup.c          |   21
-rw-r--r--  kernel/sched/autogroup.h          |   12
-rw-r--r--  kernel/sched/clock.c              |   36
-rw-r--r--  kernel/sched/completion.c         |   11
-rw-r--r--  kernel/sched/core.c               |  194
-rw-r--r--  kernel/sched/cpuacct.c            |   33
-rw-r--r--  kernel/sched/cpudeadline.c        |   23
-rw-r--r--  kernel/sched/cpudeadline.h        |   29
-rw-r--r--  kernel/sched/cpufreq.c            |    1
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  |  221
-rw-r--r--  kernel/sched/cpupri.c             |   15
-rw-r--r--  kernel/sched/cpupri.h             |   25
-rw-r--r--  kernel/sched/cputime.c            |   58
-rw-r--r--  kernel/sched/deadline.c           |   82
-rw-r--r--  kernel/sched/debug.c              |  103
-rw-r--r--  kernel/sched/fair.c               | 1415
-rw-r--r--  kernel/sched/features.h           |    5
-rw-r--r--  kernel/sched/idle.c               |  142
-rw-r--r--  kernel/sched/idle_task.c          |  110
-rw-r--r--  kernel/sched/isolation.c          |   14
-rw-r--r--  kernel/sched/loadavg.c            |   34
-rw-r--r--  kernel/sched/membarrier.c         |   27
-rw-r--r--  kernel/sched/rt.c                 |   60
-rw-r--r--  kernel/sched/sched.h              |  650
-rw-r--r--  kernel/sched/stats.c              |   20
-rw-r--r--  kernel/sched/stats.h              |   86
-rw-r--r--  kernel/sched/stop_task.c          |   11
-rw-r--r--  kernel/sched/swait.c              |    6
-rw-r--r--  kernel/sched/topology.c           |   46
-rw-r--r--  kernel/sched/wait.c               |   13
-rw-r--r--  kernel/sched/wait_bit.c           |  127
32 files changed, 2055 insertions, 1580 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/utsname.h>
-#include <linux/security.h>
-#include <linux/export.h>
-
+/*
+ * Auto-group scheduling implementation:
+ */
 #include "sched.h"
 
 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	autogroup_kref_put(prev);
 }
 
-/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
 void sched_autogroup_create_attach(struct task_struct *p)
 {
 	struct autogroup *ag = autogroup_create();
 
 	autogroup_move_group(p, ag);
-	/* drop extra reference added by autogroup_create() */
+
+	/* Drop extra reference added by autogroup_create(): */
 	autogroup_kref_put(ag);
 }
 EXPORT_SYMBOL(sched_autogroup_create_attach);
 
-/* Cannot be called under siglock. Currently has no users */
+/* Cannot be called under siglock. Currently has no users: */
 void sched_autogroup_detach(struct task_struct *p)
 {
 	autogroup_move_group(p, &autogroup_default);
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
 
 	return 1;
 }
-
 __setup("noautogroup", setup_autogroup);
 
 #ifdef CONFIG_PROC_FS
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 	if (nice < 0 && !can_nice(current, nice))
 		return -EPERM;
 
-	/* this is a heavy operation taking global locks.. */
+	/* This is a heavy operation, taking global locks.. */
 	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
 		return -EAGAIN;
 
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
 
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
-#endif /* CONFIG_SCHED_DEBUG */
+#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifdef CONFIG_SCHED_AUTOGROUP
 
-#include <linux/kref.h>
-#include <linux/rwsem.h>
-#include <linux/sched/autogroup.h>
-
 struct autogroup {
 	/*
-	 * reference doesn't mean how many thread attach to this
-	 * autogroup now. It just stands for the number of task
-	 * could use this autogroup.
+	 * Reference doesn't mean how many threads attach to this
+	 * autogroup now. It just stands for the number of tasks
+	 * which could use this autogroup.
 	 */
 	struct kref kref;
 	struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
 	return tg;
 }
 
-#ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
 	return 0;
 }
-#endif
 
 #endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
 /*
- * sched_clock for unstable cpu clocks
+ * sched_clock() for unstable CPU clocks
 *
 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
 *
@@ -11,7 +11,7 @@
 * Guillaume Chazarain <guichaz@gmail.com>
 *
 *
- * What:
+ * What this file implements:
 *
 * cpu_clock(i) provides a fast (execution time) high resolution
 * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
 * at 0 on boot (but people really shouldn't rely on that).
 *
 * cpu_clock(i) -- can be used from any context, including NMI.
- * local_clock() -- is cpu_clock() on the current cpu.
+ * local_clock() -- is cpu_clock() on the current CPU.
 *
 * sched_clock_cpu(i)
 *
- * How:
+ * How it is implemented:
 *
 * The implementation either uses sched_clock() when
 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
 * that is otherwise invisible (TSC gets stopped).
 *
 */
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/export.h>
-#include <linux/percpu.h>
-#include <linux/ktime.h>
-#include <linux/sched.h>
-#include <linux/nmi.h>
-#include <linux/sched/clock.h>
-#include <linux/static_key.h>
-#include <linux/workqueue.h>
-#include <linux/compiler.h>
-#include <linux/tick.h>
-#include <linux/init.h>
+#include "sched.h"
 
 /*
 * Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ again:
	 * cmpxchg64 below only protects one readout.
	 *
	 * We must reread via sched_clock_local() in the retry case on
-	 * 32bit as an NMI could use sched_clock_local() via the
+	 * 32-bit kernels as an NMI could use sched_clock_local() via the
	 * tracer and hit between the readout of
-	 * the low32bit and the high 32bit portion.
+	 * the low 32-bit and the high 32-bit portion.
	 */
	this_clock = sched_clock_local(my_scd);
	/*
-	 * We must enforce atomic readout on 32bit, otherwise the
-	 * update on the remote cpu can hit inbetween the readout of
-	 * the low32bit and the high 32bit portion.
+	 * We must enforce atomic readout on 32-bit, otherwise the
+	 * update on the remote CPU can hit inbetween the readout of
+	 * the low 32-bit and the high 32-bit portion.
	 */
	remote_clock = cmpxchg64(&scd->clock, 0, 0);
 #else
	/*
-	 * On 64bit the read of [my]scd->clock is atomic versus the
-	 * update, so we can avoid the above 32bit dance.
+	 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32-bit dance.
	 */
	sched_clock_local(my_scd);
 again:
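Note: the cmpxchg64(&scd->clock, 0, 0) call kept in the hunk above is the usual idiom for an atomic 64-bit read on 32-bit kernels: comparing-and-exchanging 0 with 0 never stores a new value, but returns the current contents in a single atomic operation. A minimal illustrative sketch, not part of this merge (the helper name is made up; the scd/clock names follow the code above):

	/* Illustrative only: atomic 64-bit read of scd->clock on 32-bit. */
	static u64 scd_clock_read_atomic(struct sched_clock_data *scd)
	{
		/*
		 * cmpxchg64(ptr, old, new) stores 'new' only if *ptr == 'old',
		 * so exchanging 0 for 0 leaves *ptr untouched while returning
		 * its current value atomically.
		 */
		return cmpxchg64(&scd->clock, 0, 0);
	}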
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..e426b0cb9ac6 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
 * typically be used for exclusion which gives rise to priority inversion.
 * Waiting for completion is a typically sync point, but not an exclusion point.
 */
-
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/completion.h>
+#include "sched.h"
 
 /**
 * complete: - signals a single thread waiting on this completion
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
 bool try_wait_for_completion(struct completion *x)
 {
 	unsigned long flags;
-	int ret = 1;
+	bool ret = true;
 
 	/*
	 * Since x->done will need to be locked only
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
	 * return early in the blocking case.
	 */
 	if (!READ_ONCE(x->done))
-		return 0;
+		return false;
 
 	spin_lock_irqsave(&x->wait.lock, flags);
 	if (!x->done)
-		ret = 0;
+		ret = false;
 	else if (x->done != UINT_MAX)
 		x->done--;
 	spin_unlock_irqrestore(&x->wait.lock, flags);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c94895bc5a2c..28b68995a417 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 */
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/sched/loadavg.h>
-#include <linux/sched/hotplug.h>
-#include <linux/wait_bit.h>
-#include <linux/cpuset.h>
-#include <linux/delayacct.h>
-#include <linux/init_task.h>
-#include <linux/context_tracking.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/compat.h>
-
-#include <linux/blkdev.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/prefetch.h>
-#include <linux/profile.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
 
-#include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	 * [L] ->on_rq
	 *	RELEASE (rq->lock)
	 *
-	 * If we observe the old cpu in task_rq_lock, the acquire of
+	 * If we observe the old CPU in task_rq_lock, the acquire of
	 * the old rq->lock will fully serialize against the stores.
	 *
	 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 }
 #endif /* CONFIG_SMP */
 
-static void init_rq_hrtick(struct rq *rq)
+static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 	rq->hrtick_csd_pending = 0;
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void init_rq_hrtick(struct rq *rq)
+static inline void hrtick_rq_init(struct rq *rq)
 {
 }
 #endif /* CONFIG_SCHED_HRTICK */
@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
 {
 	int cpu = smp_processor_id();
 
-	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
 		return false;
 
 	if (idle_cpu(cpu) && !need_resched())
@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
-	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
 	return false;
 }
 
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
 *
 * - cpu_active must be a subset of cpu_online
 *
- * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *   see __set_cpus_allowed_ptr(). At this point the newly online
 *   CPU isn't yet part of the sched domains, and balancing will not
 *   see it.
@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
-static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
 
 void preempt_notifier_inc(void)
 {
-	static_key_slow_inc(&preempt_notifier_key);
+	static_branch_inc(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
 
 void preempt_notifier_dec(void)
 {
-	static_key_slow_dec(&preempt_notifier_key);
+	static_branch_dec(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
 
@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
 */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
-	if (!static_key_false(&preempt_notifier_key))
+	if (!static_branch_unlikely(&preempt_notifier_key))
 		WARN(1, "registering preempt_notifier while notifiers disabled\n");
 
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
 
 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
-	if (static_key_false(&preempt_notifier_key))
+	if (static_branch_unlikely(&preempt_notifier_key))
 		__fire_sched_in_preempt_notifiers(curr);
 }
 
@@ -2555,7 +2529,7 @@ static __always_inline void
 fire_sched_out_preempt_notifiers(struct task_struct *curr,
 				 struct task_struct *next)
 {
-	if (static_key_false(&preempt_notifier_key))
+	if (static_branch_unlikely(&preempt_notifier_key))
 		__fire_sched_out_preempt_notifiers(curr, next);
 }
 
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
 	raw_spin_unlock_irq(&rq->lock);
 }
 
+/*
+ * NOP if the arch has not defined these:
+ */
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch()	do { } while (0)
+#endif
+
 /**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
	/*
-	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * 64-bit doesn't need locks to atomically read a 64-bit value.
	 * So we have a optimization chance when the task's delta_exec is 0.
	 * Reading ->on_cpu is racy, but this is ok.
	 *
@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq);
 #endif
-	rq_last_tick_reset(rq);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
-/**
- * scheduler_tick_max_deferment
- *
- * Keep at least one tick per second when a single
- * active task is running because the scheduler doesn't
- * yet completely support full dynticks environment.
- *
- * This makes sure that uptime, CFS vruntime, load
- * balancing, etc... continue to move forward, even
- * with a very low granularity.
- *
- * Return: Maximum deferment in nanoseconds.
- */
-u64 scheduler_tick_max_deferment(void)
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
 {
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	int cpu = twork->cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
+
+	/*
+	 * Handle the tick only if it appears the remote CPU is running in full
+	 * dynticks mode. The check is racy by nature, but missing a tick or
+	 * having one too much is no big deal because the scheduler tick updates
+	 * statistics and checks timeslices in a time-independent way, regardless
+	 * of when exactly it is running.
+	 */
+	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+		struct task_struct *curr;
+		u64 delta;
 
-	next = rq->last_sched_tick + HZ;
+		rq_lock_irq(rq, &rf);
+		update_rq_clock(rq);
+		curr = rq->curr;
+		delta = rq_clock_task(rq) - curr->se.exec_start;
 
-	if (time_before_eq(next, now))
-		return 0;
+		/*
+		 * Make sure the next tick runs within a reasonable
+		 * amount of time.
+		 */
+		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+		curr->sched_class->task_tick(rq, curr, 0);
+		rq_unlock_irq(rq, &rf);
+	}
 
-	return jiffies_to_nsecs(next - now);
+	/*
+	 * Run the remote tick once per second (1Hz). This arbitrary
+	 * frequency is large enough to avoid overload but short enough
+	 * to keep scheduler internal stats reasonably up to date.
+	 */
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	BUG_ON(!tick_work_cpu);
+
+	return 0;
+}
+
+#else /* !CONFIG_NO_HZ_FULL */
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
 #endif
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -4892,7 +4942,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 *
 * Return: 0.
 */
-SYSCALL_DEFINE0(sched_yield)
+static void do_sched_yield(void)
 {
 	struct rq_flags rf;
 	struct rq *rq;
@@ -4913,7 +4963,11 @@ SYSCALL_DEFINE0(sched_yield)
 	sched_preempt_enable_no_resched();
 
 	schedule();
+}
 
+SYSCALL_DEFINE0(sched_yield)
+{
+	do_sched_yield();
 	return 0;
 }
 
@@ -4997,7 +5051,7 @@ EXPORT_SYMBOL(__cond_resched_softirq);
 void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
-	sys_sched_yield();
+	do_sched_yield();
 }
 EXPORT_SYMBOL(yield);
 
@@ -5786,6 +5840,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }
 
@@ -5797,6 +5852,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {
@@ -5809,7 +5865,7 @@ int sched_cpu_dying(unsigned int cpu)
 
 	calc_load_migrate(rq);
 	update_max_interval();
-	nohz_balance_exit_idle(cpu);
+	nohz_balance_exit_idle(rq);
 	hrtick_clear(rq);
 	return 0;
 }
@@ -6022,13 +6078,11 @@ void __init sched_init(void)
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
 		rq->last_load_update_tick = jiffies;
-		rq->nohz_flags = 0;
-#endif
-#ifdef CONFIG_NO_HZ_FULL
-		rq->last_sched_tick = 0;
+		rq->last_blocked_load_update_tick = jiffies;
+		atomic_set(&rq->nohz_flags, 0);
 #endif
 #endif /* CONFIG_SMP */
-		init_rq_hrtick(rq);
+		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
@@ -7027,3 +7081,5 @@ const u32 sched_prio_to_wmult[40] = {
 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
+
+#undef CREATE_TRACE_POINTS
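Note: several hunks above convert the preempt-notifier key from the old static_key API (static_key_slow_inc()/static_key_false()) to the static-branch API. For reference, the new interface is used roughly as in this minimal sketch, which is illustrative only and not part of this merge (the example_* names are made up):

	/* Illustrative only: the static-branch pattern the hunks above switch to. */
	static DEFINE_STATIC_KEY_FALSE(example_key);	/* key starts disabled */

	static void example_enable(void)
	{
		static_branch_inc(&example_key);	/* counted enable */
	}

	static void example_disable(void)
	{
		static_branch_dec(&example_key);	/* counted disable */
	}

	static bool example_fast_path(void)
	{
		/* Compiles to a patched jump; branch is off until enabled. */
		return static_branch_unlikely(&example_key);
	}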
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/cgroup.h>
-#include <linux/slab.h>
-#include <linux/percpu.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/kernel_stat.h>
-#include <linux/err.h>
-
-#include "sched.h"
-
 /*
 * CPU accounting code for task groups.
 *
 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
 * (balbir@in.ibm.com).
 */
+#include "sched.h"
 
-/* Time spent by the tasks of the cpu accounting group executing in ... */
+/* Time spent by the tasks of the CPU accounting group executing in ... */
 enum cpuacct_stat_index {
 	CPUACCT_STAT_USER,	/* ... user mode */
 	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
@@ -35,12 +24,12 @@ struct cpuacct_usage {
 	u64 usages[CPUACCT_STAT_NSTATS];
 };
 
-/* track cpu usage of a group of tasks and its child groups */
+/* track CPU usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
+	/* cpuusage holds pointer to a u64-type object on every CPU */
 	struct cpuacct_usage __percpu *cpuusage;
 	struct kernel_cpustat __percpu *cpustat;
 };
 
 static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 	return css ? container_of(css, struct cpuacct, css) : NULL;
 }
 
-/* return cpu accounting group to which this task belongs */
+/* Return CPU accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
 	return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
 	.cpuusage = &root_cpuacct_cpuusage,
 };
 
-/* create a new cpu accounting group */
+/* Create a new CPU accounting group */
 static struct cgroup_subsys_state *
 cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -96,7 +85,7 @@ out:
 	return ERR_PTR(-ENOMEM);
 }
 
-/* destroy an existing cpu accounting group */
+/* Destroy an existing CPU accounting group */
 static void cpuacct_css_free(struct cgroup_subsys_state *css)
 {
 	struct cpuacct *ca = css_ca(css);
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 #endif
 }
 
-/* return total cpu usage (in nanoseconds) of a group */
+/* Return total CPU usage (in nanoseconds) of a group */
 static u64 __cpuusage_read(struct cgroup_subsys_state *css,
 			   enum cpuacct_stat_index index)
 {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
-
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include "cpudeadline.h"
+#include "sched.h"
 
 static inline int parent(int i)
 {
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
 		return;
 
 	/* adapted from lib/prio_heap.c */
-	while(1) {
+	while (1) {
 		u64 largest_dl;
+
 		l = left_child(idx);
 		r = right_child(idx);
 		largest = idx;
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 		return 1;
 	} else {
 		int best_cpu = cpudl_maximum(cp);
+
 		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
 		if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -145,9 +143,9 @@
 }
 
 /*
- * cpudl_clear - remove a cpu from the cpudl max-heap
+ * cpudl_clear - remove a CPU from the cpudl max-heap
 * @cp: the cpudl max-heap context
- * @cpu: the target cpu
+ * @cpu: the target CPU
 *
 * Notes: assumes cpu_rq(cpu)->lock is locked
 *
@@ -186,8 +184,8 @@
 /*
 * cpudl_set - update the cpudl max-heap
 * @cp: the cpudl max-heap context
- * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
+ * @cpu: the target CPU
+ * @dl: the new earliest deadline for this CPU
 *
 * Notes: assumes cpu_rq(cpu)->lock is locked
 *
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 	old_idx = cp->elements[cpu].idx;
 	if (old_idx == IDX_INVALID) {
 		int new_idx = cp->size++;
+
 		cp->elements[new_idx].dl = dl;
 		cp->elements[new_idx].cpu = cpu;
 		cp->elements[cpu].idx = new_idx;
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 /*
 * cpudl_set_freecpu - Set the cpudl.free_cpus
 * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
 */
 void cpudl_set_freecpu(struct cpudl *cp, int cpu)
 {
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
 /*
 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
 * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
 */
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
 {
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CPUDL_H
-#define _LINUX_CPUDL_H
 
-#include <linux/sched.h>
-#include <linux/sched/deadline.h>
-
-#define IDX_INVALID -1
+#define IDX_INVALID		-1
 
 struct cpudl_item {
 	u64 dl;
 	int cpu;
 	int idx;
 };
 
 struct cpudl {
 	raw_spinlock_t lock;
 	int size;
 	cpumask_var_t free_cpus;
 	struct cpudl_item *elements;
 };
 
-
 #ifdef CONFIG_SMP
-int cpudl_find(struct cpudl *cp, struct task_struct *p,
-	       struct cpumask *later_mask);
+int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
 void cpudl_clear(struct cpudl *cp, int cpu);
 int cpudl_init(struct cpudl *cp);
 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
 #endif /* CONFIG_SMP */
-
-#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
-
 #include "sched.h"
 
 DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 617c6741c525..d2c6083304b4 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,56 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/cpufreq.h>
-#include <linux/kthread.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/slab.h>
-#include <trace/events/power.h>
-
 #include "sched.h"
 
+#include <trace/events/power.h>
+
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
 	unsigned int rate_limit_us;
 };
 
 struct sugov_policy {
 	struct cpufreq_policy *policy;
 
 	struct sugov_tunables *tunables;
 	struct list_head tunables_hook;
 
 	raw_spinlock_t update_lock;	/* For shared policies */
 	u64 last_freq_update_time;
 	s64 freq_update_delay_ns;
 	unsigned int next_freq;
 	unsigned int cached_raw_freq;
 
-	/* The next fields are only needed if fast switch cannot be used. */
+	/* The next fields are only needed if fast switch cannot be used: */
 	struct irq_work irq_work;
 	struct kthread_work work;
 	struct mutex work_lock;
 	struct kthread_worker worker;
 	struct task_struct *thread;
 	bool work_in_progress;
 
 	bool need_freq_update;
 };
 
 struct sugov_cpu {
 	struct update_util_data update_util;
 	struct sugov_policy *sg_policy;
 	unsigned int cpu;
 
 	bool iowait_boost_pending;
 	unsigned int iowait_boost;
 	unsigned int iowait_boost_max;
 	u64 last_update;
 
-	/* The fields below are only needed when sharing a policy. */
+	/* The fields below are only needed when sharing a policy: */
 	unsigned long util_cfs;
 	unsigned long util_dl;
 	unsigned long max;
-	unsigned int flags;
 
-	/* The field below is for single-CPU policies only. */
+	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned long saved_idle_calls;
 #endif
 };
 
@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 
	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
-	 * the @target_cpu, our per-cpu data is fully serialized.
+	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
-	 * However, drivers cannot in general deal with cross-cpu
+	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 	}
 
 	delta_ns = time - sg_policy->last_freq_update_time;
+
 	return delta_ns >= sg_policy->freq_update_delay_ns;
 }
 
@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
+	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long util;
+
+	if (rq->rt.rt_nr_running) {
+		util = sg_cpu->max;
+	} else {
+		util = sg_cpu->util_dl;
+		if (rq->cfs.h_nr_running)
+			util += sg_cpu->util_cfs;
+	}
+
	/*
	 * Ideally we would like to set util_dl as min/guaranteed freq and
	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
	 * ready for such an interface. So, we only do the latter for now.
	 */
-	return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
+	return min(util, sg_cpu->max);
 }
 
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
 {
-	if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
+	if (flags & SCHED_CPUFREQ_IOWAIT) {
 		if (sg_cpu->iowait_boost_pending)
 			return;
 
@@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
 static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
 #endif /* CONFIG_NO_HZ_COMMON */
 
+/*
+ * Make sugov_should_update_freq() ignore the rate limit when DL
+ * has increased the utilization.
+ */
+static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
+{
+	if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
+		sg_policy->need_freq_update = true;
+}
+
 static void sugov_update_single(struct update_util_data *hook, u64 time,
 				unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
-	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned long util, max;
 	unsigned int next_f;
 	bool busy;
 
-	sugov_set_iowait_boost(sg_cpu, time);
+	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
+	ignore_dl_rate_limit(sg_cpu, sg_policy);
+
 	if (!sugov_should_update_freq(sg_policy, time))
 		return;
 
 	busy = sugov_cpu_is_busy(sg_cpu);
 
-	if (flags & SCHED_CPUFREQ_RT) {
-		next_f = policy->cpuinfo.max_freq;
-	} else {
-		sugov_get_util(sg_cpu);
-		max = sg_cpu->max;
-		util = sugov_aggregate_util(sg_cpu);
-		sugov_iowait_boost(sg_cpu, &util, &max);
-		next_f = get_next_freq(sg_policy, util, max);
-		/*
-		 * Do not reduce the frequency if the CPU has not been idle
-		 * recently, as the reduction is likely to be premature then.
-		 */
-		if (busy && next_f < sg_policy->next_freq) {
-			next_f = sg_policy->next_freq;
-
+	sugov_get_util(sg_cpu);
+	max = sg_cpu->max;
+	util = sugov_aggregate_util(sg_cpu);
+	sugov_iowait_boost(sg_cpu, &util, &max);
+	next_f = get_next_freq(sg_policy, util, max);
+	/*
+	 * Do not reduce the frequency if the CPU has not been idle
+	 * recently, as the reduction is likely to be premature then.
+	 */
+	if (busy && next_f < sg_policy->next_freq) {
+		next_f = sg_policy->next_freq;
+
 		/* Reset cached freq as next_freq has changed */
 		sg_policy->cached_raw_freq = 0;
-		}
 	}
+
 	sugov_update_commit(sg_policy, time, next_f);
 }
 
@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 		unsigned long j_util, j_max;
 		s64 delta_ns;
 
+		sugov_get_util(j_sg_cpu);
+
		/*
		 * If the CFS CPU utilization was last updated before the
		 * previous frequency update and the time elapsed between the
@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 		if (delta_ns > TICK_NSEC) {
 			j_sg_cpu->iowait_boost = 0;
 			j_sg_cpu->iowait_boost_pending = false;
-			j_sg_cpu->util_cfs = 0;
-			if (j_sg_cpu->util_dl == 0)
-				continue;
 		}
-		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
-			return policy->cpuinfo.max_freq;
 
 		j_max = j_sg_cpu->max;
 		j_util = sugov_aggregate_util(j_sg_cpu);
+		sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
 		if (j_util * max > j_max * util) {
 			util = j_util;
 			max = j_max;
 		}
-
-		sugov_iowait_boost(j_sg_cpu, &util, &max);
 	}
 
 	return get_next_freq(sg_policy, util, max);
 }
 
-static void sugov_update_shared(struct update_util_data *hook, u64 time,
-				unsigned int flags)
+static void
+sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 
 	raw_spin_lock(&sg_policy->update_lock);
 
-	sugov_get_util(sg_cpu);
-	sg_cpu->flags = flags;
-
-	sugov_set_iowait_boost(sg_cpu, time);
+	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
-	if (sugov_should_update_freq(sg_policy, time)) {
-		if (flags & SCHED_CPUFREQ_RT)
-			next_f = sg_policy->policy->cpuinfo.max_freq;
-		else
-			next_f = sugov_next_freq_shared(sg_cpu, time);
+	ignore_dl_rate_limit(sg_cpu, sg_policy);
 
+	if (sugov_should_update_freq(sg_policy, time)) {
+		next_f = sugov_next_freq_shared(sg_cpu, time);
 		sugov_update_commit(sg_policy, time, next_f);
 	}
 
@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 	return sprintf(buf, "%u\n", tunables->rate_limit_us);
 }
 
-static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
-				   size_t count)
+static ssize_t
+rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
 {
 	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 	struct sugov_policy *sg_policy;
@@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 {
 	struct task_struct *thread;
 	struct sched_attr attr = {
-		.size = sizeof(struct sched_attr),
-		.sched_policy = SCHED_DEADLINE,
-		.sched_flags = SCHED_FLAG_SUGOV,
-		.sched_nice = 0,
-		.sched_priority = 0,
+		.size		= sizeof(struct sched_attr),
+		.sched_policy	= SCHED_DEADLINE,
+		.sched_flags	= SCHED_FLAG_SUGOV,
+		.sched_nice	= 0,
+		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
@@ -662,21 +668,20 @@ static int sugov_start(struct cpufreq_policy *policy)
 	struct sugov_policy *sg_policy = policy->governor_data;
 	unsigned int cpu;
 
 	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
 	sg_policy->last_freq_update_time = 0;
 	sg_policy->next_freq = UINT_MAX;
 	sg_policy->work_in_progress = false;
 	sg_policy->need_freq_update = false;
 	sg_policy->cached_raw_freq = 0;
 
 	for_each_cpu(cpu, policy->cpus) {
 		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 
 		memset(sg_cpu, 0, sizeof(*sg_cpu));
 		sg_cpu->cpu = cpu;
 		sg_cpu->sg_policy = sg_policy;
-		sg_cpu->flags = 0;
-		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
 	}
 
 	for_each_cpu(cpu, policy->cpus) {
@@ -720,14 +725,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
 }
 
 static struct cpufreq_governor schedutil_gov = {
-	.name = "schedutil",
-	.owner = THIS_MODULE,
-	.dynamic_switching = true,
-	.init = sugov_init,
-	.exit = sugov_exit,
-	.start = sugov_start,
-	.stop = sugov_stop,
-	.limits = sugov_limits,
+	.name			= "schedutil",
+	.owner			= THIS_MODULE,
+	.dynamic_switching	= true,
+	.init			= sugov_init,
+	.exit			= sugov_exit,
+	.start			= sugov_start,
+	.stop			= sugov_stop,
+	.limits			= sugov_limits,
 };
 
 #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
 *
 * going from the lowest priority to the highest. CPUs in the INVALID state
 * are not eligible for routing. The system maintains this state with
- * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * a 2 dimensional bitmap (the first for priority class, the second for CPUs
 * in that class). Therefore a typical application without affinity
 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
 * searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,12 +26,7 @@
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
-
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/slab.h>
-#include "cpupri.h"
+#include "sched.h"
 
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
 static int convert_prio(int prio)
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 }
 
 /**
- * cpupri_set - update the cpu priority setting
+ * cpupri_set - update the CPU priority setting
 * @cp: The cpupri context
- * @cpu: The target cpu
+ * @cpu: The target CPU
 * @newpri: The priority (INVALID-RT99) to assign to this CPU
 *
 * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 		return;
 
	/*
-	 * If the cpu was currently mapped to a different value, we
+	 * If the CPU was currently mapped to a different value, we
	 * need to map it to the new value then remove the old value.
	 * Note, we must add the new value first, otherwise we risk the
	 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,25 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_CPUPRI_H
-#define _LINUX_CPUPRI_H
-
-#include <linux/sched.h>
 
 #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
 
 #define CPUPRI_INVALID -1
 #define CPUPRI_IDLE 0
 #define CPUPRI_NORMAL 1
 /* values 2-101 are RT priorities 0-99 */
 
 struct cpupri_vec {
 	atomic_t count;
 	cpumask_var_t mask;
 };
 
 struct cpupri {
 	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
 	int *cpu_to_pri;
 };
 
 #ifdef CONFIG_SMP
-int cpupri_find(struct cpupri *cp,
-		struct task_struct *p, struct cpumask *lowest_mask);
+int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
 int cpupri_init(struct cpupri *cp);
 void cpupri_cleanup(struct cpupri *cp);
 #endif
-
-#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
1#include <linux/export.h> 1/*
2#include <linux/sched.h> 2 * Simple CPU accounting cgroup controller
3#include <linux/tsacct_kern.h> 3 */
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
7#include <linux/sched/cputime.h>
8#include "sched.h" 4#include "sched.h"
9 5
10#ifdef CONFIG_IRQ_TIME_ACCOUNTING 6#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
113} 109}
114 110
115/* 111/*
116 * Account user cpu time to a process. 112 * Account user CPU time to a process.
117 * @p: the process that the cpu time gets accounted to 113 * @p: the process that the CPU time gets accounted to
118 * @cputime: the cpu time spent in user space since the last update 114 * @cputime: the CPU time spent in user space since the last update
119 */ 115 */
120void account_user_time(struct task_struct *p, u64 cputime) 116void account_user_time(struct task_struct *p, u64 cputime)
121{ 117{
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
135} 131}
136 132
137/* 133/*
138 * Account guest cpu time to a process. 134 * Account guest CPU time to a process.
139 * @p: the process that the cpu time gets accounted to 135 * @p: the process that the CPU time gets accounted to
140 * @cputime: the cpu time spent in virtual machine since the last update 136 * @cputime: the CPU time spent in virtual machine since the last update
141 */ 137 */
142void account_guest_time(struct task_struct *p, u64 cputime) 138void account_guest_time(struct task_struct *p, u64 cputime)
143{ 139{
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
159} 155}
160 156
161/* 157/*
162 * Account system cpu time to a process and desired cpustat field 158 * Account system CPU time to a process and desired cpustat field
163 * @p: the process that the cpu time gets accounted to 159 * @p: the process that the CPU time gets accounted to
164 * @cputime: the cpu time spent in kernel space since the last update 160 * @cputime: the CPU time spent in kernel space since the last update
165 * @index: pointer to cpustat field that has to be updated 161 * @index: pointer to cpustat field that has to be updated
166 */ 162 */
167void account_system_index_time(struct task_struct *p, 163void account_system_index_time(struct task_struct *p,
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
179} 175}
180 176
181/* 177/*
182 * Account system cpu time to a process. 178 * Account system CPU time to a process.
183 * @p: the process that the cpu time gets accounted to 179 * @p: the process that the CPU time gets accounted to
184 * @hardirq_offset: the offset to subtract from hardirq_count() 180 * @hardirq_offset: the offset to subtract from hardirq_count()
185 * @cputime: the cpu time spent in kernel space since the last update 181 * @cputime: the CPU time spent in kernel space since the last update
186 */ 182 */
187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 183void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
188{ 184{
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
205 201
206/* 202/*
207 * Account for involuntary wait time. 203 * Account for involuntary wait time.
208 * @cputime: the cpu time spent in involuntary wait 204 * @cputime: the CPU time spent in involuntary wait
209 */ 205 */
210void account_steal_time(u64 cputime) 206void account_steal_time(u64 cputime)
211{ 207{
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
216 212
217/* 213/*
218 * Account for idle time. 214 * Account for idle time.
219 * @cputime: the cpu time spent in idle wait 215 * @cputime: the CPU time spent in idle wait
220 */ 216 */
221void account_idle_time(u64 cputime) 217void account_idle_time(u64 cputime)
222{ 218{
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
338#ifdef CONFIG_IRQ_TIME_ACCOUNTING 334#ifdef CONFIG_IRQ_TIME_ACCOUNTING
339/* 335/*
340 * Account a tick to a process and cpustat 336 * Account a tick to a process and cpustat
341 * @p: the process that the cpu time gets accounted to 337 * @p: the process that the CPU time gets accounted to
342 * @user_tick: is the tick from userspace 338 * @user_tick: is the tick from userspace
343 * @rq: the pointer to rq 339 * @rq: the pointer to rq
344 * 340 *
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
400 irqtime_account_process_tick(current, 0, rq, ticks); 396 irqtime_account_process_tick(current, 0, rq, ticks);
401} 397}
402#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 398#else /* CONFIG_IRQ_TIME_ACCOUNTING */
403static inline void irqtime_account_idle_ticks(int ticks) {} 399static inline void irqtime_account_idle_ticks(int ticks) { }
404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 400static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
405 struct rq *rq, int nr_ticks) {} 401 struct rq *rq, int nr_ticks) { }
406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 402#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
407 403
408/* 404/*
409 * Use precise platform statistics if available: 405 * Use precise platform statistics if available:
410 */ 406 */
411#ifdef CONFIG_VIRT_CPU_ACCOUNTING 407#ifdef CONFIG_VIRT_CPU_ACCOUNTING
412 408# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
413#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
414void vtime_common_task_switch(struct task_struct *prev) 409void vtime_common_task_switch(struct task_struct *prev)
415{ 410{
416 if (is_idle_task(prev)) 411 if (is_idle_task(prev))
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
421 vtime_flush(prev); 416 vtime_flush(prev);
422 arch_vtime_task_switch(prev); 417 arch_vtime_task_switch(prev);
423} 418}
424#endif 419# endif
425
426#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 420#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
427 421
428 422
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
469 *ut = cputime.utime; 463 *ut = cputime.utime;
470 *st = cputime.stime; 464 *st = cputime.stime;
471} 465}
472#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 466
467#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
468
473/* 469/*
474 * Account a single tick of cpu time. 470 * Account a single tick of CPU time.
475 * @p: the process that the cpu time gets accounted to 471 * @p: the process that the CPU time gets accounted to
476 * @user_tick: indicates if the tick is a user or a system tick 472 * @user_tick: indicates if the tick is a user or a system tick
477 */ 473 */
478void account_process_tick(struct task_struct *p, int user_tick) 474void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..d1c7bf7c7e5b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19 19
20#include <linux/slab.h>
21#include <uapi/linux/sched/types.h>
22
23struct dl_bandwidth def_dl_bandwidth; 20struct dl_bandwidth def_dl_bandwidth;
24 21
25static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) 22static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
87 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ 84 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
88 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); 85 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
89 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 86 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
90 cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); 87 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
91} 88}
92 89
93static inline 90static inline
@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
101 if (dl_rq->running_bw > old) 98 if (dl_rq->running_bw > old)
102 dl_rq->running_bw = 0; 99 dl_rq->running_bw = 0;
103 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 100 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
104 cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); 101 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
105} 102}
106 103
107static inline 104static inline
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
514static void push_dl_tasks(struct rq *); 511static void push_dl_tasks(struct rq *);
515static void pull_dl_task(struct rq *); 512static void pull_dl_task(struct rq *);
516 513
517static inline void queue_push_tasks(struct rq *rq) 514static inline void deadline_queue_push_tasks(struct rq *rq)
518{ 515{
519 if (!has_pushable_dl_tasks(rq)) 516 if (!has_pushable_dl_tasks(rq))
520 return; 517 return;
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
522 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); 519 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
523} 520}
524 521
525static inline void queue_pull_task(struct rq *rq) 522static inline void deadline_queue_pull_task(struct rq *rq)
526{ 523{
527 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); 524 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
528} 525}
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
539 536
540 /* 537 /*
541 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
542 * online cpu. 539 * online CPU:
543 */ 540 */
544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
545 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
546 /* 543 /*
547 * Fail to find any suitable cpu. 544 * Failed to find any suitable CPU.
548 * The task will never come back! 545 * The task will never come back!
549 */ 546 */
550 BUG_ON(dl_bandwidth_enabled()); 547 BUG_ON(dl_bandwidth_enabled());
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
597{ 594{
598} 595}
599 596
600static inline void queue_push_tasks(struct rq *rq) 597static inline void deadline_queue_push_tasks(struct rq *rq)
601{ 598{
602} 599}
603 600
604static inline void queue_pull_task(struct rq *rq) 601static inline void deadline_queue_pull_task(struct rq *rq)
605{ 602{
606} 603}
607#endif /* CONFIG_SMP */ 604#endif /* CONFIG_SMP */
608 605
609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 606static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 607static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 608static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
612 int flags);
613 609
614/* 610/*
615 * We are being explicitly informed that a new instance is starting, 611 * We are being explicitly informed that a new instance is starting,
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1763 if (hrtick_enabled(rq)) 1759 if (hrtick_enabled(rq))
1764 start_hrtick_dl(rq, p); 1760 start_hrtick_dl(rq, p);
1765 1761
1766 queue_push_tasks(rq); 1762 deadline_queue_push_tasks(rq);
1767 1763
1768 return p; 1764 return p;
1769} 1765}
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1776 enqueue_pushable_dl_task(rq, p); 1772 enqueue_pushable_dl_task(rq, p);
1777} 1773}
1778 1774
1775/*
1776 * scheduler tick hitting a task of our scheduling class.
1777 *
1778 * NOTE: This function can be called remotely by the tick offload that
1779 * goes along full dynticks. Therefore no local assumption can be made
1780 * and everything must be accessed through the @rq and @curr passed in
1781 * parameters.
1782 */
1779static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) 1783static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1780{ 1784{
1781 update_curr_dl(rq); 1785 update_curr_dl(rq);
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
1865 1869
1866 /* 1870 /*
1867 * We have to consider system topology and task affinity 1871 * We have to consider system topology and task affinity
1868 * first, then we can look for a suitable cpu. 1872 * first, then we can look for a suitable CPU.
1869 */ 1873 */
1870 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) 1874 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
1871 return -1; 1875 return -1;
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
1879 * Now we check how well this matches with task's 1883 * Now we check how well this matches with task's
1880 * affinity and system topology. 1884 * affinity and system topology.
1881 * 1885 *
1882 * The last cpu where the task run is our first 1886 * The last CPU where the task run is our first
1883 * guess, since it is most likely cache-hot there. 1887 * guess, since it is most likely cache-hot there.
1884 */ 1888 */
1885 if (cpumask_test_cpu(cpu, later_mask)) 1889 if (cpumask_test_cpu(cpu, later_mask))
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
1909 best_cpu = cpumask_first_and(later_mask, 1913 best_cpu = cpumask_first_and(later_mask,
1910 sched_domain_span(sd)); 1914 sched_domain_span(sd));
1911 /* 1915 /*
1912 * Last chance: if a cpu being in both later_mask 1916 * Last chance: if a CPU being in both later_mask
1913 * and current sd span is valid, that becomes our 1917 * and current sd span is valid, that becomes our
1914 * choice. Of course, the latest possible cpu is 1918 * choice. Of course, the latest possible CPU is
1915 * already under consideration through later_mask. 1919 * already under consideration through later_mask.
1916 */ 1920 */
1917 if (best_cpu < nr_cpu_ids) { 1921 if (best_cpu < nr_cpu_ids) {
@@ -2067,7 +2071,7 @@ retry:
2067 if (task == next_task) { 2071 if (task == next_task) {
2068 /* 2072 /*
2069 * The task is still there. We don't try 2073 * The task is still there. We don't try
2070 * again, some other cpu will pull it when ready. 2074 * again, some other CPU will pull it when ready.
2071 */ 2075 */
2072 goto out; 2076 goto out;
2073 } 2077 }
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
2300 /* 2304 /*
2301 * Since this might be the only -deadline task on the rq, 2305 * Since this might be the only -deadline task on the rq,
2302 * this is the right place to try to pull some other one 2306 * this is the right place to try to pull some other one
2303 * from an overloaded cpu, if any. 2307 * from an overloaded CPU, if any.
2304 */ 2308 */
2305 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) 2309 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2306 return; 2310 return;
2307 2311
2308 queue_pull_task(rq); 2312 deadline_queue_pull_task(rq);
2309} 2313}
2310 2314
2311/* 2315/*
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
2327 if (rq->curr != p) { 2331 if (rq->curr != p) {
2328#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2329 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 2333 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
2330 queue_push_tasks(rq); 2334 deadline_queue_push_tasks(rq);
2331#endif 2335#endif
2332 if (dl_task(rq->curr)) 2336 if (dl_task(rq->curr))
2333 check_preempt_curr_dl(rq, p, 0); 2337 check_preempt_curr_dl(rq, p, 0);
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
2352 * or lowering its prio, so... 2356 * or lowering its prio, so...
2353 */ 2357 */
2354 if (!rq->dl.overloaded) 2358 if (!rq->dl.overloaded)
2355 queue_pull_task(rq); 2359 deadline_queue_pull_task(rq);
2356 2360
2357 /* 2361 /*
2358 * If we now have a earlier deadline task than p, 2362 * If we now have a earlier deadline task than p,
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
2626{ 2630{
2627 struct sched_dl_entity *dl_se = &p->dl; 2631 struct sched_dl_entity *dl_se = &p->dl;
2628 2632
2629 dl_se->dl_runtime = 0; 2633 dl_se->dl_runtime = 0;
2630 dl_se->dl_deadline = 0; 2634 dl_se->dl_deadline = 0;
2631 dl_se->dl_period = 0; 2635 dl_se->dl_period = 0;
2632 dl_se->flags = 0; 2636 dl_se->flags = 0;
2633 dl_se->dl_bw = 0; 2637 dl_se->dl_bw = 0;
2634 dl_se->dl_density = 0; 2638 dl_se->dl_density = 0;
2635 2639
2636 dl_se->dl_throttled = 0; 2640 dl_se->dl_throttled = 0;
2637 dl_se->dl_yielded = 0; 2641 dl_se->dl_yielded = 0;
2638 dl_se->dl_non_contending = 0; 2642 dl_se->dl_non_contending = 0;
2639 dl_se->dl_overrun = 0; 2643 dl_se->dl_overrun = 0;
2640} 2644}
2641 2645
2642bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 2646bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2655#ifdef CONFIG_SMP 2659#ifdef CONFIG_SMP
2656int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) 2660int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2657{ 2661{
2658 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 2662 unsigned int dest_cpu;
2659 cs_cpus_allowed);
2660 struct dl_bw *dl_b; 2663 struct dl_bw *dl_b;
2661 bool overflow; 2664 bool overflow;
2662 int cpus, ret; 2665 int cpus, ret;
2663 unsigned long flags; 2666 unsigned long flags;
2664 2667
2668 dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
2669
2665 rcu_read_lock_sched(); 2670 rcu_read_lock_sched();
2666 dl_b = dl_bw_of(dest_cpu); 2671 dl_b = dl_bw_of(dest_cpu);
2667 raw_spin_lock_irqsave(&dl_b->lock, flags); 2672 raw_spin_lock_irqsave(&dl_b->lock, flags);
2668 cpus = dl_bw_cpus(dest_cpu); 2673 cpus = dl_bw_cpus(dest_cpu);
2669 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 2674 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2670 if (overflow) 2675 if (overflow) {
2671 ret = -EBUSY; 2676 ret = -EBUSY;
2672 else { 2677 } else {
2673 /* 2678 /*
2674 * We reserve space for this task in the destination 2679 * We reserve space for this task in the destination
2675 * root_domain, as we can't fail after this point. 2680 * root_domain, as we can't fail after this point.
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
2681 } 2686 }
2682 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2687 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2683 rcu_read_unlock_sched(); 2688 rcu_read_unlock_sched();
2689
2684 return ret; 2690 return ret;
2685} 2691}
2686 2692
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2701 ret = 0; 2707 ret = 0;
2702 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 2708 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2703 rcu_read_unlock_sched(); 2709 rcu_read_unlock_sched();
2710
2704 return ret; 2711 return ret;
2705} 2712}
2706 2713
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
2718 overflow = __dl_overflow(dl_b, cpus, 0, 0); 2725 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2719 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2726 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2720 rcu_read_unlock_sched(); 2727 rcu_read_unlock_sched();
2728
2721 return overflow; 2729 return overflow;
2722} 2730}
2723#endif 2731#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 72c401b3b15c..15b10e210a6b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * kernel/sched/debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree and other debugging details
5 * 5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 * 7 *
@@ -9,16 +9,6 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched/mm.h>
15#include <linux/sched/task.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/utsname.h>
19#include <linux/mempolicy.h>
20#include <linux/debugfs.h>
21
22#include "sched.h" 12#include "sched.h"
23 13
24static DEFINE_SPINLOCK(sched_debug_lock); 14static DEFINE_SPINLOCK(sched_debug_lock);
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
274 if (table == NULL) 264 if (table == NULL)
275 return NULL; 265 return NULL;
276 266
277 set_table_entry(&table[0], "min_interval", &sd->min_interval, 267 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
278 sizeof(long), 0644, proc_doulongvec_minmax, false); 268 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
279 set_table_entry(&table[1], "max_interval", &sd->max_interval, 269 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
280 sizeof(long), 0644, proc_doulongvec_minmax, false); 270 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
281 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 271 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
282 sizeof(int), 0644, proc_dointvec_minmax, true); 272 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
283 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 273 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
284 sizeof(int), 0644, proc_dointvec_minmax, true); 274 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
285 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 275 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
286 sizeof(int), 0644, proc_dointvec_minmax, true); 276 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 277 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
288 sizeof(int), 0644, proc_dointvec_minmax, true); 278 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
289 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 279 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
290 sizeof(int), 0644, proc_dointvec_minmax, true);
291 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
292 sizeof(int), 0644, proc_dointvec_minmax, false);
293 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
294 sizeof(int), 0644, proc_dointvec_minmax, false);
295 set_table_entry(&table[9], "cache_nice_tries",
296 &sd->cache_nice_tries,
297 sizeof(int), 0644, proc_dointvec_minmax, false);
298 set_table_entry(&table[10], "flags", &sd->flags,
299 sizeof(int), 0644, proc_dointvec_minmax, false);
300 set_table_entry(&table[11], "max_newidle_lb_cost",
301 &sd->max_newidle_lb_cost,
302 sizeof(long), 0644, proc_doulongvec_minmax, false);
303 set_table_entry(&table[12], "name", sd->name,
304 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
305 /* &table[13] is terminator */ 280 /* &table[13] is terminator */
306 281
307 return table; 282 return table;
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
332 return table; 307 return table;
333} 308}
334 309
335static cpumask_var_t sd_sysctl_cpus; 310static cpumask_var_t sd_sysctl_cpus;
336static struct ctl_table_header *sd_sysctl_header; 311static struct ctl_table_header *sd_sysctl_header;
337 312
338void register_sched_domain_sysctl(void) 313void register_sched_domain_sysctl(void)
339{ 314{
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
413{ 388{
414 struct sched_entity *se = tg->se[cpu]; 389 struct sched_entity *se = tg->se[cpu];
415 390
416#define P(F) \ 391#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
417 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 392#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
418#define P_SCHEDSTAT(F) \ 393#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
419 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) 394#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
420#define PN(F) \
421 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
422#define PN_SCHEDSTAT(F) \
423 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
424 395
425 if (!se) 396 if (!se)
426 return; 397 return;
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
428 PN(se->exec_start); 399 PN(se->exec_start);
429 PN(se->vruntime); 400 PN(se->vruntime);
430 PN(se->sum_exec_runtime); 401 PN(se->sum_exec_runtime);
402
431 if (schedstat_enabled()) { 403 if (schedstat_enabled()) {
432 PN_SCHEDSTAT(se->statistics.wait_start); 404 PN_SCHEDSTAT(se->statistics.wait_start);
433 PN_SCHEDSTAT(se->statistics.sleep_start); 405 PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
440 PN_SCHEDSTAT(se->statistics.wait_sum); 412 PN_SCHEDSTAT(se->statistics.wait_sum);
441 P_SCHEDSTAT(se->statistics.wait_count); 413 P_SCHEDSTAT(se->statistics.wait_count);
442 } 414 }
415
443 P(se->load.weight); 416 P(se->load.weight);
444 P(se->runnable_weight); 417 P(se->runnable_weight);
445#ifdef CONFIG_SMP 418#ifdef CONFIG_SMP
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
464 return group_path; 437 return group_path;
465 438
466 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 439 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
440
467 return group_path; 441 return group_path;
468} 442}
469#endif 443#endif
@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
569 cfs_rq->avg.runnable_load_avg); 543 cfs_rq->avg.runnable_load_avg);
570 SEQ_printf(m, " .%-30s: %lu\n", "util_avg", 544 SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
571 cfs_rq->avg.util_avg); 545 cfs_rq->avg.util_avg);
546 SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
547 cfs_rq->avg.util_est.enqueued);
572 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", 548 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
573 cfs_rq->removed.load_avg); 549 cfs_rq->removed.load_avg);
574 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", 550 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void)
804/* 780/*
805 * This itererator needs some explanation. 781 * This itererator needs some explanation.
806 * It returns 1 for the header position. 782 * It returns 1 for the header position.
807 * This means 2 is cpu 0. 783 * This means 2 is CPU 0.
808 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 784 * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
809 * to use cpumask_* to iterate over the cpus. 785 * to use cpumask_* to iterate over the CPUs.
810 */ 786 */
811static void *sched_debug_start(struct seq_file *file, loff_t *offset) 787static void *sched_debug_start(struct seq_file *file, loff_t *offset)
812{ 788{
@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
826 802
827 if (n < nr_cpu_ids) 803 if (n < nr_cpu_ids)
828 return (void *)(unsigned long)(n + 2); 804 return (void *)(unsigned long)(n + 2);
805
829 return NULL; 806 return NULL;
830} 807}
831 808
@@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
840} 817}
841 818
842static const struct seq_operations sched_debug_sops = { 819static const struct seq_operations sched_debug_sops = {
843 .start = sched_debug_start, 820 .start = sched_debug_start,
844 .next = sched_debug_next, 821 .next = sched_debug_next,
845 .stop = sched_debug_stop, 822 .stop = sched_debug_stop,
846 .show = sched_debug_show, 823 .show = sched_debug_show,
847}; 824};
848 825
849static int sched_debug_release(struct inode *inode, struct file *file) 826static int sched_debug_release(struct inode *inode, struct file *file)
@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void)
881 858
882__initcall(init_sched_debug_procfs); 859__initcall(init_sched_debug_procfs);
883 860
884#define __P(F) \ 861#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
885 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 862#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
886#define P(F) \ 863#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
887 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 864#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
888#define __PN(F) \
889 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
890#define PN(F) \
891 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
892 865
893 866
894#ifdef CONFIG_NUMA_BALANCING 867#ifdef CONFIG_NUMA_BALANCING
@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
1023 P(se.avg.runnable_load_avg); 996 P(se.avg.runnable_load_avg);
1024 P(se.avg.util_avg); 997 P(se.avg.util_avg);
1025 P(se.avg.last_update_time); 998 P(se.avg.last_update_time);
999 P(se.avg.util_est.ewma);
1000 P(se.avg.util_est.enqueued);
1026#endif 1001#endif
1027 P(policy); 1002 P(policy);
1028 P(prio); 1003 P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..0951d1c58d2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,10 @@
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */ 22 */
23 23#include "sched.h"
24#include <linux/sched/mm.h>
25#include <linux/sched/topology.h>
26
27#include <linux/latencytop.h>
28#include <linux/cpumask.h>
29#include <linux/cpuidle.h>
30#include <linux/slab.h>
31#include <linux/profile.h>
32#include <linux/interrupt.h>
33#include <linux/mempolicy.h>
34#include <linux/migrate.h>
35#include <linux/task_work.h>
36#include <linux/sched/isolation.h>
37 24
38#include <trace/events/sched.h> 25#include <trace/events/sched.h>
39 26
40#include "sched.h"
41
42/* 27/*
43 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
44 * 29 *
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
103 88
104#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
105/* 90/*
106 * For asym packing, by default the lower numbered cpu has higher priority. 91 * For asym packing, by default the lower numbered CPU has higher priority.
107 */ 92 */
108int __weak arch_asym_cpu_priority(int cpu) 93int __weak arch_asym_cpu_priority(int cpu)
109{ 94{
@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
787 * For !fair tasks do: 772 * For !fair tasks do:
788 * 773 *
789 update_cfs_rq_load_avg(now, cfs_rq); 774 update_cfs_rq_load_avg(now, cfs_rq);
790 attach_entity_load_avg(cfs_rq, se); 775 attach_entity_load_avg(cfs_rq, se, 0);
791 switched_from_fair(rq, p); 776 switched_from_fair(rq, p);
792 * 777 *
793 * such that the next switched_to_fair() has the 778 * such that the next switched_to_fair() has the
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
1181} 1166}
1182 1167
1183/* 1168/*
1184 * The averaged statistics, shared & private, memory & cpu, 1169 * The averaged statistics, shared & private, memory & CPU,
1185 * occupy the first half of the array. The second half of the 1170 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the 1171 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement. 1172 * first set by task_numa_placement.
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
1587 * be incurred if the tasks were swapped. 1572 * be incurred if the tasks were swapped.
1588 */ 1573 */
1589 if (cur) { 1574 if (cur) {
1590 /* Skip this swap candidate if cannot move to the source cpu */ 1575 /* Skip this swap candidate if cannot move to the source CPU: */
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1576 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock; 1577 goto unlock;
1593 1578
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
1631 goto balance; 1616 goto balance;
1632 } 1617 }
1633 1618
1634 /* Balance doesn't matter much if we're running a task per cpu */ 1619 /* Balance doesn't matter much if we're running a task per CPU: */
1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1620 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1) 1621 dst_rq->nr_running == 1)
1637 goto assign; 1622 goto assign;
@@ -1676,7 +1661,7 @@ balance:
1676 */ 1661 */
1677 if (!cur) { 1662 if (!cur) {
1678 /* 1663 /*
1679 * select_idle_siblings() uses an per-cpu cpumask that 1664 * select_idle_siblings() uses an per-CPU cpumask that
1680 * can be used from IRQ context. 1665 * can be used from IRQ context.
1681 */ 1666 */
1682 local_irq_disable(); 1667 local_irq_disable();
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
1869static void numa_migrate_preferred(struct task_struct *p) 1854static void numa_migrate_preferred(struct task_struct *p)
1870{ 1855{
1871 unsigned long interval = HZ; 1856 unsigned long interval = HZ;
1857 unsigned long numa_migrate_retry;
1872 1858
1873 /* This task has no NUMA fault statistics yet */ 1859 /* This task has no NUMA fault statistics yet */
1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1860 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
1876 1862
1877 /* Periodically retry migrating the task to the preferred node */ 1863 /* Periodically retry migrating the task to the preferred node */
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1864 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval; 1865 numa_migrate_retry = jiffies + interval;
1866
1867 /*
1868 * Check that the new retry threshold is after the current one. If
1869 * the retry is in the future, it implies that wake_affine has
1870 * temporarily asked NUMA balancing to backoff from placement.
1871 */
1872 if (numa_migrate_retry > p->numa_migrate_retry)
1873 return;
1874
1875 /* Safe to try placing the task on the preferred node */
1876 p->numa_migrate_retry = numa_migrate_retry;
1880 1877
1881 /* Success if task is already running on preferred CPU */ 1878 /* Success if task is already running on preferred CPU */
1882 if (task_node(p) == p->numa_preferred_nid) 1879 if (task_node(p) == p->numa_preferred_nid)
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
2823} 2820}
2824 2821
2825#ifdef CONFIG_FAIR_GROUP_SCHED 2822#ifdef CONFIG_FAIR_GROUP_SCHED
2826# ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2827/* 2824/*
2828 * All this does is approximate the hierarchical proportion which includes that 2825 * All this does is approximate the hierarchical proportion which includes that
2829 * global sum we all love to hate. 2826 * global sum we all love to hate.
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2974 2971
2975 return clamp_t(long, runnable, MIN_SHARES, shares); 2972 return clamp_t(long, runnable, MIN_SHARES, shares);
2976} 2973}
2977# endif /* CONFIG_SMP */ 2974#endif /* CONFIG_SMP */
2978 2975
2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2976static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2980 2977
@@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se)
3012} 3009}
3013#endif /* CONFIG_FAIR_GROUP_SCHED */ 3010#endif /* CONFIG_FAIR_GROUP_SCHED */
3014 3011
3015static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 3012static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3016{ 3013{
3017 struct rq *rq = rq_of(cfs_rq); 3014 struct rq *rq = rq_of(cfs_rq);
3018 3015
3019 if (&rq->cfs == cfs_rq) { 3016 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3020 /* 3017 /*
3021 * There are a few boundary cases this might miss but it should 3018 * There are a few boundary cases this might miss but it should
3022 * get called often enough that that should (hopefully) not be 3019 * get called often enough that that should (hopefully) not be
@@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3031 * 3028 *
3032 * See cpu_util(). 3029 * See cpu_util().
3033 */ 3030 */
3034 cpufreq_update_util(rq, 0); 3031 cpufreq_update_util(rq, flags);
3035 } 3032 }
3036} 3033}
3037 3034
@@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
3246} 3243}
3247 3244
3248/* 3245/*
3246  * When a task is dequeued, its estimated utilization should not be updated if
3247 * its util_avg has not been updated at least once.
3248 * This flag is used to synchronize util_avg updates with util_est updates.
3249 * We map this information into the LSB bit of the utilization saved at
3250 * dequeue time (i.e. util_est.dequeued).
3251 */
3252#define UTIL_AVG_UNCHANGED 0x1
3253
3254static inline void cfs_se_util_change(struct sched_avg *avg)
3255{
3256 unsigned int enqueued;
3257
3258 if (!sched_feat(UTIL_EST))
3259 return;
3260
3261 /* Avoid store if the flag has been already set */
3262 enqueued = avg->util_est.enqueued;
3263 if (!(enqueued & UTIL_AVG_UNCHANGED))
3264 return;
3265
3266 /* Reset flag to report util_avg has been updated */
3267 enqueued &= ~UTIL_AVG_UNCHANGED;
3268 WRITE_ONCE(avg->util_est.enqueued, enqueued);
3269}
3270
3271/*
3249 * sched_entity: 3272 * sched_entity:
3250 * 3273 *
3251 * task: 3274 * task:
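
The UTIL_AVG_UNCHANGED flag added above multiplexes one bit of state into the LSB of the utilization value saved at dequeue time: it is set when the value is stored and cleared by the first PELT update of util_avg, so a still-set flag later means the EWMA update can be skipped. A minimal userspace sketch of the idea; apart from UTIL_AVG_UNCHANGED itself, all names are local to the example.

#include <stdio.h>

#define UTIL_AVG_UNCHANGED 0x1u

static unsigned int util_est_enqueued;  /* stands in for avg->util_est.enqueued */

static void save_util_at_dequeue(unsigned int util)
{
        /* Tag the saved value: no util_avg update seen since it was stored */
        util_est_enqueued = util | UTIL_AVG_UNCHANGED;
}

static void pelt_updated_util_avg(void)
{
        /* Mirrors cfs_se_util_change(): clear the flag once, skip the store otherwise */
        if (util_est_enqueued & UTIL_AVG_UNCHANGED)
                util_est_enqueued &= ~UTIL_AVG_UNCHANGED;
}

int main(void)
{
        save_util_at_dequeue(434);      /* 434 is stored as 435: flag set in the LSB */
        printf("after dequeue:     %u (unchanged=%u)\n",
               util_est_enqueued, util_est_enqueued & UTIL_AVG_UNCHANGED);

        pelt_updated_util_avg();        /* first PELT update clears the LSB */
        printf("after PELT update: %u (unchanged=%u)\n",
               util_est_enqueued, util_est_enqueued & UTIL_AVG_UNCHANGED);
        return 0;
}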
@@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
3296 cfs_rq->curr == se)) { 3319 cfs_rq->curr == se)) {
3297 3320
3298 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3321 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3322 cfs_se_util_change(&se->avg);
3299 return 1; 3323 return 1;
3300 } 3324 }
3301 3325
@@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3350} 3374}
3351 3375
3352/* 3376/*
3353 * Called within set_task_rq() right before setting a task's cpu. The 3377 * Called within set_task_rq() right before setting a task's CPU. The
3354 * caller only guarantees p->pi_lock is held; no other assumptions, 3378 * caller only guarantees p->pi_lock is held; no other assumptions,
3355 * including the state of rq->lock, should be made. 3379 * including the state of rq->lock, should be made.
3356 */ 3380 */
@@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3529 3553
3530 /* 3554 /*
3531 * runnable_sum can't be lower than running_sum 3555 * runnable_sum can't be lower than running_sum
3532 * As running sum is scale with cpu capacity wehreas the runnable sum 3556 * As running sum is scale with CPU capacity wehreas the runnable sum
3533 * is not we rescale running_sum 1st 3557 * is not we rescale running_sum 1st
3534 */ 3558 */
3535 running_sum = se->avg.util_sum / 3559 running_sum = se->avg.util_sum /
@@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3689#endif 3713#endif
3690 3714
3691 if (decayed) 3715 if (decayed)
3692 cfs_rq_util_change(cfs_rq); 3716 cfs_rq_util_change(cfs_rq, 0);
3693 3717
3694 return decayed; 3718 return decayed;
3695} 3719}
@@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3702 * Must call update_cfs_rq_load_avg() before this, since we rely on 3726 * Must call update_cfs_rq_load_avg() before this, since we rely on
3703 * cfs_rq->avg.last_update_time being current. 3727 * cfs_rq->avg.last_update_time being current.
3704 */ 3728 */
3705static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3729static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3706{ 3730{
3707 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; 3731 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3708 3732
@@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3738 3762
3739 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3763 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3740 3764
3741 cfs_rq_util_change(cfs_rq); 3765 cfs_rq_util_change(cfs_rq, flags);
3742} 3766}
3743 3767
3744/** 3768/**
@@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3757 3781
3758 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3782 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3759 3783
3760 cfs_rq_util_change(cfs_rq); 3784 cfs_rq_util_change(cfs_rq, 0);
3761} 3785}
3762 3786
3763/* 3787/*
@@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3787 3811
3788 if (!se->avg.last_update_time && (flags & DO_ATTACH)) { 3812 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3789 3813
3790 attach_entity_load_avg(cfs_rq, se); 3814 /*
3815 * DO_ATTACH means we're here from enqueue_entity().
3816 * !last_update_time means we've passed through
3817 * migrate_task_rq_fair() indicating we migrated.
3818 *
3819 * IOW we're enqueueing a task on a new CPU.
3820 */
3821 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3791 update_tg_load_avg(cfs_rq, 0); 3822 update_tg_load_avg(cfs_rq, 0);
3792 3823
3793 } else if (decayed && (flags & UPDATE_TG)) 3824 } else if (decayed && (flags & UPDATE_TG))
@@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3869 3900
3870static int idle_balance(struct rq *this_rq, struct rq_flags *rf); 3901static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3871 3902
3903static inline unsigned long task_util(struct task_struct *p)
3904{
3905 return READ_ONCE(p->se.avg.util_avg);
3906}
3907
3908static inline unsigned long _task_util_est(struct task_struct *p)
3909{
3910 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3911
3912 return max(ue.ewma, ue.enqueued);
3913}
3914
3915static inline unsigned long task_util_est(struct task_struct *p)
3916{
3917 return max(task_util(p), _task_util_est(p));
3918}
3919
3920static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3921 struct task_struct *p)
3922{
3923 unsigned int enqueued;
3924
3925 if (!sched_feat(UTIL_EST))
3926 return;
3927
3928 /* Update root cfs_rq's estimated utilization */
3929 enqueued = cfs_rq->avg.util_est.enqueued;
3930 enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3931 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3932}
3933
3934/*
3935 * Check if a (signed) value is within a specified (unsigned) margin,
3936 * based on the observation that:
3937 *
3938 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3939 *
3940  * NOTE: this only works when value + margin < INT_MAX.
3941 */
3942static inline bool within_margin(int value, int margin)
3943{
3944 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3945}
3946
3947static void
3948util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3949{
3950 long last_ewma_diff;
3951 struct util_est ue;
3952
3953 if (!sched_feat(UTIL_EST))
3954 return;
3955
3956 /*
3957 * Update root cfs_rq's estimated utilization
3958 *
3959 * If *p is the last task then the root cfs_rq's estimated utilization
3960 * of a CPU is 0 by definition.
3961 */
3962 ue.enqueued = 0;
3963 if (cfs_rq->nr_running) {
3964 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3965 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3966 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3967 }
3968 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3969
3970 /*
3971 * Skip update of task's estimated utilization when the task has not
3972 * yet completed an activation, e.g. being migrated.
3973 */
3974 if (!task_sleep)
3975 return;
3976
3977 /*
3978 * If the PELT values haven't changed since enqueue time,
3979 * skip the util_est update.
3980 */
3981 ue = p->se.avg.util_est;
3982 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3983 return;
3984
3985 /*
3986 * Skip update of task's estimated utilization when its EWMA is
3987 * already ~1% close to its last activation value.
3988 */
3989 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3990 last_ewma_diff = ue.enqueued - ue.ewma;
3991 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3992 return;
3993
3994 /*
3995 * Update Task's estimated utilization
3996 *
3997 * When *p completes an activation we can consolidate another sample
3998 * of the task size. This is done by storing the current PELT value
3999 * as ue.enqueued and by using this value to update the Exponential
4000 * Weighted Moving Average (EWMA):
4001 *
4002 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4003 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4004 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4005 * = w * ( last_ewma_diff ) + ewma(t-1)
4006 * = w * (last_ewma_diff + ewma(t-1) / w)
4007 *
4008 * Where 'w' is the weight of new samples, which is configured to be
4009 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4010 */
4011 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4012 ue.ewma += last_ewma_diff;
4013 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4014 WRITE_ONCE(p->se.avg.util_est, ue);
4015}
4016
3872#else /* CONFIG_SMP */ 4017#else /* CONFIG_SMP */
3873 4018
3874static inline int 4019static inline int
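
util_est_dequeue() above leans on two small tricks: within_margin(), which does abs(x) < y with a single unsigned compare, and a w = 1/4 EWMA implemented with shifts. A standalone sketch of the dequeue-time consolidation step follows; SCHED_CAPACITY_SCALE (1024) and the shift of 2 mirror the values implied by the comments in the hunk, while the function and variable names are local to the example.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024
#define UTIL_EST_WEIGHT_SHIFT   2       /* w = 1/4 */

/* abs(value) < margin, done with a single unsigned comparison */
static int within_margin(int value, int margin)
{
        return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

/* One consolidation step: new EWMA for a task that just went to sleep */
static unsigned long util_est_update(unsigned long ewma, unsigned long last_util)
{
        long last_ewma_diff = (long)last_util - (long)ewma;

        /* Skip the update when the sample is within ~1% of capacity scale of the EWMA */
        if (within_margin(last_ewma_diff, SCHED_CAPACITY_SCALE / 100))
                return ewma;

        /* ewma(t) = w * last_util + (1 - w) * ewma(t-1), with w = 1/4 */
        ewma <<= UTIL_EST_WEIGHT_SHIFT;
        ewma += last_ewma_diff;
        ewma >>= UTIL_EST_WEIGHT_SHIFT;
        return ewma;
}

int main(void)
{
        unsigned long ewma = 100;
        unsigned long samples[] = { 400, 400, 400, 104 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                ewma = util_est_update(ewma, samples[i]);
                printf("sample %lu -> ewma %lu\n", samples[i], ewma);
        }
        return 0;
}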
@@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3883 4028
3884static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 4029static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3885{ 4030{
3886 cfs_rq_util_change(cfs_rq); 4031 cfs_rq_util_change(cfs_rq, 0);
3887} 4032}
3888 4033
3889static inline void remove_entity_load_avg(struct sched_entity *se) {} 4034static inline void remove_entity_load_avg(struct sched_entity *se) {}
3890 4035
3891static inline void 4036static inline void
3892attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 4037attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3893static inline void 4038static inline void
3894detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 4039detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3895 4040
@@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3898 return 0; 4043 return 0;
3899} 4044}
3900 4045
4046static inline void
4047util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4048
4049static inline void
4050util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
4051 bool task_sleep) {}
4052
3901#endif /* CONFIG_SMP */ 4053#endif /* CONFIG_SMP */
3902 4054
3903static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 4055static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4676 if (!se) 4828 if (!se)
4677 add_nr_running(rq, task_delta); 4829 add_nr_running(rq, task_delta);
4678 4830
4679 /* determine whether we need to wake up potentially idle cpu */ 4831 /* Determine whether we need to wake up potentially idle CPU: */
4680 if (rq->curr == rq->idle && rq->cfs.nr_running) 4832 if (rq->curr == rq->idle && rq->cfs.nr_running)
4681 resched_curr(rq); 4833 resched_curr(rq);
4682} 4834}
@@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5041} 5193}
5042 5194
5043/* 5195/*
5044 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5196 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5045 * 5197 *
5046 * The race is harmless, since modifying bandwidth settings of unhooked group 5198 * The race is harmless, since modifying bandwidth settings of unhooked group
5047 * bits doesn't do much. 5199 * bits doesn't do much.
@@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5086 */ 5238 */
5087 cfs_rq->runtime_remaining = 1; 5239 cfs_rq->runtime_remaining = 1;
5088 /* 5240 /*
5089 * Offline rq is schedulable till cpu is completely disabled 5241 * Offline rq is schedulable till CPU is completely disabled
5090 * in take_cpu_down(), so we prevent new cfs throttling here. 5242 * in take_cpu_down(), so we prevent new cfs throttling here.
5091 */ 5243 */
5092 cfs_rq->runtime_enabled = 0; 5244 cfs_rq->runtime_enabled = 0;
@@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5245 if (!se) 5397 if (!se)
5246 add_nr_running(rq, 1); 5398 add_nr_running(rq, 1);
5247 5399
5400 util_est_enqueue(&rq->cfs, p);
5248 hrtick_update(rq); 5401 hrtick_update(rq);
5249} 5402}
5250 5403
@@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5304 if (!se) 5457 if (!se)
5305 sub_nr_running(rq, 1); 5458 sub_nr_running(rq, 1);
5306 5459
5460 util_est_dequeue(&rq->cfs, p, task_sleep);
5307 hrtick_update(rq); 5461 hrtick_update(rq);
5308} 5462}
5309 5463
@@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5323 * 5477 *
5324 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5478 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5325 * 5479 *
5326 * If a cpu misses updates for n ticks (as it was idle) and update gets 5480 * If a CPU misses updates for n ticks (as it was idle) and update gets
5327 * called on the n+1-th tick when cpu may be busy, then we have: 5481 * called on the n+1-th tick when CPU may be busy, then we have:
5328 * 5482 *
5329 * load_n = (1 - 1/2^i)^n * load_0 5483 * load_n = (1 - 1/2^i)^n * load_0
5330 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5484 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
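
The comment above gives the closed form for a CPU that missed n ticks: the old load decays by (1 - 1/2^i)^n and the current load is folded in with weight 1/2^i on the tick it finally takes. A floating-point sanity check of those two formulas; the kernel's decay_load_missed() computes the same result with precomputed fixed-point factors rather than pow(), and the load and index values below are invented.

#include <stdio.h>
#include <math.h>       /* link with -lm */

static double decayed_old_load(double load_0, int idx, int missed_ticks)
{
        /* load_n = (1 - 1/2^i)^n * load_0 */
        return load_0 * pow(1.0 - 1.0 / (1 << idx), missed_ticks);
}

int main(void)
{
        double load_0 = 1024.0, cur_load = 256.0;
        int idx = 2;                            /* decay factor (1 - 1/4) per tick */

        for (int n = 1; n <= 4; n++) {
                double load_n = decayed_old_load(load_0, idx, n);
                /* load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load */
                double next = (1.0 - 1.0 / (1 << idx)) * load_n + cur_load / (1 << idx);

                printf("missed %d ticks: decayed old load %.1f, next load %.1f\n",
                       n, load_n, next);
        }
        return 0;
}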
@@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5379 } 5533 }
5380 return load; 5534 return load;
5381} 5535}
5536
5537static struct {
5538 cpumask_var_t idle_cpus_mask;
5539 atomic_t nr_cpus;
5540 int has_blocked; /* Idle CPUS has blocked load */
5541 unsigned long next_balance; /* in jiffy units */
5542 unsigned long next_blocked; /* Next update of blocked load in jiffies */
5543} nohz ____cacheline_aligned;
5544
5382#endif /* CONFIG_NO_HZ_COMMON */ 5545#endif /* CONFIG_NO_HZ_COMMON */
5383 5546
5384/** 5547/**
@@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
5468#ifdef CONFIG_NO_HZ_COMMON 5631#ifdef CONFIG_NO_HZ_COMMON
5469/* 5632/*
5470 * There is no sane way to deal with nohz on smp when using jiffies because the 5633 * There is no sane way to deal with nohz on smp when using jiffies because the
5471 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5634 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5472 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5635 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5473 * 5636 *
5474 * Therefore we need to avoid the delta approach from the regular tick when 5637 * Therefore we need to avoid the delta approach from the regular tick when
@@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq)
5579} 5742}
5580 5743
5581/* 5744/*
5582 * Return a low guess at the load of a migration-source cpu weighted 5745 * Return a low guess at the load of a migration-source CPU weighted
5583 * according to the scheduling class and "nice" value. 5746 * according to the scheduling class and "nice" value.
5584 * 5747 *
5585 * We want to under-estimate the load of migration sources, to 5748 * We want to under-estimate the load of migration sources, to
@@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type)
5597} 5760}
5598 5761
5599/* 5762/*
5600 * Return a high guess at the load of a migration-target cpu weighted 5763 * Return a high guess at the load of a migration-target CPU weighted
5601 * according to the scheduling class and "nice" value. 5764 * according to the scheduling class and "nice" value.
5602 */ 5765 */
5603static unsigned long target_load(int cpu, int type) 5766static unsigned long target_load(int cpu, int type)
@@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5724 unsigned long task_load; 5887 unsigned long task_load;
5725 5888
5726 this_eff_load = target_load(this_cpu, sd->wake_idx); 5889 this_eff_load = target_load(this_cpu, sd->wake_idx);
5727 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5728 5890
5729 if (sync) { 5891 if (sync) {
5730 unsigned long current_load = task_h_load(current); 5892 unsigned long current_load = task_h_load(current);
@@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5742 this_eff_load *= 100; 5904 this_eff_load *= 100;
5743 this_eff_load *= capacity_of(prev_cpu); 5905 this_eff_load *= capacity_of(prev_cpu);
5744 5906
5907 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5745 prev_eff_load -= task_load; 5908 prev_eff_load -= task_load;
5746 if (sched_feat(WA_BIAS)) 5909 if (sched_feat(WA_BIAS))
5747 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5910 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5748 prev_eff_load *= capacity_of(this_cpu); 5911 prev_eff_load *= capacity_of(this_cpu);
5749 5912
5750 return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; 5913 /*
5914 * If sync, adjust the weight of prev_eff_load such that if
5915 * prev_eff == this_eff that select_idle_sibling() will consider
5916 * stacking the wakee on top of the waker if no other CPU is
5917 * idle.
5918 */
5919 if (sync)
5920 prev_eff_load += 1;
5921
5922 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5923}
5924
5925#ifdef CONFIG_NUMA_BALANCING
5926static void
5927update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5928{
5929 unsigned long interval;
5930
5931 if (!static_branch_likely(&sched_numa_balancing))
5932 return;
5933
5934 /* If balancing has no preference then continue gathering data */
5935 if (p->numa_preferred_nid == -1)
5936 return;
5937
5938 /*
5939 * If the wakeup is not affecting locality then it is neutral from
5940  * the perspective of NUMA balancing so continue gathering data.
5941 */
5942 if (cpu_to_node(prev_cpu) == cpu_to_node(target))
5943 return;
5944
5945 /*
5946 * Temporarily prevent NUMA balancing trying to place waker/wakee after
5947 * wakee has been moved by wake_affine. This will potentially allow
5948 * related tasks to converge and update their data placement. The
5949 * 4 * numa_scan_period is to allow the two-pass filter to migrate
5950  * hot data to the waker's node.
5951 */
5952 interval = max(sysctl_numa_balancing_scan_delay,
5953 p->numa_scan_period << 2);
5954 p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5955
5956 interval = max(sysctl_numa_balancing_scan_delay,
5957 current->numa_scan_period << 2);
5958 current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5751} 5959}
5960#else
5961static void
5962update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5963{
5964}
5965#endif
5752 5966
5753static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5967static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5754 int prev_cpu, int sync) 5968 int this_cpu, int prev_cpu, int sync)
5755{ 5969{
5756 int this_cpu = smp_processor_id();
5757 int target = nr_cpumask_bits; 5970 int target = nr_cpumask_bits;
5758 5971
5759 if (sched_feat(WA_IDLE)) 5972 if (sched_feat(WA_IDLE))
@@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5766 if (target == nr_cpumask_bits) 5979 if (target == nr_cpumask_bits)
5767 return prev_cpu; 5980 return prev_cpu;
5768 5981
5982 update_wa_numa_placement(p, prev_cpu, target);
5769 schedstat_inc(sd->ttwu_move_affine); 5983 schedstat_inc(sd->ttwu_move_affine);
5770 schedstat_inc(p->se.statistics.nr_wakeups_affine); 5984 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5771 return target; 5985 return target;
5772} 5986}
5773 5987
5774static inline unsigned long task_util(struct task_struct *p);
5775static unsigned long cpu_util_wake(int cpu, struct task_struct *p); 5988static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
5776 5989
5777static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) 5990static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5826 max_spare_cap = 0; 6039 max_spare_cap = 0;
5827 6040
5828 for_each_cpu(i, sched_group_span(group)) { 6041 for_each_cpu(i, sched_group_span(group)) {
5829 /* Bias balancing toward cpus of our domain */ 6042 /* Bias balancing toward CPUs of our domain */
5830 if (local_group) 6043 if (local_group)
5831 load = source_load(i, load_idx); 6044 load = source_load(i, load_idx);
5832 else 6045 else
@@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5856 if (min_runnable_load > (runnable_load + imbalance)) { 6069 if (min_runnable_load > (runnable_load + imbalance)) {
5857 /* 6070 /*
5858 * The runnable load is significantly smaller 6071 * The runnable load is significantly smaller
5859 * so we can pick this new cpu 6072 * so we can pick this new CPU:
5860 */ 6073 */
5861 min_runnable_load = runnable_load; 6074 min_runnable_load = runnable_load;
5862 min_avg_load = avg_load; 6075 min_avg_load = avg_load;
@@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5865 (100*min_avg_load > imbalance_scale*avg_load)) { 6078 (100*min_avg_load > imbalance_scale*avg_load)) {
5866 /* 6079 /*
5867 * The runnable loads are close so take the 6080 * The runnable loads are close so take the
5868 * blocked load into account through avg_load. 6081 * blocked load into account through avg_load:
5869 */ 6082 */
5870 min_avg_load = avg_load; 6083 min_avg_load = avg_load;
5871 idlest = group; 6084 idlest = group;
@@ -5903,6 +6116,18 @@ skip_spare:
5903 if (!idlest) 6116 if (!idlest)
5904 return NULL; 6117 return NULL;
5905 6118
6119 /*
6120 * When comparing groups across NUMA domains, it's possible for the
6121 * local domain to be very lightly loaded relative to the remote
6122 * domains but "imbalance" skews the comparison making remote CPUs
6123 * look much more favourable. When considering cross-domain, add
6124 * imbalance to the runnable load on the remote node and consider
6125 * staying local.
6126 */
6127 if ((sd->flags & SD_NUMA) &&
6128 min_runnable_load + imbalance >= this_runnable_load)
6129 return NULL;
6130
5906 if (min_runnable_load > (this_runnable_load + imbalance)) 6131 if (min_runnable_load > (this_runnable_load + imbalance))
5907 return NULL; 6132 return NULL;
5908 6133
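/*
 * Illustrative sketch, not part of the patch, condensing the two checks
 * above in find_idlest_group(): the remote group only wins when it is still
 * less loaded after the imbalance penalty is charged to it, and on NUMA
 * domains the local group also wins any near-tie. Names are stand-ins for
 * the local variables.
 */
static int stay_with_local_group(unsigned long this_runnable_load,
				 unsigned long min_runnable_load,
				 unsigned long imbalance, int numa_domain)
{
	if (numa_domain && min_runnable_load + imbalance >= this_runnable_load)
		return 1;	/* remote not clearly better: stay local */

	if (min_runnable_load > this_runnable_load + imbalance)
		return 1;	/* remote group is actually busier: stay local */

	return 0;		/* the idlest remote group wins */
}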
@@ -5914,7 +6139,7 @@ skip_spare:
5914} 6139}
5915 6140
5916/* 6141/*
5917 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 6142 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5918 */ 6143 */
5919static int 6144static int
5920find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 6145find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
5992 6217
5993 new_cpu = find_idlest_group_cpu(group, p, cpu); 6218 new_cpu = find_idlest_group_cpu(group, p, cpu);
5994 if (new_cpu == cpu) { 6219 if (new_cpu == cpu) {
5995 /* Now try balancing at a lower domain level of cpu */ 6220 /* Now try balancing at a lower domain level of 'cpu': */
5996 sd = sd->child; 6221 sd = sd->child;
5997 continue; 6222 continue;
5998 } 6223 }
5999 6224
6000 /* Now try balancing at a lower domain level of new_cpu */ 6225 /* Now try balancing at a lower domain level of 'new_cpu': */
6001 cpu = new_cpu; 6226 cpu = new_cpu;
6002 weight = sd->span_weight; 6227 weight = sd->span_weight;
6003 sd = NULL; 6228 sd = NULL;
@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6007 if (tmp->flags & sd_flag) 6232 if (tmp->flags & sd_flag)
6008 sd = tmp; 6233 sd = tmp;
6009 } 6234 }
6010 /* while loop will break here if sd == NULL */
6011 } 6235 }
6012 6236
6013 return new_cpu; 6237 return new_cpu;
@@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6203 return target; 6427 return target;
6204 6428
6205 /* 6429 /*
6206 * If the previous cpu is cache affine and idle, don't be stupid. 6430 * If the previous CPU is cache affine and idle, don't be stupid:
6207 */ 6431 */
6208 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6432 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6209 return prev; 6433 return prev;
6210 6434
6211 /* Check a recently used CPU as a potential idle candidate */ 6435 /* Check a recently used CPU as a potential idle candidate: */
6212 recent_used_cpu = p->recent_used_cpu; 6436 recent_used_cpu = p->recent_used_cpu;
6213 if (recent_used_cpu != prev && 6437 if (recent_used_cpu != prev &&
6214 recent_used_cpu != target && 6438 recent_used_cpu != target &&
@@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6217 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6441 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6218 /* 6442 /*
6219 * Replace recent_used_cpu with prev as it is a potential 6443 * Replace recent_used_cpu with prev as it is a potential
6220 * candidate for the next wake. 6444 * candidate for the next wake:
6221 */ 6445 */
6222 p->recent_used_cpu = prev; 6446 p->recent_used_cpu = prev;
6223 return recent_used_cpu; 6447 return recent_used_cpu;
@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6242 return target; 6466 return target;
6243} 6467}
6244 6468
6245/* 6469/**
6246 * cpu_util returns the amount of capacity of a CPU that is used by CFS 6470 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
6247 * tasks. The unit of the return value must be the one of capacity so we can 6471 * @cpu: the CPU to get the utilization of
6248 * compare the utilization with the capacity of the CPU that is available for 6472 *
6249 * CFS task (ie cpu_capacity). 6473 * The unit of the return value must be the one of capacity so we can compare
 6474 * the utilization with the capacity of the CPU that is available for CFS tasks
6475 * (ie cpu_capacity).
6250 * 6476 *
6251 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the 6477 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6252 * recent utilization of currently non-runnable tasks on a CPU. It represents 6478 * recent utilization of currently non-runnable tasks on a CPU. It represents
@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6257 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is 6483 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6258 * the running time on this CPU scaled by capacity_curr. 6484 * the running time on this CPU scaled by capacity_curr.
6259 * 6485 *
6486 * The estimated utilization of a CPU is defined to be the maximum between its
6487 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6488 * currently RUNNABLE on that CPU.
 6489 * This allows us to properly represent the expected utilization of a CPU that
 6490 * has just had a big task start running after a long sleep period. At the same
 6491 * time, however, it preserves the benefits of the "blocked utilization" in
6492 * describing the potential for other tasks waking up on the same CPU.
6493 *
6260 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even 6494 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6261 * higher than capacity_orig because of unfortunate rounding in 6495 * higher than capacity_orig because of unfortunate rounding in
6262 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until 6496 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6267 * available capacity. We allow utilization to overshoot capacity_curr (but not 6501 * available capacity. We allow utilization to overshoot capacity_curr (but not
6268 * capacity_orig) as it is useful for predicting the capacity required after task 6502
6269 * migrations (scheduler-driven DVFS). 6503 * migrations (scheduler-driven DVFS).
6504 *
6505 * Return: the (estimated) utilization for the specified CPU
6270 */ 6506 */
6271static unsigned long cpu_util(int cpu) 6507static inline unsigned long cpu_util(int cpu)
6272{ 6508{
6273 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; 6509 struct cfs_rq *cfs_rq;
6274 unsigned long capacity = capacity_orig_of(cpu); 6510 unsigned int util;
6275 6511
6276 return (util >= capacity) ? capacity : util; 6512 cfs_rq = &cpu_rq(cpu)->cfs;
6277} 6513 util = READ_ONCE(cfs_rq->avg.util_avg);
6278 6514
6279static inline unsigned long task_util(struct task_struct *p) 6515 if (sched_feat(UTIL_EST))
6280{ 6516 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6281 return p->se.avg.util_avg; 6517
6518 return min_t(unsigned long, util, capacity_orig_of(cpu));
6282} 6519}
6283 6520
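/*
 * Illustrative sketch, not part of the patch: the estimate computed by
 * cpu_util() above is max(util_avg, util_est.enqueued) when UTIL_EST is
 * enabled, clamped to the CPU's original capacity. Plain scalars stand in
 * for the cfs_rq fields.
 */
static unsigned long cpu_util_sketch(unsigned long util_avg,
				     unsigned long util_est_enqueued,
				     unsigned long capacity_orig,
				     int util_est_enabled)
{
	unsigned long util = util_avg;

	if (util_est_enabled && util_est_enqueued > util)
		util = util_est_enqueued;

	return util < capacity_orig ? util : capacity_orig;
}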
6284/* 6521/*
6285 * cpu_util_wake: Compute cpu utilization with any contributions from 6522 * cpu_util_wake: Compute CPU utilization with any contributions from
6286 * the waking task p removed. 6523 * the waking task p removed.
6287 */ 6524 */
6288static unsigned long cpu_util_wake(int cpu, struct task_struct *p) 6525static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
6289{ 6526{
6290 unsigned long util, capacity; 6527 struct cfs_rq *cfs_rq;
6528 unsigned int util;
6291 6529
6292 /* Task has no contribution or is new */ 6530 /* Task has no contribution or is new */
6293 if (cpu != task_cpu(p) || !p->se.avg.last_update_time) 6531 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6294 return cpu_util(cpu); 6532 return cpu_util(cpu);
6295 6533
6296 capacity = capacity_orig_of(cpu); 6534 cfs_rq = &cpu_rq(cpu)->cfs;
6297 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); 6535 util = READ_ONCE(cfs_rq->avg.util_avg);
6298 6536
6299 return (util >= capacity) ? capacity : util; 6537 /* Discount task's blocked util from CPU's util */
6538 util -= min_t(unsigned int, util, task_util(p));
6539
6540 /*
6541 * Covered cases:
6542 *
6543 * a) if *p is the only task sleeping on this CPU, then:
6544 * cpu_util (== task_util) > util_est (== 0)
6545 * and thus we return:
6546 * cpu_util_wake = (cpu_util - task_util) = 0
6547 *
6548 * b) if other tasks are SLEEPING on this CPU, which is now exiting
6549 * IDLE, then:
6550 * cpu_util >= task_util
6551 * cpu_util > util_est (== 0)
6552 * and thus we discount *p's blocked utilization to return:
6553 * cpu_util_wake = (cpu_util - task_util) >= 0
6554 *
6555 * c) if other tasks are RUNNABLE on that CPU and
6556 * util_est > cpu_util
6557 * then we use util_est since it returns a more restrictive
6558 * estimation of the spare capacity on that CPU, by just
6559 * considering the expected utilization of tasks already
6560 * runnable on that CPU.
6561 *
6562 * Cases a) and b) are covered by the above code, while case c) is
6563 * covered by the following code when estimated utilization is
6564 * enabled.
6565 */
6566 if (sched_feat(UTIL_EST))
6567 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6568
6569 /*
6570 * Utilization (estimated) can exceed the CPU capacity, thus let's
6571 * clamp to the maximum CPU capacity to ensure consistency with
6572 * the cpu_util call.
6573 */
6574 return min_t(unsigned long, util, capacity_orig_of(cpu));
6300} 6575}
6301 6576
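/*
 * Illustrative sketch, not part of the patch, of cpu_util_wake() above and
 * its cases a)-c): subtract the waking task's blocked contribution from
 * util_avg, let util_est take over when it is the larger (more restrictive)
 * estimate, then clamp to the original capacity. Names are stand-ins for
 * the cfs_rq/task fields.
 */
static unsigned long cpu_util_wake_sketch(unsigned long util_avg,
					  unsigned long task_util,
					  unsigned long util_est_enqueued,
					  unsigned long capacity_orig,
					  int util_est_enabled)
{
	unsigned long util;

	/* Discount the task's blocked utilization, saturating at zero. */
	util = util_avg - (task_util < util_avg ? task_util : util_avg);

	if (util_est_enabled && util_est_enqueued > util)
		util = util_est_enqueued;

	return util < capacity_orig ? util : capacity_orig;
}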
6302/* 6577/*
@@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6328 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6603 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6329 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6604 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6330 * 6605 *
6331 * Balances load by selecting the idlest cpu in the idlest group, or under 6606 * Balances load by selecting the idlest CPU in the idlest group, or under
6332 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6607 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6333 * 6608 *
6334 * Returns the target cpu number. 6609 * Returns the target CPU number.
6335 * 6610 *
6336 * preempt must be disabled. 6611 * preempt must be disabled.
6337 */ 6612 */
@@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6342 int cpu = smp_processor_id(); 6617 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu; 6618 int new_cpu = prev_cpu;
6344 int want_affine = 0; 6619 int want_affine = 0;
6345 int sync = wake_flags & WF_SYNC; 6620 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6346 6621
6347 if (sd_flag & SD_BALANCE_WAKE) { 6622 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p); 6623 record_wakee(p);
@@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6356 break; 6631 break;
6357 6632
6358 /* 6633 /*
6359 * If both cpu and prev_cpu are part of this domain, 6634 * If both 'cpu' and 'prev_cpu' are part of this domain,
6360 * cpu is a valid SD_WAKE_AFFINE target. 6635 * cpu is a valid SD_WAKE_AFFINE target.
6361 */ 6636 */
6362 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6637 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6376 if (cpu == prev_cpu) 6651 if (cpu == prev_cpu)
6377 goto pick_cpu; 6652 goto pick_cpu;
6378 6653
6379 new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); 6654 new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
6380 } 6655 }
6381 6656
6382 if (sd && !(sd_flag & SD_BALANCE_FORK)) { 6657 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6407,9 +6682,9 @@ pick_cpu:
6407static void detach_entity_cfs_rq(struct sched_entity *se); 6682static void detach_entity_cfs_rq(struct sched_entity *se);
6408 6683
6409/* 6684/*
6410 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6685 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6411 * cfs_rq_of(p) references at time of call are still valid and identify the 6686 * cfs_rq_of(p) references at time of call are still valid and identify the
6412 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6687 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6413 */ 6688 */
6414static void migrate_task_rq_fair(struct task_struct *p) 6689static void migrate_task_rq_fair(struct task_struct *p)
6415{ 6690{
@@ -6738,7 +7013,7 @@ simple:
6738 7013
6739 p = task_of(se); 7014 p = task_of(se);
6740 7015
6741done: __maybe_unused 7016done: __maybe_unused;
6742#ifdef CONFIG_SMP 7017#ifdef CONFIG_SMP
6743 /* 7018 /*
6744 * Move the next running task to the front of 7019 * Move the next running task to the front of
@@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6843 * BASICS 7118 * BASICS
6844 * 7119 *
6845 * The purpose of load-balancing is to achieve the same basic fairness the 7120 * The purpose of load-balancing is to achieve the same basic fairness the
6846 * per-cpu scheduler provides, namely provide a proportional amount of compute 7121 * per-CPU scheduler provides, namely provide a proportional amount of compute
6847 * time to each task. This is expressed in the following equation: 7122 * time to each task. This is expressed in the following equation:
6848 * 7123 *
6849 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 7124 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6850 * 7125 *
6851 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 7126 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
6852 * W_i,0 is defined as: 7127 * W_i,0 is defined as:
6853 * 7128 *
6854 * W_i,0 = \Sum_j w_i,j (2) 7129 * W_i,0 = \Sum_j w_i,j (2)
6855 * 7130 *
6856 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 7131 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
6857 * is derived from the nice value as per sched_prio_to_weight[]. 7132 * is derived from the nice value as per sched_prio_to_weight[].
6858 * 7133 *
6859 * The weight average is an exponential decay average of the instantaneous 7134 * The weight average is an exponential decay average of the instantaneous
@@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6861 * 7136 *
6862 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 7137 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6863 * 7138 *
6864 * C_i is the compute capacity of cpu i, typically it is the 7139 * C_i is the compute capacity of CPU i, typically it is the
6865 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 7140 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6866 * can also include other factors [XXX]. 7141 * can also include other factors [XXX].
6867 * 7142 *
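/*
 * Illustrative sketch, not part of the patch: equation (3) above written as
 * a single scalar update, W' = ((2^n - 1) * W + W_0) / 2^n. The argument
 * names and the choice of 'n' are purely illustrative.
 */
static unsigned long decayed_weight_avg(unsigned long w_avg,
					unsigned long w_instant,
					unsigned int n)
{
	return (((1UL << n) - 1) * w_avg + w_instant) >> n;
}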
@@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6882 * SCHED DOMAINS 7157 * SCHED DOMAINS
6883 * 7158 *
6884 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 7159 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6885 * for all i,j solution, we create a tree of cpus that follows the hardware 7160 * for all i,j solution, we create a tree of CPUs that follows the hardware
6886 * topology where each level pairs two lower groups (or better). This results 7161 * topology where each level pairs two lower groups (or better). This results
6887 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 7162 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
6888 * tree to only the first of the previous level and we decrease the frequency 7163 * tree to only the first of the previous level and we decrease the frequency
6889 * of load-balance at each level inv. proportional to the number of cpus in 7164 * of load-balance at each level inv. proportional to the number of CPUs in
6890 * the groups. 7165 * the groups.
6891 * 7166 *
6892 * This yields: 7167 * This yields:
@@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6895 * \Sum { --- * --- * 2^i } = O(n) (5) 7170 * \Sum { --- * --- * 2^i } = O(n) (5)
6896 * i = 0 2^i 2^i 7171 * i = 0 2^i 2^i
6897 * `- size of each group 7172 * `- size of each group
6898 * | | `- number of cpus doing load-balance 7173 * | | `- number of CPUs doing load-balance
6899 * | `- freq 7174 * | `- freq
6900 * `- sum over all levels 7175 * `- sum over all levels
6901 * 7176 *
@@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6903 * this makes (5) the runtime complexity of the balancer. 7178 * this makes (5) the runtime complexity of the balancer.
6904 * 7179 *
6905 * An important property here is that each CPU is still (indirectly) connected 7180 * An important property here is that each CPU is still (indirectly) connected
6906 * to every other cpu in at most O(log n) steps: 7181 * to every other CPU in at most O(log n) steps:
6907 * 7182 *
6908 * The adjacency matrix of the resulting graph is given by: 7183 * The adjacency matrix of the resulting graph is given by:
6909 * 7184 *
@@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6915 * 7190 *
6916 * A^(log_2 n)_i,j != 0 for all i,j (7) 7191 * A^(log_2 n)_i,j != 0 for all i,j (7)
6917 * 7192 *
6918 * Showing there's indeed a path between every cpu in at most O(log n) steps. 7193 * Showing there's indeed a path between every CPU in at most O(log n) steps.
6919 * The task movement gives a factor of O(m), giving a convergence complexity 7194 * The task movement gives a factor of O(m), giving a convergence complexity
6920 * of: 7195 * of:
6921 * 7196 *
@@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6925 * WORK CONSERVING 7200 * WORK CONSERVING
6926 * 7201 *
6927 * In order to avoid CPUs going idle while there's still work to do, new idle 7202 * In order to avoid CPUs going idle while there's still work to do, new idle
6928 * balancing is more aggressive and has the newly idle cpu iterate up the domain 7203 * balancing is more aggressive and has the newly idle CPU iterate up the domain
6929 * tree itself instead of relying on other CPUs to bring it work. 7204 * tree itself instead of relying on other CPUs to bring it work.
6930 * 7205 *
6931 * This adds some complexity to both (5) and (8) but it reduces the total idle 7206 * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6946 * 7221 *
6947 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 7222 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6948 * 7223 *
6949 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 7224 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
6950 * 7225 *
6951 * The big problem is S_k, it's a global sum needed to compute a local (W_i) 7226
6952 * property. 7227 * property.
@@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all };
6963#define LBF_NEED_BREAK 0x02 7238#define LBF_NEED_BREAK 0x02
6964#define LBF_DST_PINNED 0x04 7239#define LBF_DST_PINNED 0x04
6965#define LBF_SOME_PINNED 0x08 7240#define LBF_SOME_PINNED 0x08
7241#define LBF_NOHZ_STATS 0x10
7242#define LBF_NOHZ_AGAIN 0x20
6966 7243
6967struct lb_env { 7244struct lb_env {
6968 struct sched_domain *sd; 7245 struct sched_domain *sd;
@@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7110 env->flags |= LBF_SOME_PINNED; 7387 env->flags |= LBF_SOME_PINNED;
7111 7388
7112 /* 7389 /*
7113 * Remember if this task can be migrated to any other cpu in 7390 * Remember if this task can be migrated to any other CPU in
7114 * our sched_group. We may want to revisit it if we couldn't 7391 * our sched_group. We may want to revisit it if we couldn't
7115 * meet load balance goals by pulling other tasks on src_cpu. 7392 * meet load balance goals by pulling other tasks on src_cpu.
7116 * 7393 *
@@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7120 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7397 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7121 return 0; 7398 return 0;
7122 7399
7123 /* Prevent to re-select dst_cpu via env's cpus */ 7400 /* Prevent re-selecting dst_cpu via env's CPUs: */
7124 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7401 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7125 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7402 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7126 env->flags |= LBF_DST_PINNED; 7403 env->flags |= LBF_DST_PINNED;
@@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env)
7347 rq_unlock(env->dst_rq, &rf); 7624 rq_unlock(env->dst_rq, &rf);
7348} 7625}
7349 7626
7627static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7628{
7629 if (cfs_rq->avg.load_avg)
7630 return true;
7631
7632 if (cfs_rq->avg.util_avg)
7633 return true;
7634
7635 return false;
7636}
7637
7350#ifdef CONFIG_FAIR_GROUP_SCHED 7638#ifdef CONFIG_FAIR_GROUP_SCHED
7351 7639
7352static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7640static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu)
7371 struct rq *rq = cpu_rq(cpu); 7659 struct rq *rq = cpu_rq(cpu);
7372 struct cfs_rq *cfs_rq, *pos; 7660 struct cfs_rq *cfs_rq, *pos;
7373 struct rq_flags rf; 7661 struct rq_flags rf;
7662 bool done = true;
7374 7663
7375 rq_lock_irqsave(rq, &rf); 7664 rq_lock_irqsave(rq, &rf);
7376 update_rq_clock(rq); 7665 update_rq_clock(rq);
@@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu)
7400 */ 7689 */
7401 if (cfs_rq_is_decayed(cfs_rq)) 7690 if (cfs_rq_is_decayed(cfs_rq))
7402 list_del_leaf_cfs_rq(cfs_rq); 7691 list_del_leaf_cfs_rq(cfs_rq);
7692
7693 /* Don't need periodic decay once load/util_avg are null */
7694 if (cfs_rq_has_blocked(cfs_rq))
7695 done = false;
7403 } 7696 }
7697
7698#ifdef CONFIG_NO_HZ_COMMON
7699 rq->last_blocked_load_update_tick = jiffies;
7700 if (done)
7701 rq->has_blocked_load = 0;
7702#endif
7404 rq_unlock_irqrestore(rq, &rf); 7703 rq_unlock_irqrestore(rq, &rf);
7405} 7704}
7406 7705
@@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu)
7460 rq_lock_irqsave(rq, &rf); 7759 rq_lock_irqsave(rq, &rf);
7461 update_rq_clock(rq); 7760 update_rq_clock(rq);
7462 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7761 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7762#ifdef CONFIG_NO_HZ_COMMON
7763 rq->last_blocked_load_update_tick = jiffies;
7764 if (!cfs_rq_has_blocked(cfs_rq))
7765 rq->has_blocked_load = 0;
7766#endif
7463 rq_unlock_irqrestore(rq, &rf); 7767 rq_unlock_irqrestore(rq, &rf);
7464} 7768}
7465 7769
@@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7694 * Group imbalance indicates (and tries to solve) the problem where balancing 7998 * Group imbalance indicates (and tries to solve) the problem where balancing
7695 * groups is inadequate due to ->cpus_allowed constraints. 7999 * groups is inadequate due to ->cpus_allowed constraints.
7696 * 8000 *
7697 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 8001 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7698 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 8002 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7699 * Something like: 8003 * Something like:
7700 * 8004 *
7701 * { 0 1 2 3 } { 4 5 6 7 } 8005 * { 0 1 2 3 } { 4 5 6 7 }
@@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7703 * 8007 *
7704 * If we were to balance group-wise we'd place two tasks in the first group and 8008 * If we were to balance group-wise we'd place two tasks in the first group and
7705 * two tasks in the second group. Clearly this is undesired as it will overload 8009 * two tasks in the second group. Clearly this is undesired as it will overload
7706 * cpu 3 and leave one of the cpus in the second group unused. 8010 * cpu 3 and leave one of the CPUs in the second group unused.
7707 * 8011 *
7708 * The current solution to this issue is detecting the skew in the first group 8012 * The current solution to this issue is detecting the skew in the first group
7709 * by noticing the lower domain failed to reach balance and had difficulty 8013 * by noticing the lower domain failed to reach balance and had difficulty
@@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group,
7794 return group_other; 8098 return group_other;
7795} 8099}
7796 8100
8101static bool update_nohz_stats(struct rq *rq, bool force)
8102{
8103#ifdef CONFIG_NO_HZ_COMMON
8104 unsigned int cpu = rq->cpu;
8105
8106 if (!rq->has_blocked_load)
8107 return false;
8108
8109 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8110 return false;
8111
8112 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8113 return true;
8114
8115 update_blocked_averages(cpu);
8116
8117 return rq->has_blocked_load;
8118#else
8119 return false;
8120#endif
8121}
8122
7797/** 8123/**
7798 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 8124 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
7799 * @env: The load balancing environment. 8125 * @env: The load balancing environment.
@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7816 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8142 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7817 struct rq *rq = cpu_rq(i); 8143 struct rq *rq = cpu_rq(i);
7818 8144
7819 /* Bias balancing toward cpus of our domain */ 8145 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8146 env->flags |= LBF_NOHZ_AGAIN;
8147
8148 /* Bias balancing toward CPUs of our domain: */
7820 if (local_group) 8149 if (local_group)
7821 load = target_load(i, load_idx); 8150 load = target_load(i, load_idx);
7822 else 8151 else
@@ -7902,7 +8231,7 @@ asym_packing:
7902 if (!(env->sd->flags & SD_ASYM_PACKING)) 8231 if (!(env->sd->flags & SD_ASYM_PACKING))
7903 return true; 8232 return true;
7904 8233
7905 /* No ASYM_PACKING if target cpu is already busy */ 8234 /* No ASYM_PACKING if target CPU is already busy */
7906 if (env->idle == CPU_NOT_IDLE) 8235 if (env->idle == CPU_NOT_IDLE)
7907 return true; 8236 return true;
7908 /* 8237 /*
@@ -7915,7 +8244,7 @@ asym_packing:
7915 if (!sds->busiest) 8244 if (!sds->busiest)
7916 return true; 8245 return true;
7917 8246
7918 /* Prefer to move from lowest priority cpu's work */ 8247 /* Prefer to move from lowest priority CPU's work */
7919 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 8248 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7920 sg->asym_prefer_cpu)) 8249 sg->asym_prefer_cpu))
7921 return true; 8250 return true;
@@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7971 if (child && child->flags & SD_PREFER_SIBLING) 8300 if (child && child->flags & SD_PREFER_SIBLING)
7972 prefer_sibling = 1; 8301 prefer_sibling = 1;
7973 8302
8303#ifdef CONFIG_NO_HZ_COMMON
8304 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8305 env->flags |= LBF_NOHZ_STATS;
8306#endif
8307
7974 load_idx = get_sd_load_idx(env->sd, env->idle); 8308 load_idx = get_sd_load_idx(env->sd, env->idle);
7975 8309
7976 do { 8310 do {
@@ -8024,6 +8358,15 @@ next_group:
8024 sg = sg->next; 8358 sg = sg->next;
8025 } while (sg != env->sd->groups); 8359 } while (sg != env->sd->groups);
8026 8360
8361#ifdef CONFIG_NO_HZ_COMMON
8362 if ((env->flags & LBF_NOHZ_AGAIN) &&
8363 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8364
8365 WRITE_ONCE(nohz.next_blocked,
8366 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8367 }
8368#endif
8369
8027 if (env->sd->flags & SD_NUMA) 8370 if (env->sd->flags & SD_NUMA)
8028 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 8371 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8029 8372
@@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8168 if (busiest->group_type == group_imbalanced) { 8511 if (busiest->group_type == group_imbalanced) {
8169 /* 8512 /*
8170 * In the group_imb case we cannot rely on group-wide averages 8513 * In the group_imb case we cannot rely on group-wide averages
8171 * to ensure cpu-load equilibrium, look at wider averages. XXX 8514 * to ensure CPU-load equilibrium, look at wider averages. XXX
8172 */ 8515 */
8173 busiest->load_per_task = 8516 busiest->load_per_task =
8174 min(busiest->load_per_task, sds->avg_load); 8517 min(busiest->load_per_task, sds->avg_load);
@@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8187 } 8530 }
8188 8531
8189 /* 8532 /*
8190 * If there aren't any idle cpus, avoid creating some. 8533 * If there aren't any idle CPUs, avoid creating some.
8191 */ 8534 */
8192 if (busiest->group_type == group_overloaded && 8535 if (busiest->group_type == group_overloaded &&
8193 local->group_type == group_overloaded) { 8536 local->group_type == group_overloaded) {
@@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8201 } 8544 }
8202 8545
8203 /* 8546 /*
8204 * We're trying to get all the cpus to the average_load, so we don't 8547 * We're trying to get all the CPUs to the average_load, so we don't
8205 * want to push ourselves above the average load, nor do we wish to 8548 * want to push ourselves above the average load, nor do we wish to
8206 * reduce the max loaded cpu below the average load. At the same time, 8549 * reduce the max loaded CPU below the average load. At the same time,
8207 * we also don't want to reduce the group load below the group 8550 * we also don't want to reduce the group load below the group
8208 * capacity. Thus we look for the minimum possible imbalance. 8551 * capacity. Thus we look for the minimum possible imbalance.
8209 */ 8552 */
@@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8297 8640
8298 if (env->idle == CPU_IDLE) { 8641 if (env->idle == CPU_IDLE) {
8299 /* 8642 /*
8300 * This cpu is idle. If the busiest group is not overloaded 8643 * This CPU is idle. If the busiest group is not overloaded
8301 * and there is no imbalance between this and busiest group 8644 * and there is no imbalance between this and busiest group
8302 * wrt idle cpus, it is balanced. The imbalance becomes 8645 * wrt idle CPUs, it is balanced. The imbalance becomes
8303 * significant if the diff is greater than 1 otherwise we 8646 * significant if the diff is greater than 1 otherwise we
8304 * might end up to just move the imbalance on another group 8647 * might end up to just move the imbalance on another group
8305 */ 8648 */
@@ -8327,7 +8670,7 @@ out_balanced:
8327} 8670}
8328 8671
8329/* 8672/*
8330 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8673 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8331 */ 8674 */
8332static struct rq *find_busiest_queue(struct lb_env *env, 8675static struct rq *find_busiest_queue(struct lb_env *env,
8333 struct sched_group *group) 8676 struct sched_group *group)
@@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8371 8714
8372 /* 8715 /*
8373 * When comparing with imbalance, use weighted_cpuload() 8716 * When comparing with imbalance, use weighted_cpuload()
8374 * which is not scaled with the cpu capacity. 8717 * which is not scaled with the CPU capacity.
8375 */ 8718 */
8376 8719
8377 if (rq->nr_running == 1 && wl > env->imbalance && 8720 if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8379 continue; 8722 continue;
8380 8723
8381 /* 8724 /*
8382 * For the load comparisons with the other cpu's, consider 8725 * For the load comparisons with the other CPU's, consider
8383 * the weighted_cpuload() scaled with the cpu capacity, so 8726 * the weighted_cpuload() scaled with the CPU capacity, so
8384 * that the load can be moved away from the cpu that is 8727 * that the load can be moved away from the CPU that is
8385 * potentially running at a lower capacity. 8728 * potentially running at a lower capacity.
8386 * 8729 *
8387 * Thus we're looking for max(wl_i / capacity_i), crosswise 8730 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env)
8452 return 0; 8795 return 0;
8453 8796
8454 /* 8797 /*
8455 * In the newly idle case, we will allow all the cpu's 8798 * In the newly idle case, we will allow all the CPUs
8456 * to do the newly idle load balance. 8799 * to do the newly idle load balance.
8457 */ 8800 */
8458 if (env->idle == CPU_NEWLY_IDLE) 8801 if (env->idle == CPU_NEWLY_IDLE)
8459 return 1; 8802 return 1;
8460 8803
8461 /* Try to find first idle cpu */ 8804 /* Try to find first idle CPU */
8462 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8805 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8463 if (!idle_cpu(cpu)) 8806 if (!idle_cpu(cpu))
8464 continue; 8807 continue;
@@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env)
8471 balance_cpu = group_balance_cpu(sg); 8814 balance_cpu = group_balance_cpu(sg);
8472 8815
8473 /* 8816 /*
8474 * First idle cpu or the first cpu(busiest) in this sched group 8817 * First idle CPU or the first CPU(busiest) in this sched group
8475 * is eligible for doing load balancing at this and above domains. 8818 * is eligible for doing load balancing at this and above domains.
8476 */ 8819 */
8477 return balance_cpu == env->dst_cpu; 8820 return balance_cpu == env->dst_cpu;
@@ -8580,7 +8923,7 @@ more_balance:
8580 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8923 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8581 * us and move them to an alternate dst_cpu in our sched_group 8924 * us and move them to an alternate dst_cpu in our sched_group
8582 * where they can run. The upper limit on how many times we 8925 * where they can run. The upper limit on how many times we
8583 * iterate on same src_cpu is dependent on number of cpus in our 8926 * iterate on same src_cpu is dependent on number of CPUs in our
8584 * sched_group. 8927 * sched_group.
8585 * 8928 *
8586 * This changes load balance semantics a bit on who can move 8929 * This changes load balance semantics a bit on who can move
@@ -8597,7 +8940,7 @@ more_balance:
8597 */ 8940 */
8598 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8941 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8599 8942
8600 /* Prevent to re-select dst_cpu via env's cpus */ 8943 /* Prevent re-selecting dst_cpu via env's CPUs */
8601 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8944 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8602 8945
8603 env.dst_rq = cpu_rq(env.new_dst_cpu); 8946 env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8659,9 +9002,10 @@ more_balance:
8659 9002
8660 raw_spin_lock_irqsave(&busiest->lock, flags); 9003 raw_spin_lock_irqsave(&busiest->lock, flags);
8661 9004
8662 /* don't kick the active_load_balance_cpu_stop, 9005 /*
8663 * if the curr task on busiest cpu can't be 9006 * Don't kick the active_load_balance_cpu_stop,
8664 * moved to this_cpu 9007 * if the curr task on busiest CPU can't be
9008 * moved to this_cpu:
8665 */ 9009 */
8666 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 9010 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8667 raw_spin_unlock_irqrestore(&busiest->lock, 9011 raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
8773} 9117}
8774 9118
8775/* 9119/*
8776 * idle_balance is called by schedule() if this_cpu is about to become 9120 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
8777 * idle. Attempts to pull tasks from other CPUs.
8778 */
8779static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
8780{
8781 unsigned long next_balance = jiffies + HZ;
8782 int this_cpu = this_rq->cpu;
8783 struct sched_domain *sd;
8784 int pulled_task = 0;
8785 u64 curr_cost = 0;
8786
8787 /*
8788 * We must set idle_stamp _before_ calling idle_balance(), such that we
8789 * measure the duration of idle_balance() as idle time.
8790 */
8791 this_rq->idle_stamp = rq_clock(this_rq);
8792
8793 /*
8794 * Do not pull tasks towards !active CPUs...
8795 */
8796 if (!cpu_active(this_cpu))
8797 return 0;
8798
8799 /*
8800 * This is OK, because current is on_cpu, which avoids it being picked
8801 * for load-balance and preemption/IRQs are still disabled avoiding
8802 * further scheduler activity on it and we're being very careful to
8803 * re-start the picking loop.
8804 */
8805 rq_unpin_lock(this_rq, rf);
8806
8807 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
8808 !this_rq->rd->overload) {
8809 rcu_read_lock();
8810 sd = rcu_dereference_check_sched_domain(this_rq->sd);
8811 if (sd)
8812 update_next_balance(sd, &next_balance);
8813 rcu_read_unlock();
8814
8815 goto out;
8816 }
8817
8818 raw_spin_unlock(&this_rq->lock);
8819
8820 update_blocked_averages(this_cpu);
8821 rcu_read_lock();
8822 for_each_domain(this_cpu, sd) {
8823 int continue_balancing = 1;
8824 u64 t0, domain_cost;
8825
8826 if (!(sd->flags & SD_LOAD_BALANCE))
8827 continue;
8828
8829 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
8830 update_next_balance(sd, &next_balance);
8831 break;
8832 }
8833
8834 if (sd->flags & SD_BALANCE_NEWIDLE) {
8835 t0 = sched_clock_cpu(this_cpu);
8836
8837 pulled_task = load_balance(this_cpu, this_rq,
8838 sd, CPU_NEWLY_IDLE,
8839 &continue_balancing);
8840
8841 domain_cost = sched_clock_cpu(this_cpu) - t0;
8842 if (domain_cost > sd->max_newidle_lb_cost)
8843 sd->max_newidle_lb_cost = domain_cost;
8844
8845 curr_cost += domain_cost;
8846 }
8847
8848 update_next_balance(sd, &next_balance);
8849
8850 /*
8851 * Stop searching for tasks to pull if there are
8852 * now runnable tasks on this rq.
8853 */
8854 if (pulled_task || this_rq->nr_running > 0)
8855 break;
8856 }
8857 rcu_read_unlock();
8858
8859 raw_spin_lock(&this_rq->lock);
8860
8861 if (curr_cost > this_rq->max_idle_balance_cost)
8862 this_rq->max_idle_balance_cost = curr_cost;
8863
8864 /*
8865 * While browsing the domains, we released the rq lock, a task could
8866 * have been enqueued in the meantime. Since we're not going idle,
8867 * pretend we pulled a task.
8868 */
8869 if (this_rq->cfs.h_nr_running && !pulled_task)
8870 pulled_task = 1;
8871
8872out:
8873 /* Move the next balance forward */
8874 if (time_after(this_rq->next_balance, next_balance))
8875 this_rq->next_balance = next_balance;
8876
8877 /* Is there a task of a high priority class? */
8878 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
8879 pulled_task = -1;
8880
8881 if (pulled_task)
8882 this_rq->idle_stamp = 0;
8883
8884 rq_repin_lock(this_rq, rf);
8885
8886 return pulled_task;
8887}
8888
8889/*
8890 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
8891 * running tasks off the busiest CPU onto idle CPUs. It requires at 9121 * running tasks off the busiest CPU onto idle CPUs. It requires at
8892 * least 1 task to be running on each physical CPU where possible, and 9122 * least 1 task to be running on each physical CPU where possible, and
8893 * avoids physical / logical imbalances. 9123 * avoids physical / logical imbalances.
@@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data)
8911 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 9141 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8912 goto out_unlock; 9142 goto out_unlock;
8913 9143
8914 /* make sure the requested cpu hasn't gone down in the meantime */ 9144 /* Make sure the requested CPU hasn't gone down in the meantime: */
8915 if (unlikely(busiest_cpu != smp_processor_id() || 9145 if (unlikely(busiest_cpu != smp_processor_id() ||
8916 !busiest_rq->active_balance)) 9146 !busiest_rq->active_balance))
8917 goto out_unlock; 9147 goto out_unlock;
@@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data)
8923 /* 9153 /*
8924 * This condition is "impossible", if it occurs 9154 * This condition is "impossible", if it occurs
8925 * we need to fix it. Originally reported by 9155 * we need to fix it. Originally reported by
8926 * Bjorn Helgaas on a 128-cpu setup. 9156 * Bjorn Helgaas on a 128-CPU setup.
8927 */ 9157 */
8928 BUG_ON(busiest_rq == target_rq); 9158 BUG_ON(busiest_rq == target_rq);
8929 9159
@@ -8977,141 +9207,6 @@ out_unlock:
8977 return 0; 9207 return 0;
8978} 9208}
8979 9209
8980static inline int on_null_domain(struct rq *rq)
8981{
8982 return unlikely(!rcu_dereference_sched(rq->sd));
8983}
8984
8985#ifdef CONFIG_NO_HZ_COMMON
8986/*
8987 * idle load balancing details
8988 * - When one of the busy CPUs notice that there may be an idle rebalancing
8989 * needed, they will kick the idle load balancer, which then does idle
8990 * load balancing for all the idle CPUs.
8991 */
8992static struct {
8993 cpumask_var_t idle_cpus_mask;
8994 atomic_t nr_cpus;
8995 unsigned long next_balance; /* in jiffy units */
8996} nohz ____cacheline_aligned;
8997
8998static inline int find_new_ilb(void)
8999{
9000 int ilb = cpumask_first(nohz.idle_cpus_mask);
9001
9002 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9003 return ilb;
9004
9005 return nr_cpu_ids;
9006}
9007
9008/*
9009 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9010 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9011 * CPU (if there is one).
9012 */
9013static void nohz_balancer_kick(void)
9014{
9015 int ilb_cpu;
9016
9017 nohz.next_balance++;
9018
9019 ilb_cpu = find_new_ilb();
9020
9021 if (ilb_cpu >= nr_cpu_ids)
9022 return;
9023
9024 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
9025 return;
9026 /*
9027 * Use smp_send_reschedule() instead of resched_cpu().
9028 * This way we generate a sched IPI on the target cpu which
9029 * is idle. And the softirq performing nohz idle load balance
9030 * will be run before returning from the IPI.
9031 */
9032 smp_send_reschedule(ilb_cpu);
9033 return;
9034}
9035
9036void nohz_balance_exit_idle(unsigned int cpu)
9037{
9038 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
9039 /*
9040 * Completely isolated CPUs don't ever set, so we must test.
9041 */
9042 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9043 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9044 atomic_dec(&nohz.nr_cpus);
9045 }
9046 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9047 }
9048}
9049
9050static inline void set_cpu_sd_state_busy(void)
9051{
9052 struct sched_domain *sd;
9053 int cpu = smp_processor_id();
9054
9055 rcu_read_lock();
9056 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9057
9058 if (!sd || !sd->nohz_idle)
9059 goto unlock;
9060 sd->nohz_idle = 0;
9061
9062 atomic_inc(&sd->shared->nr_busy_cpus);
9063unlock:
9064 rcu_read_unlock();
9065}
9066
9067void set_cpu_sd_state_idle(void)
9068{
9069 struct sched_domain *sd;
9070 int cpu = smp_processor_id();
9071
9072 rcu_read_lock();
9073 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9074
9075 if (!sd || sd->nohz_idle)
9076 goto unlock;
9077 sd->nohz_idle = 1;
9078
9079 atomic_dec(&sd->shared->nr_busy_cpus);
9080unlock:
9081 rcu_read_unlock();
9082}
9083
9084/*
9085 * This routine will record that the cpu is going idle with tick stopped.
9086 * This info will be used in performing idle load balancing in the future.
9087 */
9088void nohz_balance_enter_idle(int cpu)
9089{
9090 /*
9091 * If this cpu is going down, then nothing needs to be done.
9092 */
9093 if (!cpu_active(cpu))
9094 return;
9095
9096 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9097 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9098 return;
9099
9100 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9101 return;
9102
9103 /*
9104 * If we're a completely isolated CPU, we don't play.
9105 */
9106 if (on_null_domain(cpu_rq(cpu)))
9107 return;
9108
9109 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9110 atomic_inc(&nohz.nr_cpus);
9111 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9112}
9113#endif
9114
9115static DEFINE_SPINLOCK(balancing); 9210static DEFINE_SPINLOCK(balancing);
9116 9211
9117/* 9212/*
@@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9141 int need_serialize, need_decay = 0; 9236 int need_serialize, need_decay = 0;
9142 u64 max_cost = 0; 9237 u64 max_cost = 0;
9143 9238
9144 update_blocked_averages(cpu);
9145
9146 rcu_read_lock(); 9239 rcu_read_lock();
9147 for_each_domain(cpu, sd) { 9240 for_each_domain(cpu, sd) {
9148 /* 9241 /*
@@ -9232,68 +9325,56 @@ out:
9232 } 9325 }
9233} 9326}
9234 9327
9328static inline int on_null_domain(struct rq *rq)
9329{
9330 return unlikely(!rcu_dereference_sched(rq->sd));
9331}
9332
9235#ifdef CONFIG_NO_HZ_COMMON 9333#ifdef CONFIG_NO_HZ_COMMON
9236/* 9334/*
9237 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9335 * idle load balancing details
9238 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9336 * - When one of the busy CPUs notices that there may be an idle rebalancing
9337 * needed, they will kick the idle load balancer, which then does idle
9338 * load balancing for all the idle CPUs.
9239 */ 9339 */
9240static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9241{
9242 int this_cpu = this_rq->cpu;
9243 struct rq *rq;
9244 int balance_cpu;
9245 /* Earliest time when we have to do rebalance again */
9246 unsigned long next_balance = jiffies + 60*HZ;
9247 int update_next_balance = 0;
9248 9340
9249 if (idle != CPU_IDLE || 9341static inline int find_new_ilb(void)
9250 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 9342{
9251 goto end; 9343 int ilb = cpumask_first(nohz.idle_cpus_mask);
9252 9344
9253 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 9345 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9254 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) 9346 return ilb;
9255 continue;
9256 9347
9257 /* 9348 return nr_cpu_ids;
9258 * If this cpu gets work to do, stop the load balancing 9349}
9259 * work being done for other cpus. Next load
9260 * balancing owner will pick it up.
9261 */
9262 if (need_resched())
9263 break;
9264 9350
9265 rq = cpu_rq(balance_cpu); 9351/*
9352 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 9353 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
9354 * CPU (if there is one).
9355 */
9356static void kick_ilb(unsigned int flags)
9357{
9358 int ilb_cpu;
9266 9359
9267 /* 9360 nohz.next_balance++;
9268 * If time for next balance is due,
9269 * do the balance.
9270 */
9271 if (time_after_eq(jiffies, rq->next_balance)) {
9272 struct rq_flags rf;
9273 9361
9274 rq_lock_irq(rq, &rf); 9362 ilb_cpu = find_new_ilb();
9275 update_rq_clock(rq);
9276 cpu_load_update_idle(rq);
9277 rq_unlock_irq(rq, &rf);
9278 9363
9279 rebalance_domains(rq, CPU_IDLE); 9364 if (ilb_cpu >= nr_cpu_ids)
9280 } 9365 return;
9281 9366
9282 if (time_after(next_balance, rq->next_balance)) { 9367 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9283 next_balance = rq->next_balance; 9368 if (flags & NOHZ_KICK_MASK)
9284 update_next_balance = 1; 9369 return;
9285 }
9286 }
9287 9370
9288 /* 9371 /*
9289 * next_balance will be updated only when there is a need. 9372 * Use smp_send_reschedule() instead of resched_cpu().
9290 * When the CPU is attached to null domain for ex, it will not be 9373 * This way we generate a sched IPI on the target CPU which
9291 * updated. 9374 * is idle. And the softirq performing nohz idle load balance
9375 * will be run before returning from the IPI.
9292 */ 9376 */
9293 if (likely(update_next_balance)) 9377 smp_send_reschedule(ilb_cpu);
9294 nohz.next_balance = next_balance;
9295end:
9296 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
9297} 9378}
9298 9379
9299/* 9380/*
@@ -9307,36 +9388,41 @@ end:
9307 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9388 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9308 * domain span are idle. 9389 * domain span are idle.
9309 */ 9390 */
9310static inline bool nohz_kick_needed(struct rq *rq) 9391static void nohz_balancer_kick(struct rq *rq)
9311{ 9392{
9312 unsigned long now = jiffies; 9393 unsigned long now = jiffies;
9313 struct sched_domain_shared *sds; 9394 struct sched_domain_shared *sds;
9314 struct sched_domain *sd; 9395 struct sched_domain *sd;
9315 int nr_busy, i, cpu = rq->cpu; 9396 int nr_busy, i, cpu = rq->cpu;
9316 bool kick = false; 9397 unsigned int flags = 0;
9317 9398
9318 if (unlikely(rq->idle_balance)) 9399 if (unlikely(rq->idle_balance))
9319 return false; 9400 return;
9320 9401
9321 /* 9402 /*
9322 * We may be recently in ticked or tickless idle mode. At the first 9403 * We may be recently in ticked or tickless idle mode. At the first
9323 * busy tick after returning from idle, we will update the busy stats. 9404 * busy tick after returning from idle, we will update the busy stats.
9324 */ 9405 */
9325 set_cpu_sd_state_busy(); 9406 nohz_balance_exit_idle(rq);
9326 nohz_balance_exit_idle(cpu);
9327 9407
9328 /* 9408 /*
9329 * None are in tickless mode and hence no need for NOHZ idle load 9409 * None are in tickless mode and hence no need for NOHZ idle load
9330 * balancing. 9410 * balancing.
9331 */ 9411 */
9332 if (likely(!atomic_read(&nohz.nr_cpus))) 9412 if (likely(!atomic_read(&nohz.nr_cpus)))
9333 return false; 9413 return;
9414
9415 if (READ_ONCE(nohz.has_blocked) &&
9416 time_after(now, READ_ONCE(nohz.next_blocked)))
9417 flags = NOHZ_STATS_KICK;
9334 9418
9335 if (time_before(now, nohz.next_balance)) 9419 if (time_before(now, nohz.next_balance))
9336 return false; 9420 goto out;
9337 9421
9338 if (rq->nr_running >= 2) 9422 if (rq->nr_running >= 2) {
9339 return true; 9423 flags = NOHZ_KICK_MASK;
9424 goto out;
9425 }
9340 9426
9341 rcu_read_lock(); 9427 rcu_read_lock();
9342 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 9428 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
@@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
9347 */ 9433 */
9348 nr_busy = atomic_read(&sds->nr_busy_cpus); 9434 nr_busy = atomic_read(&sds->nr_busy_cpus);
9349 if (nr_busy > 1) { 9435 if (nr_busy > 1) {
9350 kick = true; 9436 flags = NOHZ_KICK_MASK;
9351 goto unlock; 9437 goto unlock;
9352 } 9438 }
9353 9439
@@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
9357 if (sd) { 9443 if (sd) {
9358 if ((rq->cfs.h_nr_running >= 1) && 9444 if ((rq->cfs.h_nr_running >= 1) &&
9359 check_cpu_capacity(rq, sd)) { 9445 check_cpu_capacity(rq, sd)) {
9360 kick = true; 9446 flags = NOHZ_KICK_MASK;
9361 goto unlock; 9447 goto unlock;
9362 } 9448 }
9363 } 9449 }
@@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq)
9370 continue; 9456 continue;
9371 9457
9372 if (sched_asym_prefer(i, cpu)) { 9458 if (sched_asym_prefer(i, cpu)) {
9373 kick = true; 9459 flags = NOHZ_KICK_MASK;
9374 goto unlock; 9460 goto unlock;
9375 } 9461 }
9376 } 9462 }
9377 } 9463 }
9378unlock: 9464unlock:
9379 rcu_read_unlock(); 9465 rcu_read_unlock();
9380 return kick; 9466out:
9467 if (flags)
9468 kick_ilb(flags);
9469}
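/*
 * Illustrative sketch, not part of the patch: how nohz_balancer_kick() and
 * kick_ilb() above combine. A stats-only kick (NOHZ_STATS_KICK) is raised
 * when blocked load is stale, a full kick (NOHZ_KICK_MASK) when real
 * imbalance is suspected, and the IPI is sent only if no kick bit was
 * already pending on the ILB CPU. The plain pointer below stands in for
 * the atomic nohz_flags(cpu) word updated with atomic_fetch_or().
 */
static int ilb_kick_needs_ipi(unsigned int *ilb_flags, unsigned int request,
			      unsigned int kick_mask)
{
	unsigned int old = *ilb_flags;

	*ilb_flags = old | request;	/* atomic_fetch_or() in the patch */

	return !(old & kick_mask);	/* a pending kick covers the new request */
}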
9470
9471static void set_cpu_sd_state_busy(int cpu)
9472{
9473 struct sched_domain *sd;
9474
9475 rcu_read_lock();
9476 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9477
9478 if (!sd || !sd->nohz_idle)
9479 goto unlock;
9480 sd->nohz_idle = 0;
9481
9482 atomic_inc(&sd->shared->nr_busy_cpus);
9483unlock:
9484 rcu_read_unlock();
9485}
9486
9487void nohz_balance_exit_idle(struct rq *rq)
9488{
9489 SCHED_WARN_ON(rq != this_rq());
9490
9491 if (likely(!rq->nohz_tick_stopped))
9492 return;
9493
9494 rq->nohz_tick_stopped = 0;
9495 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9496 atomic_dec(&nohz.nr_cpus);
9497
9498 set_cpu_sd_state_busy(rq->cpu);
9499}
9500
9501static void set_cpu_sd_state_idle(int cpu)
9502{
9503 struct sched_domain *sd;
9504
9505 rcu_read_lock();
9506 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9507
9508 if (!sd || sd->nohz_idle)
9509 goto unlock;
9510 sd->nohz_idle = 1;
9511
9512 atomic_dec(&sd->shared->nr_busy_cpus);
9513unlock:
9514 rcu_read_unlock();
9515}
9516
9517/*
9518 * This routine will record that the CPU is going idle with tick stopped.
9519 * This info will be used in performing idle load balancing in the future.
9520 */
9521void nohz_balance_enter_idle(int cpu)
9522{
9523 struct rq *rq = cpu_rq(cpu);
9524
9525 SCHED_WARN_ON(cpu != smp_processor_id());
9526
9527 /* If this CPU is going down, then nothing needs to be done: */
9528 if (!cpu_active(cpu))
9529 return;
9530
9531 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9532 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9533 return;
9534
9535 /*
 9536 * Can be set safely without rq->lock held.
 9537 * If a clear happens, it will have evaluated the last additions, because
 9538 * rq->lock is held during the check and the clear.
9539 */
9540 rq->has_blocked_load = 1;
9541
9542 /*
9543 * The tick is still stopped but load could have been added in the
 9544 * meantime. We set the nohz.has_blocked flag to trigger a check of the
9545 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
9546 * of nohz.has_blocked can only happen after checking the new load
9547 */
9548 if (rq->nohz_tick_stopped)
9549 goto out;
9550
9551 /* If we're a completely isolated CPU, we don't play: */
9552 if (on_null_domain(rq))
9553 return;
9554
9555 rq->nohz_tick_stopped = 1;
9556
9557 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9558 atomic_inc(&nohz.nr_cpus);
9559
9560 /*
9561 * Ensures that if nohz_idle_balance() fails to observe our
9562 * @idle_cpus_mask store, it must observe the @has_blocked
9563 * store.
9564 */
9565 smp_mb__after_atomic();
9566
9567 set_cpu_sd_state_idle(cpu);
9568
9569out:
9570 /*
 9571 * Each time a CPU enters idle, we assume that it has blocked load and
 9572 * enable the periodic update of the load of idle CPUs.
9573 */
9574 WRITE_ONCE(nohz.has_blocked, 1);
9575}
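As the comments above state, the idle CPU publishes its idle_cpus_mask bit before raising nohz.has_blocked, while the updater clears the flag before scanning the mask; the paired barriers guarantee that a CPU missed by the scan still leaves has_blocked set, so its blocked load is picked up on a later pass. A hedged user-space model of just that publish/clear ordering, with C11 fences standing in for smp_mb__after_atomic()/smp_mb() and a plain bitmask standing in for the cpumask:

#include <stdatomic.h>

static atomic_ulong idle_cpus_mask;	/* stand-in for nohz.idle_cpus_mask */
static atomic_int   has_blocked;	/* stand-in for nohz.has_blocked */

/* CPU entering idle: set the mask bit, then advertise blocked load. */
static void enter_idle(int cpu)
{
	atomic_fetch_or(&idle_cpus_mask, 1UL << cpu);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	atomic_store(&has_blocked, 1);
}

/* Updater: clear the flag first, then scan; a missed CPU re-arms the flag. */
static unsigned long begin_blocked_update(void)
{
	atomic_store(&has_blocked, 0);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	return atomic_load(&idle_cpus_mask);
}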
9576
9577/*
 9578 * Internal function that runs load balancing for all idle CPUs. The load balance
 9579 * can be a simple update of blocked load or a complete load balance with
 9580 * task movement, depending on the flags.
9581 * The function returns false if the loop has stopped before running
9582 * through all idle CPUs.
9583 */
9584static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9585 enum cpu_idle_type idle)
9586{
9587 /* Earliest time when we have to do rebalance again */
9588 unsigned long now = jiffies;
9589 unsigned long next_balance = now + 60*HZ;
9590 bool has_blocked_load = false;
9591 int update_next_balance = 0;
9592 int this_cpu = this_rq->cpu;
9593 int balance_cpu;
9594 int ret = false;
9595 struct rq *rq;
9596
9597 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9598
9599 /*
9600 * We assume there will be no idle load after this update and clear
 9601 * the has_blocked flag. If a CPU enters idle in the meantime, it will
 9602 * set the has_blocked flag and trigger another update of idle load.
 9603 * Because a CPU that becomes idle is added to idle_cpus_mask before
 9604 * setting the flag, we are sure to not clear the state and not
 9605 * check the load of an idle CPU.
9606 */
9607 WRITE_ONCE(nohz.has_blocked, 0);
9608
9609 /*
9610 * Ensures that if we miss the CPU, we must see the has_blocked
9611 * store from nohz_balance_enter_idle().
9612 */
9613 smp_mb();
9614
9615 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9616 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9617 continue;
9618
9619 /*
9620 * If this CPU gets work to do, stop the load balancing
9621 * work being done for other CPUs. Next load
9622 * balancing owner will pick it up.
9623 */
9624 if (need_resched()) {
9625 has_blocked_load = true;
9626 goto abort;
9627 }
9628
9629 rq = cpu_rq(balance_cpu);
9630
9631 has_blocked_load |= update_nohz_stats(rq, true);
9632
9633 /*
9634 * If time for next balance is due,
9635 * do the balance.
9636 */
9637 if (time_after_eq(jiffies, rq->next_balance)) {
9638 struct rq_flags rf;
9639
9640 rq_lock_irqsave(rq, &rf);
9641 update_rq_clock(rq);
9642 cpu_load_update_idle(rq);
9643 rq_unlock_irqrestore(rq, &rf);
9644
9645 if (flags & NOHZ_BALANCE_KICK)
9646 rebalance_domains(rq, CPU_IDLE);
9647 }
9648
9649 if (time_after(next_balance, rq->next_balance)) {
9650 next_balance = rq->next_balance;
9651 update_next_balance = 1;
9652 }
9653 }
9654
9655 /* Newly idle CPU doesn't need an update */
9656 if (idle != CPU_NEWLY_IDLE) {
9657 update_blocked_averages(this_cpu);
9658 has_blocked_load |= this_rq->has_blocked_load;
9659 }
9660
9661 if (flags & NOHZ_BALANCE_KICK)
9662 rebalance_domains(this_rq, CPU_IDLE);
9663
9664 WRITE_ONCE(nohz.next_blocked,
9665 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9666
9667 /* The full idle balance loop has been done */
9668 ret = true;
9669
9670abort:
9671 /* There is still blocked load, enable periodic update */
9672 if (has_blocked_load)
9673 WRITE_ONCE(nohz.has_blocked, 1);
9674
9675 /*
9676 * next_balance will be updated only when there is a need.
 9677 * When the CPU is attached to a null domain, for example, it will not be
9678 * updated.
9679 */
9680 if (likely(update_next_balance))
9681 nohz.next_balance = next_balance;
9682
9683 return ret;
9684}
9685
9686/*
9687 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 9688 * rebalancing for all the CPUs whose scheduler ticks are stopped.
9689 */
9690static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9691{
9692 int this_cpu = this_rq->cpu;
9693 unsigned int flags;
9694
9695 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9696 return false;
9697
9698 if (idle != CPU_IDLE) {
9699 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9700 return false;
9701 }
9702
9703 /*
9704 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
9705 */
9706 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9707 if (!(flags & NOHZ_KICK_MASK))
9708 return false;
9709
9710 _nohz_idle_balance(this_rq, flags, idle);
9711
9712 return true;
9713}
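nohz_idle_balance() consumes the pending kick bits with one atomic fetch-and-clear, so a kick raised concurrently is never lost: it either shows up in the returned flags or sets the bits again for the next softirq. A user-space equivalent of that consume step (C11 has no fetch_andnot(), so the sketch ANDs with the complement):

#include <stdatomic.h>

/* Atomically clear the kick bits and return whichever were pending. */
static unsigned int consume_kick_flags(atomic_uint *nohz_flags, unsigned int kick_mask)
{
	return atomic_fetch_and(nohz_flags, ~kick_mask) & kick_mask;
}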
9714
9715static void nohz_newidle_balance(struct rq *this_rq)
9716{
9717 int this_cpu = this_rq->cpu;
9718
9719 /*
9720 * This CPU doesn't want to be disturbed by scheduler
9721 * housekeeping
9722 */
9723 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9724 return;
9725
 9726 /* Will wake up very soon. No time for doing anything else */
9727 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9728 return;
9729
 9730 /* Don't need to update blocked load of idle CPUs */
9731 if (!READ_ONCE(nohz.has_blocked) ||
9732 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9733 return;
9734
9735 raw_spin_unlock(&this_rq->lock);
9736 /*
 9737 * This CPU is going to be idle and the blocked load of idle CPUs
 9738 * needs to be updated. Run the ilb locally as it is a good
9739 * candidate for ilb instead of waking up another idle CPU.
 9740 * Kick a normal ilb if we failed to do the update.
9741 */
9742 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9743 kick_ilb(NOHZ_STATS_KICK);
9744 raw_spin_lock(&this_rq->lock);
9745}
9746
9747#else /* !CONFIG_NO_HZ_COMMON */
9748static inline void nohz_balancer_kick(struct rq *rq) { }
9749
9750static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9751{
9752 return false;
9753}
9754
9755static inline void nohz_newidle_balance(struct rq *this_rq) { }
9756#endif /* CONFIG_NO_HZ_COMMON */
9757
9758/*
9759 * idle_balance is called by schedule() if this_cpu is about to become
9760 * idle. Attempts to pull tasks from other CPUs.
9761 */
9762static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9763{
9764 unsigned long next_balance = jiffies + HZ;
9765 int this_cpu = this_rq->cpu;
9766 struct sched_domain *sd;
9767 int pulled_task = 0;
9768 u64 curr_cost = 0;
9769
9770 /*
9771 * We must set idle_stamp _before_ calling idle_balance(), such that we
9772 * measure the duration of idle_balance() as idle time.
9773 */
9774 this_rq->idle_stamp = rq_clock(this_rq);
9775
9776 /*
9777 * Do not pull tasks towards !active CPUs...
9778 */
9779 if (!cpu_active(this_cpu))
9780 return 0;
9781
9782 /*
9783 * This is OK, because current is on_cpu, which avoids it being picked
9784 * for load-balance and preemption/IRQs are still disabled avoiding
9785 * further scheduler activity on it and we're being very careful to
9786 * re-start the picking loop.
9787 */
9788 rq_unpin_lock(this_rq, rf);
9789
9790 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9791 !this_rq->rd->overload) {
9792
9793 rcu_read_lock();
9794 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9795 if (sd)
9796 update_next_balance(sd, &next_balance);
9797 rcu_read_unlock();
9798
9799 nohz_newidle_balance(this_rq);
9800
9801 goto out;
9802 }
9803
9804 raw_spin_unlock(&this_rq->lock);
9805
9806 update_blocked_averages(this_cpu);
9807 rcu_read_lock();
9808 for_each_domain(this_cpu, sd) {
9809 int continue_balancing = 1;
9810 u64 t0, domain_cost;
9811
9812 if (!(sd->flags & SD_LOAD_BALANCE))
9813 continue;
9814
9815 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9816 update_next_balance(sd, &next_balance);
9817 break;
9818 }
9819
9820 if (sd->flags & SD_BALANCE_NEWIDLE) {
9821 t0 = sched_clock_cpu(this_cpu);
9822
9823 pulled_task = load_balance(this_cpu, this_rq,
9824 sd, CPU_NEWLY_IDLE,
9825 &continue_balancing);
9826
9827 domain_cost = sched_clock_cpu(this_cpu) - t0;
9828 if (domain_cost > sd->max_newidle_lb_cost)
9829 sd->max_newidle_lb_cost = domain_cost;
9830
9831 curr_cost += domain_cost;
9832 }
9833
9834 update_next_balance(sd, &next_balance);
9835
9836 /*
9837 * Stop searching for tasks to pull if there are
9838 * now runnable tasks on this rq.
9839 */
9840 if (pulled_task || this_rq->nr_running > 0)
9841 break;
9842 }
9843 rcu_read_unlock();
9844
9845 raw_spin_lock(&this_rq->lock);
9846
9847 if (curr_cost > this_rq->max_idle_balance_cost)
9848 this_rq->max_idle_balance_cost = curr_cost;
9849
9850 /*
 9851 * While browsing the domains, we released the rq lock; a task could
9852 * have been enqueued in the meantime. Since we're not going idle,
9853 * pretend we pulled a task.
9854 */
9855 if (this_rq->cfs.h_nr_running && !pulled_task)
9856 pulled_task = 1;
9857
9858out:
9859 /* Move the next balance forward */
9860 if (time_after(this_rq->next_balance, next_balance))
9861 this_rq->next_balance = next_balance;
9862
9863 /* Is there a task of a high priority class? */
9864 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9865 pulled_task = -1;
9866
9867 if (pulled_task)
9868 this_rq->idle_stamp = 0;
9869
9870 rq_repin_lock(this_rq, rf);
9871
9872 return pulled_task;
9381} 9873}
9382#else
9383static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9384#endif
9385 9874
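In the newidle loop above, a domain is only balanced while the expected idle time still covers the cost already spent plus that domain's worst observed newidle-balance cost. A small sketch of that budget check, using each domain's worst-case cost as a stand-in for the measured cost the kernel actually accumulates (all times in nanoseconds, as returned by sched_clock_cpu()):

#include <stdint.h>

/* Returns how many domains fit within the idle-time budget. */
static int domains_within_budget(uint64_t avg_idle_ns,
				 const uint64_t *max_cost_ns, int nr_domains)
{
	uint64_t curr_cost = 0;
	int i;

	for (i = 0; i < nr_domains; i++) {
		if (avg_idle_ns < curr_cost + max_cost_ns[i])
			break;			/* not worth going deeper */
		curr_cost += max_cost_ns[i];
	}

	return i;
}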
9386/* 9875/*
9387 * run_rebalance_domains is triggered when needed from the scheduler tick. 9876 * run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9394 CPU_IDLE : CPU_NOT_IDLE; 9883 CPU_IDLE : CPU_NOT_IDLE;
9395 9884
9396 /* 9885 /*
9397 * If this cpu has a pending nohz_balance_kick, then do the 9886 * If this CPU has a pending nohz_balance_kick, then do the
9398 * balancing on behalf of the other idle cpus whose ticks are 9887 * balancing on behalf of the other idle CPUs whose ticks are
9399 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9888 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9400 * give the idle cpus a chance to load balance. Else we may 9889 * give the idle CPUs a chance to load balance. Else we may
9401 * load balance only within the local sched_domain hierarchy 9890 * load balance only within the local sched_domain hierarchy
9402 * and abort nohz_idle_balance altogether if we pull some load. 9891 * and abort nohz_idle_balance altogether if we pull some load.
9403 */ 9892 */
9404 nohz_idle_balance(this_rq, idle); 9893 if (nohz_idle_balance(this_rq, idle))
9894 return;
9895
9896 /* normal load balance */
9897 update_blocked_averages(this_rq->cpu);
9405 rebalance_domains(this_rq, idle); 9898 rebalance_domains(this_rq, idle);
9406} 9899}
9407 9900
@@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq)
9416 9909
9417 if (time_after_eq(jiffies, rq->next_balance)) 9910 if (time_after_eq(jiffies, rq->next_balance))
9418 raise_softirq(SCHED_SOFTIRQ); 9911 raise_softirq(SCHED_SOFTIRQ);
9419#ifdef CONFIG_NO_HZ_COMMON 9912
9420 if (nohz_kick_needed(rq)) 9913 nohz_balancer_kick(rq);
9421 nohz_balancer_kick();
9422#endif
9423} 9914}
9424 9915
9425static void rq_online_fair(struct rq *rq) 9916static void rq_online_fair(struct rq *rq)
@@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq)
9440#endif /* CONFIG_SMP */ 9931#endif /* CONFIG_SMP */
9441 9932
9442/* 9933/*
9443 * scheduler tick hitting a task of our scheduling class: 9934 * scheduler tick hitting a task of our scheduling class.
9935 *
9936 * NOTE: This function can be called remotely by the tick offload that
9937 * goes along full dynticks. Therefore no local assumption can be made
9938 * and everything must be accessed through the @rq and @curr passed in
9939 * parameters.
9444 */ 9940 */
9445static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 9941static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9446{ 9942{
@@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
9591 10087
9592 /* Synchronize entity with its cfs_rq */ 10088 /* Synchronize entity with its cfs_rq */
9593 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); 10089 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
9594 attach_entity_load_avg(cfs_rq, se); 10090 attach_entity_load_avg(cfs_rq, se, 0);
9595 update_tg_load_avg(cfs_rq, false); 10091 update_tg_load_avg(cfs_rq, false);
9596 propagate_entity_cfs_rq(se); 10092 propagate_entity_cfs_rq(se);
9597} 10093}
@@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void)
9993 10489
9994#ifdef CONFIG_NO_HZ_COMMON 10490#ifdef CONFIG_NO_HZ_COMMON
9995 nohz.next_balance = jiffies; 10491 nohz.next_balance = jiffies;
10492 nohz.next_blocked = jiffies;
9996 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 10493 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
9997#endif 10494#endif
9998#endif /* SMP */ 10495#endif /* SMP */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..85ae8488039c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
85SCHED_FEAT(WA_IDLE, true) 85SCHED_FEAT(WA_IDLE, true)
86SCHED_FEAT(WA_WEIGHT, true) 86SCHED_FEAT(WA_WEIGHT, true)
87SCHED_FEAT(WA_BIAS, true) 87SCHED_FEAT(WA_BIAS, true)
88
89/*
90 * UtilEstimation. Use estimated CPU utilization.
91 */
92SCHED_FEAT(UTIL_EST, true)
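The new UTIL_EST feature gates the use of an estimated, EWMA-smoothed utilization instead of the instantaneous PELT value. A hedged sketch of the shift-based EWMA such an estimator relies on; the weight shift of 2 (w = 1/4) is illustrative, not a quote of the fair.c implementation:

#define UTIL_EST_WEIGHT_SHIFT	2	/* new-sample weight w = 1/2^2 (illustrative) */

/* ewma(t) = ewma(t-1) + w * (sample - ewma(t-1)), in integer arithmetic */
static unsigned long util_est_ewma(unsigned long ewma, unsigned long sample)
{
	long diff = (long)sample - (long)ewma;

	return (unsigned long)((long)ewma + diff / (1 << UTIL_EST_WEIGHT_SHIFT));
}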
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..2975f195e1c4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,14 @@
1/* 1/*
2 * Generic entry point for the idle threads 2 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class.
4 *
5 * (NOTE: these are not related to SCHED_IDLE batch scheduled
6 * tasks which are handled in sched/fair.c )
3 */ 7 */
4#include <linux/sched.h> 8#include "sched.h"
5#include <linux/sched/idle.h>
6#include <linux/cpu.h>
7#include <linux/cpuidle.h>
8#include <linux/cpuhotplug.h>
9#include <linux/tick.h>
10#include <linux/mm.h>
11#include <linux/stackprotector.h>
12#include <linux/suspend.h>
13#include <linux/livepatch.h>
14
15#include <asm/tlb.h>
16 9
17#include <trace/events/power.h> 10#include <trace/events/power.h>
18 11
19#include "sched.h"
20
21/* Linker adds these: start and end of __cpuidle functions */ 12/* Linker adds these: start and end of __cpuidle functions */
22extern char __cpuidle_text_start[], __cpuidle_text_end[]; 13extern char __cpuidle_text_start[], __cpuidle_text_end[];
23 14
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
46static int __init cpu_idle_poll_setup(char *__unused) 37static int __init cpu_idle_poll_setup(char *__unused)
47{ 38{
48 cpu_idle_force_poll = 1; 39 cpu_idle_force_poll = 1;
40
49 return 1; 41 return 1;
50} 42}
51__setup("nohlt", cpu_idle_poll_setup); 43__setup("nohlt", cpu_idle_poll_setup);
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
53static int __init cpu_idle_nopoll_setup(char *__unused) 45static int __init cpu_idle_nopoll_setup(char *__unused)
54{ 46{
55 cpu_idle_force_poll = 0; 47 cpu_idle_force_poll = 0;
48
56 return 1; 49 return 1;
57} 50}
58__setup("hlt", cpu_idle_nopoll_setup); 51__setup("hlt", cpu_idle_nopoll_setup);
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
64 trace_cpu_idle_rcuidle(0, smp_processor_id()); 57 trace_cpu_idle_rcuidle(0, smp_processor_id());
65 local_irq_enable(); 58 local_irq_enable();
66 stop_critical_timings(); 59 stop_critical_timings();
60
67 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
68 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
69 cpu_relax(); 63 cpu_relax();
70 start_critical_timings(); 64 start_critical_timings();
71 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
72 rcu_idle_exit(); 66 rcu_idle_exit();
67
73 return 1; 68 return 1;
74} 69}
75 70
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
332{ 327{
333 /* 328 /*
334 * This #ifdef needs to die, but it's too late in the cycle to 329 * This #ifdef needs to die, but it's too late in the cycle to
335 * make this generic (arm and sh have never invoked the canary 330 * make this generic (ARM and SH have never invoked the canary
336 * init for the non boot cpus!). Will be fixed in 3.11 331 * init for the non boot CPUs!). Will be fixed in 3.11
337 */ 332 */
338#ifdef CONFIG_X86 333#ifdef CONFIG_X86
339 /* 334 /*
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
350 while (1) 345 while (1)
351 do_idle(); 346 do_idle();
352} 347}
348
349/*
350 * idle-task scheduling class.
351 */
352
353#ifdef CONFIG_SMP
354static int
355select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
356{
 357 return task_cpu(p); /* IDLE tasks are never migrated */
358}
359#endif
360
361/*
362 * Idle tasks are unconditionally rescheduled:
363 */
364static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
365{
366 resched_curr(rq);
367}
368
369static struct task_struct *
370pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
371{
372 put_prev_task(rq, prev);
373 update_idle_core(rq);
374 schedstat_inc(rq->sched_goidle);
375
376 return rq->idle;
377}
378
379/*
380 * It is not legal to sleep in the idle task - print a warning
381 * message if some code attempts to do it:
382 */
383static void
384dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
385{
386 raw_spin_unlock_irq(&rq->lock);
387 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
388 dump_stack();
389 raw_spin_lock_irq(&rq->lock);
390}
391
392static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
393{
394}
395
396/*
397 * scheduler tick hitting a task of our scheduling class.
398 *
399 * NOTE: This function can be called remotely by the tick offload that
400 * goes along full dynticks. Therefore no local assumption can be made
401 * and everything must be accessed through the @rq and @curr passed in
402 * parameters.
403 */
404static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
405{
406}
407
408static void set_curr_task_idle(struct rq *rq)
409{
410}
411
412static void switched_to_idle(struct rq *rq, struct task_struct *p)
413{
414 BUG();
415}
416
417static void
418prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
419{
420 BUG();
421}
422
423static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
424{
425 return 0;
426}
427
428static void update_curr_idle(struct rq *rq)
429{
430}
431
432/*
433 * Simple, special scheduling class for the per-CPU idle tasks:
434 */
435const struct sched_class idle_sched_class = {
436 /* .next is NULL */
437 /* no enqueue/yield_task for idle tasks */
438
439 /* dequeue is not valid, we print a debug message there: */
440 .dequeue_task = dequeue_task_idle,
441
442 .check_preempt_curr = check_preempt_curr_idle,
443
444 .pick_next_task = pick_next_task_idle,
445 .put_prev_task = put_prev_task_idle,
446
447#ifdef CONFIG_SMP
448 .select_task_rq = select_task_rq_idle,
449 .set_cpus_allowed = set_cpus_allowed_common,
450#endif
451
452 .set_curr_task = set_curr_task_idle,
453 .task_tick = task_tick_idle,
454
455 .get_rr_interval = get_rr_interval_idle,
456
457 .prio_changed = prio_changed_idle,
458 .switched_to = switched_to_idle,
459 .update_curr = update_curr_idle,
460};
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index d518664cce4f..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,110 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/*
5 * idle-task scheduling class.
6 *
7 * (NOTE: these are not related to SCHED_IDLE tasks which are
8 * handled in sched/fair.c)
9 */
10
11#ifdef CONFIG_SMP
12static int
13select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
14{
15 return task_cpu(p); /* IDLE tasks as never migrated */
16}
17#endif /* CONFIG_SMP */
18
19/*
20 * Idle tasks are unconditionally rescheduled:
21 */
22static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
23{
24 resched_curr(rq);
25}
26
27static struct task_struct *
28pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
29{
30 put_prev_task(rq, prev);
31 update_idle_core(rq);
32 schedstat_inc(rq->sched_goidle);
33 return rq->idle;
34}
35
36/*
37 * It is not legal to sleep in the idle task - print a warning
38 * message if some code attempts to do it:
39 */
40static void
41dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
42{
43 raw_spin_unlock_irq(&rq->lock);
44 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
45 dump_stack();
46 raw_spin_lock_irq(&rq->lock);
47}
48
49static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
50{
51 rq_last_tick_reset(rq);
52}
53
54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_idle(struct rq *rq)
59{
60}
61
62static void switched_to_idle(struct rq *rq, struct task_struct *p)
63{
64 BUG();
65}
66
67static void
68prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
69{
70 BUG();
71}
72
73static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class = {
86 /* .next is NULL */
87 /* no enqueue/yield_task for idle tasks */
88
89 /* dequeue is not valid, we print a debug message there: */
90 .dequeue_task = dequeue_task_idle,
91
92 .check_preempt_curr = check_preempt_curr_idle,
93
94 .pick_next_task = pick_next_task_idle,
95 .put_prev_task = put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
100#endif
101
102 .set_curr_task = set_curr_task_idle,
103 .task_tick = task_tick_idle,
104
105 .get_rr_interval = get_rr_interval_idle,
106
107 .prio_changed = prio_changed_idle,
108 .switched_to = switched_to_idle,
109 .update_curr = update_curr_idle,
110};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,15 +3,10 @@
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
4 * 4 *
5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker 5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
6 * 7 *
7 */ 8 */
8 9#include "sched.h"
9#include <linux/sched/isolation.h>
10#include <linux/tick.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/static_key.h>
14#include <linux/ctype.h>
15 10
16DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 11DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
17EXPORT_SYMBOL_GPL(housekeeping_overriden); 12EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
60 55
61 static_branch_enable(&housekeeping_overriden); 56 static_branch_enable(&housekeeping_overriden);
62 57
58 if (housekeeping_flags & HK_FLAG_TICK)
59 sched_tick_offload_init();
60
63 /* We need at least one CPU to handle housekeeping work */ 61 /* We need at least one CPU to handle housekeeping work */
64 WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 62 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
65} 63}
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
119{ 117{
120 unsigned int flags; 118 unsigned int flags;
121 119
122 flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; 120 flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
123 121
124 return housekeeping_setup(str, flags); 122 return housekeeping_setup(str, flags);
125} 123}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
 6 * figure. It's a silly number but people think it's important. We go through 6 * figure. It's a silly number but people think it's important. We go through
7 * great pains to make it work on big machines and tickless kernels. 7 * great pains to make it work on big machines and tickless kernels.
8 */ 8 */
9
10#include <linux/export.h>
11#include <linux/sched/loadavg.h>
12
13#include "sched.h" 9#include "sched.h"
14 10
15/* 11/*
@@ -32,29 +28,29 @@
32 * Due to a number of reasons the above turns in the mess below: 28 * Due to a number of reasons the above turns in the mess below:
33 * 29 *
34 * - for_each_possible_cpu() is prohibitively expensive on machines with 30 * - for_each_possible_cpu() is prohibitively expensive on machines with
35 * serious number of cpus, therefore we need to take a distributed approach 31 * serious number of CPUs, therefore we need to take a distributed approach
36 * to calculating nr_active. 32 * to calculating nr_active.
37 * 33 *
38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 34 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 35 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
40 * 36 *
41 * So assuming nr_active := 0 when we start out -- true per definition, we 37 * So assuming nr_active := 0 when we start out -- true per definition, we
42 * can simply take per-cpu deltas and fold those into a global accumulate 38 * can simply take per-CPU deltas and fold those into a global accumulate
43 * to obtain the same result. See calc_load_fold_active(). 39 * to obtain the same result. See calc_load_fold_active().
44 * 40 *
45 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 41 * Furthermore, in order to avoid synchronizing all per-CPU delta folding
46 * across the machine, we assume 10 ticks is sufficient time for every 42 * across the machine, we assume 10 ticks is sufficient time for every
47 * cpu to have completed this task. 43 * CPU to have completed this task.
48 * 44 *
49 * This places an upper-bound on the IRQ-off latency of the machine. Then 45 * This places an upper-bound on the IRQ-off latency of the machine. Then
 50 * again, being late doesn't lose the delta, just wrecks the sample. 46 * again, being late doesn't lose the delta, just wrecks the sample.
51 * 47 *
52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 48 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
53 * this would add another cross-cpu cacheline miss and atomic operation 49 * this would add another cross-CPU cacheline miss and atomic operation
54 * to the wakeup path. Instead we increment on whatever cpu the task ran 50 * to the wakeup path. Instead we increment on whatever CPU the task ran
55 * when it went into uninterruptible state and decrement on whatever cpu 51 * when it went into uninterruptible state and decrement on whatever CPU
56 * did the wakeup. This means that only the sum of nr_uninterruptible over 52 * did the wakeup. This means that only the sum of nr_uninterruptible over
57 * all cpus yields the correct result. 53 * all CPUs yields the correct result.
58 * 54 *
59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 55 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
60 */ 56 */
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
115 * Handle NO_HZ for the global load-average. 111 * Handle NO_HZ for the global load-average.
116 * 112 *
117 * Since the above described distributed algorithm to compute the global 113 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by 114 * load-average relies on per-CPU sampling from the tick, it is affected by
119 * NO_HZ. 115 * NO_HZ.
120 * 116 *
121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon 117 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 118 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
123 * when we read the global state. 119 * when we read the global state.
124 * 120 *
125 * Obviously reality has to ruin such a delightfully simple scheme: 121 * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
146 * busy state. 142 * busy state.
147 * 143 *
148 * This is solved by pushing the window forward, and thus skipping the 144 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which 145 * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
150 * was in effect at the time the window opened). This also solves the issue 146 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ 147 * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
152 * intervals. 148 * intervals.
153 * 149 *
154 * When making the ILB scale, we should try to pull this in as well. 150 * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
299} 295}
300 296
301/* 297/*
302 * NO_HZ can leave us missing all per-cpu ticks calling 298 * NO_HZ can leave us missing all per-CPU ticks calling
303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 299 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold 300 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 301 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
363 return; 359 return;
364 360
365 /* 361 /*
366 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. 362 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
367 */ 363 */
368 delta = calc_load_nohz_fold(); 364 delta = calc_load_nohz_fold();
369 if (delta) 365 if (delta)
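For reference, the decay that all of this NO_HZ folding feeds is the classic fixed-point average computed by calc_load(); a self-contained sketch of its arithmetic, with FIXED_1 being the usual 1 << 11 fixed-point one and the real helper's rounding detail omitted:

#define FSHIFT	11			/* bits of fractional precision */
#define FIXED_1	(1UL << FSHIFT)		/* fixed-point 1.0 */

/* load' = load * exp + active * (1 - exp), with exp given in FIXED_1 units */
static unsigned long calc_load_sketch(unsigned long load, unsigned long exp,
				      unsigned long active)
{
	return (load * exp + active * (FIXED_1 - exp)) >> FSHIFT;
}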
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,32 +13,25 @@
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 */ 15 */
16 16#include "sched.h"
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21#include <linux/atomic.h>
22
23#include "sched.h" /* for cpu_rq(). */
24 17
25/* 18/*
26 * Bitmask made from a "or" of all commands within enum membarrier_cmd, 19 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
27 * except MEMBARRIER_CMD_QUERY. 20 * except MEMBARRIER_CMD_QUERY.
28 */ 21 */
29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE 22#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ 23#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ 24 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) 25 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
33#else 26#else
34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 27#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
35#endif 28#endif
36 29
37#define MEMBARRIER_CMD_BITMASK \ 30#define MEMBARRIER_CMD_BITMASK \
38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ 31 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ 32 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 33 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ 34 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) 35 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
43 36
44static void ipi_mb(void *info) 37static void ipi_mb(void *info)
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
85 */ 78 */
86 if (cpu == raw_smp_processor_id()) 79 if (cpu == raw_smp_processor_id())
87 continue; 80 continue;
81
88 rcu_read_lock(); 82 rcu_read_lock();
89 p = task_rcu_dereference(&cpu_rq(cpu)->curr); 83 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
90 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & 84 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
188 * rq->curr modification in scheduler. 182 * rq->curr modification in scheduler.
189 */ 183 */
190 smp_mb(); /* exit from system call is not a mb */ 184 smp_mb(); /* exit from system call is not a mb */
185
191 return 0; 186 return 0;
192} 187}
193 188
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
219 } 214 }
220 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, 215 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
221 &mm->membarrier_state); 216 &mm->membarrier_state);
217
222 return 0; 218 return 0;
223} 219}
224 220
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
253 synchronize_sched(); 249 synchronize_sched();
254 } 250 }
255 atomic_or(state, &mm->membarrier_state); 251 atomic_or(state, &mm->membarrier_state);
252
256 return 0; 253 return 0;
257} 254}
258 255
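The MEMBARRIER_CMD_BITMASK built above is what MEMBARRIER_CMD_QUERY reports back to user space. A minimal probe using the two-argument form of the syscall (assumes a kernel and headers that provide membarrier at all):

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Returns a bitmask of supported commands, or -1 with errno set. */
	long cmds = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

	if (cmds < 0) {
		perror("membarrier");
		return 1;
	}

	printf("supported membarrier commands: %#lx\n", cmds);
	return 0;
}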
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..86b77987435e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies) 4 * policies)
5 */ 5 */
6
7#include "sched.h" 6#include "sched.h"
8 7
9#include <linux/slab.h>
10#include <linux/irq_work.h>
11
12int sched_rr_timeslice = RR_TIMESLICE; 8int sched_rr_timeslice = RR_TIMESLICE;
13int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
14 10
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359static void push_rt_tasks(struct rq *); 355static void push_rt_tasks(struct rq *);
360static void pull_rt_task(struct rq *); 356static void pull_rt_task(struct rq *);
361 357
362static inline void queue_push_tasks(struct rq *rq) 358static inline void rt_queue_push_tasks(struct rq *rq)
363{ 359{
364 if (!has_pushable_tasks(rq)) 360 if (!has_pushable_tasks(rq))
365 return; 361 return;
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
367 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 363 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
368} 364}
369 365
370static inline void queue_pull_task(struct rq *rq) 366static inline void rt_queue_pull_task(struct rq *rq)
371{ 367{
372 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 368 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
373} 369}
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
425{ 421{
426} 422}
427 423
428static inline void queue_push_tasks(struct rq *rq) 424static inline void rt_queue_push_tasks(struct rq *rq)
429{ 425{
430} 426}
431#endif /* CONFIG_SMP */ 427#endif /* CONFIG_SMP */
@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq)
961 if (unlikely((s64)delta_exec <= 0)) 957 if (unlikely((s64)delta_exec <= 0))
962 return; 958 return;
963 959
964 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
965 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
966
967 schedstat_set(curr->se.statistics.exec_max, 960 schedstat_set(curr->se.statistics.exec_max,
968 max(curr->se.statistics.exec_max, delta_exec)); 961 max(curr->se.statistics.exec_max, delta_exec));
969 962
@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
1005 998
1006 sub_nr_running(rq, rt_rq->rt_nr_running); 999 sub_nr_running(rq, rt_rq->rt_nr_running);
1007 rt_rq->rt_queued = 0; 1000 rt_rq->rt_queued = 0;
1001
1002 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1003 cpufreq_update_util(rq, 0);
1008} 1004}
1009 1005
1010static void 1006static void
@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
1021 1017
1022 add_nr_running(rq, rt_rq->rt_nr_running); 1018 add_nr_running(rq, rt_rq->rt_nr_running);
1023 rt_rq->rt_queued = 1; 1019 rt_rq->rt_queued = 1;
1020
1021 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1022 cpufreq_update_util(rq, 0);
1024} 1023}
1025 1024
1026#if defined CONFIG_SMP 1025#if defined CONFIG_SMP
@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1453 return; 1452 return;
1454 1453
1455 /* 1454 /*
1456 * There appears to be other cpus that can accept 1455 * There appear to be other CPUs that can accept
 1457 * current and none to run 'p', so let's reschedule 1456 * the current task but none can run 'p', so let's reschedule
1458 * to try and push current away: 1457 * to try and push the current task away:
1459 */ 1458 */
1460 requeue_task_rt(rq, p, 1); 1459 requeue_task_rt(rq, p, 1);
1461 resched_curr(rq); 1460 resched_curr(rq);
@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1569 /* The running task is never eligible for pushing */ 1568 /* The running task is never eligible for pushing */
1570 dequeue_pushable_task(rq, p); 1569 dequeue_pushable_task(rq, p);
1571 1570
1572 queue_push_tasks(rq); 1571 rt_queue_push_tasks(rq);
1573 1572
1574 return p; 1573 return p;
1575} 1574}
@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1596 if (!task_running(rq, p) && 1595 if (!task_running(rq, p) &&
1597 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1596 cpumask_test_cpu(cpu, &p->cpus_allowed))
1598 return 1; 1597 return 1;
1598
1599 return 0; 1599 return 0;
1600} 1600}
1601 1601
1602/* 1602/*
1603 * Return the highest pushable rq's task, which is suitable to be executed 1603 * Return the highest pushable rq's task, which is suitable to be executed
1604 * on the cpu, NULL otherwise 1604 * on the CPU, NULL otherwise
1605 */ 1605 */
1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1607{ 1607{
@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task)
1639 return -1; /* No targets found */ 1639 return -1; /* No targets found */
1640 1640
1641 /* 1641 /*
1642 * At this point we have built a mask of cpus representing the 1642 * At this point we have built a mask of CPUs representing the
1643 * lowest priority tasks in the system. Now we want to elect 1643 * lowest priority tasks in the system. Now we want to elect
1644 * the best one based on our affinity and topology. 1644 * the best one based on our affinity and topology.
1645 * 1645 *
1646 * We prioritize the last cpu that the task executed on since 1646 * We prioritize the last CPU that the task executed on since
1647 * it is most likely cache-hot in that location. 1647 * it is most likely cache-hot in that location.
1648 */ 1648 */
1649 if (cpumask_test_cpu(cpu, lowest_mask)) 1649 if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task)
1651 1651
1652 /* 1652 /*
1653 * Otherwise, we consult the sched_domains span maps to figure 1653 * Otherwise, we consult the sched_domains span maps to figure
1654 * out which cpu is logically closest to our hot cache data. 1654 * out which CPU is logically closest to our hot cache data.
1655 */ 1655 */
1656 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1656 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task)
1692 cpu = cpumask_any(lowest_mask); 1692 cpu = cpumask_any(lowest_mask);
1693 if (cpu < nr_cpu_ids) 1693 if (cpu < nr_cpu_ids)
1694 return cpu; 1694 return cpu;
1695
1695 return -1; 1696 return -1;
1696} 1697}
1697 1698
@@ -1827,7 +1828,7 @@ retry:
1827 * The task hasn't migrated, and is still the next 1828 * The task hasn't migrated, and is still the next
1828 * eligible task, but we failed to find a run-queue 1829 * eligible task, but we failed to find a run-queue
1829 * to push it to. Do not retry in this case, since 1830 * to push it to. Do not retry in this case, since
1830 * other cpus will pull from us when ready. 1831 * other CPUs will pull from us when ready.
1831 */ 1832 */
1832 goto out; 1833 goto out;
1833 } 1834 }
@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd)
1919 * rt_next_cpu() will simply return the first CPU found in 1920 * rt_next_cpu() will simply return the first CPU found in
1920 * the rto_mask. 1921 * the rto_mask.
1921 * 1922 *
1922 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it 1923 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1923 * will return the next CPU found in the rto_mask. 1924 * will return the next CPU found in the rto_mask.
1924 * 1925 *
1925 * If there are no more CPUs left in the rto_mask, then a check is made 1926 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq)
1980 raw_spin_lock(&rq->rd->rto_lock); 1981 raw_spin_lock(&rq->rd->rto_lock);
1981 1982
1982 /* 1983 /*
1983 * The rto_cpu is updated under the lock, if it has a valid cpu 1984 * The rto_cpu is updated under the lock, if it has a valid CPU
1984 * then the IPI is still running and will continue due to the 1985 * then the IPI is still running and will continue due to the
1985 * update to loop_next, and nothing needs to be done here. 1986 * update to loop_next, and nothing needs to be done here.
1986 * Otherwise it is finishing up and an ipi needs to be sent. 1987 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq)
2105 2106
2106 /* 2107 /*
2107 * There's a chance that p is higher in priority 2108 * There's a chance that p is higher in priority
2108 * than what's currently running on its cpu. 2109 * than what's currently running on its CPU.
 2109 * This is just that p is waking up and hasn't 2110 * This is just that p is waking up and hasn't
2110 * had a chance to schedule. We only pull 2111 * had a chance to schedule. We only pull
2111 * p if it is lower in priority than the 2112 * p if it is lower in priority than the
@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
2187 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2188 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2188 return; 2189 return;
2189 2190
2190 queue_pull_task(rq); 2191 rt_queue_pull_task(rq);
2191} 2192}
2192 2193
2193void __init init_sched_rt_class(void) 2194void __init init_sched_rt_class(void)
@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
2218 if (task_on_rq_queued(p) && rq->curr != p) { 2219 if (task_on_rq_queued(p) && rq->curr != p) {
2219#ifdef CONFIG_SMP 2220#ifdef CONFIG_SMP
2220 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2221 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2221 queue_push_tasks(rq); 2222 rt_queue_push_tasks(rq);
2222#endif /* CONFIG_SMP */ 2223#endif /* CONFIG_SMP */
2223 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2224 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2224 resched_curr(rq); 2225 resched_curr(rq);
@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2242 * may need to pull tasks to this runqueue. 2243 * may need to pull tasks to this runqueue.
2243 */ 2244 */
2244 if (oldprio < p->prio) 2245 if (oldprio < p->prio)
2245 queue_pull_task(rq); 2246 rt_queue_pull_task(rq);
2246 2247
2247 /* 2248 /*
2248 * If there's a higher priority task waiting to run 2249 * If there's a higher priority task waiting to run
@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
2292static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2293static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2293#endif 2294#endif
2294 2295
2296/*
2297 * scheduler tick hitting a task of our scheduling class.
2298 *
2299 * NOTE: This function can be called remotely by the tick offload that
2300 * goes along full dynticks. Therefore no local assumption can be made
2301 * and everything must be accessed through the @rq and @curr passed in
2302 * parameters.
2303 */
2295static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2304static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2296{ 2305{
2297 struct sched_rt_entity *rt_se = &p->rt; 2306 struct sched_rt_entity *rt_se = &p->rt;
@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
2685 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2694 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2686 } 2695 }
2687 mutex_unlock(&mutex); 2696 mutex_unlock(&mutex);
2697
2688 return ret; 2698 return ret;
2689} 2699}
2690 2700
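The sched_rr_handler() tail above converts the sysctl value from milliseconds into scheduler ticks with msecs_to_jiffies(). Roughly, and glossing over the real helper's HZ-divisibility and overflow special cases, that is a round-up division:

/* Approximate msecs_to_jiffies() for small positive inputs: ceil(ms * HZ / 1000). */
static unsigned long ms_to_jiffies_sketch(unsigned int ms, unsigned int hz)
{
	return ((unsigned long)ms * hz + 999) / 1000;
}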
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..c3deaee7a7a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,39 +1,73 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2 2/*
3 * Scheduler internal types and methods:
4 */
3#include <linux/sched.h> 5#include <linux/sched.h>
6
4#include <linux/sched/autogroup.h> 7#include <linux/sched/autogroup.h>
5#include <linux/sched/sysctl.h>
6#include <linux/sched/topology.h>
7#include <linux/sched/rt.h>
8#include <linux/sched/deadline.h>
9#include <linux/sched/clock.h> 8#include <linux/sched/clock.h>
10#include <linux/sched/wake_q.h> 9#include <linux/sched/coredump.h>
11#include <linux/sched/signal.h>
12#include <linux/sched/numa_balancing.h>
13#include <linux/sched/mm.h>
14#include <linux/sched/cpufreq.h> 10#include <linux/sched/cpufreq.h>
15#include <linux/sched/stat.h> 11#include <linux/sched/cputime.h>
16#include <linux/sched/nohz.h> 12#include <linux/sched/deadline.h>
17#include <linux/sched/debug.h> 13#include <linux/sched/debug.h>
18#include <linux/sched/hotplug.h> 14#include <linux/sched/hotplug.h>
15#include <linux/sched/idle.h>
16#include <linux/sched/init.h>
17#include <linux/sched/isolation.h>
18#include <linux/sched/jobctl.h>
19#include <linux/sched/loadavg.h>
20#include <linux/sched/mm.h>
21#include <linux/sched/nohz.h>
22#include <linux/sched/numa_balancing.h>
23#include <linux/sched/prio.h>
24#include <linux/sched/rt.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/stat.h>
27#include <linux/sched/sysctl.h>
19#include <linux/sched/task.h> 28#include <linux/sched/task.h>
20#include <linux/sched/task_stack.h> 29#include <linux/sched/task_stack.h>
21#include <linux/sched/cputime.h> 30#include <linux/sched/topology.h>
22#include <linux/sched/init.h> 31#include <linux/sched/user.h>
32#include <linux/sched/wake_q.h>
33#include <linux/sched/xacct.h>
34
35#include <uapi/linux/sched/types.h>
23 36
24#include <linux/u64_stats_sync.h>
25#include <linux/kernel_stat.h>
26#include <linux/binfmts.h> 37#include <linux/binfmts.h>
27#include <linux/mutex.h> 38#include <linux/blkdev.h>
28#include <linux/spinlock.h> 39#include <linux/compat.h>
40#include <linux/context_tracking.h>
41#include <linux/cpufreq.h>
42#include <linux/cpuidle.h>
43#include <linux/cpuset.h>
44#include <linux/ctype.h>
45#include <linux/debugfs.h>
46#include <linux/delayacct.h>
47#include <linux/init_task.h>
48#include <linux/kprobes.h>
49#include <linux/kthread.h>
50#include <linux/membarrier.h>
51#include <linux/migrate.h>
52#include <linux/mmu_context.h>
53#include <linux/nmi.h>
54#include <linux/proc_fs.h>
55#include <linux/prefetch.h>
56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h>
58#include <linux/security.h>
59#include <linux/stackprotector.h>
29#include <linux/stop_machine.h> 60#include <linux/stop_machine.h>
30#include <linux/irq_work.h> 61#include <linux/suspend.h>
31#include <linux/tick.h> 62#include <linux/swait.h>
32#include <linux/slab.h> 63#include <linux/syscalls.h>
33#include <linux/cgroup.h> 64#include <linux/task_work.h>
65#include <linux/tsacct_kern.h>
66
67#include <asm/tlb.h>
34 68
35#ifdef CONFIG_PARAVIRT 69#ifdef CONFIG_PARAVIRT
36#include <asm/paravirt.h> 70# include <asm/paravirt.h>
37#endif 71#endif
38 72
39#include "cpupri.h" 73#include "cpupri.h"
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
79 * and does not change the user-interface for setting shares/weights. 113 * and does not change the user-interface for setting shares/weights.
80 * 114 *
81 * We increase resolution only if we have enough bits to allow this increased 115 * We increase resolution only if we have enough bits to allow this increased
82 * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are 116 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
83 * pretty high and the returns do not justify the increased costs. 117 * are pretty high and the returns do not justify the increased costs.
84 * 118 *
85 * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to 119 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
86 * increase coverage and consistency always enable it on 64bit platforms. 120 * increase coverage and consistency always enable it on 64-bit platforms.
87 */ 121 */
88#ifdef CONFIG_64BIT 122#ifdef CONFIG_64BIT
89# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 123# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
111 * 10 -> just above 1us 145 * 10 -> just above 1us
112 * 9 -> just above 0.5us 146 * 9 -> just above 0.5us
113 */ 147 */
114#define DL_SCALE (10) 148#define DL_SCALE 10
115 149
116/* 150/*
117 * These are the 'tuning knobs' of the scheduler: 151 * Single value that denotes runtime == period, ie unlimited time.
118 */ 152 */
119 153#define RUNTIME_INF ((u64)~0ULL)
120/*
121 * single value that denotes runtime == period, ie unlimited time.
122 */
123#define RUNTIME_INF ((u64)~0ULL)
124 154
125static inline int idle_policy(int policy) 155static inline int idle_policy(int policy)
126{ 156{
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
235 * control. 265 * control.
236 */ 266 */
237struct dl_bandwidth { 267struct dl_bandwidth {
238 raw_spinlock_t dl_runtime_lock; 268 raw_spinlock_t dl_runtime_lock;
239 u64 dl_runtime; 269 u64 dl_runtime;
240 u64 dl_period; 270 u64 dl_period;
241}; 271};
242 272
243static inline int dl_bandwidth_enabled(void) 273static inline int dl_bandwidth_enabled(void)
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
246} 276}
247 277
248struct dl_bw { 278struct dl_bw {
249 raw_spinlock_t lock; 279 raw_spinlock_t lock;
250 u64 bw, total_bw; 280 u64 bw;
281 u64 total_bw;
251}; 282};
252 283
253static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 284static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
273 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 304 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
274} 305}
275 306
276void dl_change_utilization(struct task_struct *p, u64 new_bw); 307extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
277extern void init_dl_bw(struct dl_bw *dl_b); 308extern void init_dl_bw(struct dl_bw *dl_b);
278extern int sched_dl_global_validate(void); 309extern int sched_dl_global_validate(void);
279extern void sched_dl_do_global(void); 310extern void sched_dl_do_global(void);
280extern int sched_dl_overflow(struct task_struct *p, int policy, 311extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
281 const struct sched_attr *attr);
282extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 312extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
283extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 313extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
284extern bool __checkparam_dl(const struct sched_attr *attr); 314extern bool __checkparam_dl(const struct sched_attr *attr);
285extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 315extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
286extern int dl_task_can_attach(struct task_struct *p, 316extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
287 const struct cpumask *cs_cpus_allowed); 317extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
288extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
289 const struct cpumask *trial);
290extern bool dl_cpu_busy(unsigned int cpu); 318extern bool dl_cpu_busy(unsigned int cpu);
291 319
292#ifdef CONFIG_CGROUP_SCHED 320#ifdef CONFIG_CGROUP_SCHED
@@ -300,32 +328,36 @@ extern struct list_head task_groups;
300 328
301struct cfs_bandwidth { 329struct cfs_bandwidth {
302#ifdef CONFIG_CFS_BANDWIDTH 330#ifdef CONFIG_CFS_BANDWIDTH
303 raw_spinlock_t lock; 331 raw_spinlock_t lock;
304 ktime_t period; 332 ktime_t period;
305 u64 quota, runtime; 333 u64 quota;
306 s64 hierarchical_quota; 334 u64 runtime;
307 u64 runtime_expires; 335 s64 hierarchical_quota;
308 336 u64 runtime_expires;
309 int idle, period_active; 337
310 struct hrtimer period_timer, slack_timer; 338 int idle;
311 struct list_head throttled_cfs_rq; 339 int period_active;
312 340 struct hrtimer period_timer;
313 /* statistics */ 341 struct hrtimer slack_timer;
314 int nr_periods, nr_throttled; 342 struct list_head throttled_cfs_rq;
315 u64 throttled_time; 343
344 /* Statistics: */
345 int nr_periods;
346 int nr_throttled;
347 u64 throttled_time;
316#endif 348#endif
317}; 349};
318 350
319/* task group related information */ 351/* Task group related information */
320struct task_group { 352struct task_group {
321 struct cgroup_subsys_state css; 353 struct cgroup_subsys_state css;
322 354
323#ifdef CONFIG_FAIR_GROUP_SCHED 355#ifdef CONFIG_FAIR_GROUP_SCHED
324 /* schedulable entities of this group on each cpu */ 356 /* schedulable entities of this group on each CPU */
325 struct sched_entity **se; 357 struct sched_entity **se;
326 /* runqueue "owned" by this group on each cpu */ 358 /* runqueue "owned" by this group on each CPU */
327 struct cfs_rq **cfs_rq; 359 struct cfs_rq **cfs_rq;
328 unsigned long shares; 360 unsigned long shares;
329 361
330#ifdef CONFIG_SMP 362#ifdef CONFIG_SMP
331 /* 363 /*
@@ -333,29 +365,29 @@ struct task_group {
333 * it in its own cacheline separated from the fields above which 365 * it in its own cacheline separated from the fields above which
334 * will also be accessed at each tick. 366 * will also be accessed at each tick.
335 */ 367 */
336 atomic_long_t load_avg ____cacheline_aligned; 368 atomic_long_t load_avg ____cacheline_aligned;
337#endif 369#endif
338#endif 370#endif
339 371
340#ifdef CONFIG_RT_GROUP_SCHED 372#ifdef CONFIG_RT_GROUP_SCHED
341 struct sched_rt_entity **rt_se; 373 struct sched_rt_entity **rt_se;
342 struct rt_rq **rt_rq; 374 struct rt_rq **rt_rq;
343 375
344 struct rt_bandwidth rt_bandwidth; 376 struct rt_bandwidth rt_bandwidth;
345#endif 377#endif
346 378
347 struct rcu_head rcu; 379 struct rcu_head rcu;
348 struct list_head list; 380 struct list_head list;
349 381
350 struct task_group *parent; 382 struct task_group *parent;
351 struct list_head siblings; 383 struct list_head siblings;
352 struct list_head children; 384 struct list_head children;
353 385
354#ifdef CONFIG_SCHED_AUTOGROUP 386#ifdef CONFIG_SCHED_AUTOGROUP
355 struct autogroup *autogroup; 387 struct autogroup *autogroup;
356#endif 388#endif
357 389
358 struct cfs_bandwidth cfs_bandwidth; 390 struct cfs_bandwidth cfs_bandwidth;
359}; 391};
360 392
361#ifdef CONFIG_FAIR_GROUP_SCHED 393#ifdef CONFIG_FAIR_GROUP_SCHED
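
The cfs_bandwidth fields above carry the usual CFS quota/period bookkeeping: at each period boundary the group's runtime is refilled up to quota, per-CPU cfs_rq's draw runtime_remaining from that pool, and a cfs_rq that runs dry is throttled until the next refill. A toy userspace sketch of that arithmetic, with made-up values and hypothetical names (not kernel code):

/* Toy model of the quota/period bookkeeping above; values are made up. */
#include <stdio.h>

int main(void)
{
        unsigned long long period_us = 100000;  /* cfs_period_us */
        unsigned long long quota_us  =  50000;  /* cfs_quota_us  */
        long long runtime_us;

        /* Refill at the start of each period, as the period_timer would. */
        runtime_us = quota_us;

        /* Consume runtime as the group's tasks run. */
        runtime_us -= 70000;            /* 70ms of demand in this period */
        if (runtime_us <= 0)
                printf("group throttled until next refill\n");

        printf("max long-run CPU share: %.2f CPUs\n",
               (double)quota_us / (double)period_us);
        return 0;
}
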
@@ -369,8 +401,8 @@ struct task_group {
369 * (The default weight is 1024 - so there's no practical 401 * (The default weight is 1024 - so there's no practical
370 * limitation from this.) 402 * limitation from this.)
371 */ 403 */
372#define MIN_SHARES (1UL << 1) 404#define MIN_SHARES (1UL << 1)
373#define MAX_SHARES (1UL << 18) 405#define MAX_SHARES (1UL << 18)
374#endif 406#endif
375 407
376typedef int (*tg_visitor)(struct task_group *, void *); 408typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
443 475
444/* CFS-related fields in a runqueue */ 476/* CFS-related fields in a runqueue */
445struct cfs_rq { 477struct cfs_rq {
446 struct load_weight load; 478 struct load_weight load;
447 unsigned long runnable_weight; 479 unsigned long runnable_weight;
448 unsigned int nr_running, h_nr_running; 480 unsigned int nr_running;
481 unsigned int h_nr_running;
449 482
450 u64 exec_clock; 483 u64 exec_clock;
451 u64 min_vruntime; 484 u64 min_vruntime;
452#ifndef CONFIG_64BIT 485#ifndef CONFIG_64BIT
453 u64 min_vruntime_copy; 486 u64 min_vruntime_copy;
454#endif 487#endif
455 488
456 struct rb_root_cached tasks_timeline; 489 struct rb_root_cached tasks_timeline;
457 490
458 /* 491 /*
459 * 'curr' points to currently running entity on this cfs_rq. 492 * 'curr' points to currently running entity on this cfs_rq.
460 * It is set to NULL otherwise (i.e. when none are currently running). 493 * It is set to NULL otherwise (i.e. when none are currently running).
461 */ 494 */
462 struct sched_entity *curr, *next, *last, *skip; 495 struct sched_entity *curr;
496 struct sched_entity *next;
497 struct sched_entity *last;
498 struct sched_entity *skip;
463 499
464#ifdef CONFIG_SCHED_DEBUG 500#ifdef CONFIG_SCHED_DEBUG
465 unsigned int nr_spread_over; 501 unsigned int nr_spread_over;
466#endif 502#endif
467 503
468#ifdef CONFIG_SMP 504#ifdef CONFIG_SMP
469 /* 505 /*
470 * CFS load tracking 506 * CFS load tracking
471 */ 507 */
472 struct sched_avg avg; 508 struct sched_avg avg;
473#ifndef CONFIG_64BIT 509#ifndef CONFIG_64BIT
474 u64 load_last_update_time_copy; 510 u64 load_last_update_time_copy;
475#endif 511#endif
476 struct { 512 struct {
477 raw_spinlock_t lock ____cacheline_aligned; 513 raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +518,9 @@ struct cfs_rq {
482 } removed; 518 } removed;
483 519
484#ifdef CONFIG_FAIR_GROUP_SCHED 520#ifdef CONFIG_FAIR_GROUP_SCHED
485 unsigned long tg_load_avg_contrib; 521 unsigned long tg_load_avg_contrib;
486 long propagate; 522 long propagate;
487 long prop_runnable_sum; 523 long prop_runnable_sum;
488 524
489 /* 525 /*
490 * h_load = weight * f(tg) 526 * h_load = weight * f(tg)
@@ -492,36 +528,38 @@ struct cfs_rq {
492 * Where f(tg) is the recursive weight fraction assigned to 528 * Where f(tg) is the recursive weight fraction assigned to
493 * this group. 529 * this group.
494 */ 530 */
495 unsigned long h_load; 531 unsigned long h_load;
496 u64 last_h_load_update; 532 u64 last_h_load_update;
497 struct sched_entity *h_load_next; 533 struct sched_entity *h_load_next;
498#endif /* CONFIG_FAIR_GROUP_SCHED */ 534#endif /* CONFIG_FAIR_GROUP_SCHED */
499#endif /* CONFIG_SMP */ 535#endif /* CONFIG_SMP */
500 536
501#ifdef CONFIG_FAIR_GROUP_SCHED 537#ifdef CONFIG_FAIR_GROUP_SCHED
502 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 538 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
503 539
504 /* 540 /*
505 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 541 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
506 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 542 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
507 * (like users, containers etc.) 543 * (like users, containers etc.)
508 * 544 *
509 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 545 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
510 * list is used during load balance. 546 * This list is used during load balance.
511 */ 547 */
512 int on_list; 548 int on_list;
513 struct list_head leaf_cfs_rq_list; 549 struct list_head leaf_cfs_rq_list;
514 struct task_group *tg; /* group that "owns" this runqueue */ 550 struct task_group *tg; /* group that "owns" this runqueue */
515 551
516#ifdef CONFIG_CFS_BANDWIDTH 552#ifdef CONFIG_CFS_BANDWIDTH
517 int runtime_enabled; 553 int runtime_enabled;
518 u64 runtime_expires; 554 u64 runtime_expires;
519 s64 runtime_remaining; 555 s64 runtime_remaining;
520 556
521 u64 throttled_clock, throttled_clock_task; 557 u64 throttled_clock;
522 u64 throttled_clock_task_time; 558 u64 throttled_clock_task;
523 int throttled, throttle_count; 559 u64 throttled_clock_task_time;
524 struct list_head throttled_list; 560 int throttled;
561 int throttle_count;
562 struct list_head throttled_list;
525#endif /* CONFIG_CFS_BANDWIDTH */ 563#endif /* CONFIG_CFS_BANDWIDTH */
526#endif /* CONFIG_FAIR_GROUP_SCHED */ 564#endif /* CONFIG_FAIR_GROUP_SCHED */
527}; 565};
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
538 576
539/* Real-Time classes' related field in a runqueue: */ 577/* Real-Time classes' related field in a runqueue: */
540struct rt_rq { 578struct rt_rq {
541 struct rt_prio_array active; 579 struct rt_prio_array active;
542 unsigned int rt_nr_running; 580 unsigned int rt_nr_running;
543 unsigned int rr_nr_running; 581 unsigned int rr_nr_running;
544#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 582#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
545 struct { 583 struct {
546 int curr; /* highest queued rt task prio */ 584 int curr; /* highest queued rt task prio */
547#ifdef CONFIG_SMP 585#ifdef CONFIG_SMP
548 int next; /* next highest */ 586 int next; /* next highest */
549#endif 587#endif
550 } highest_prio; 588 } highest_prio;
551#endif 589#endif
552#ifdef CONFIG_SMP 590#ifdef CONFIG_SMP
553 unsigned long rt_nr_migratory; 591 unsigned long rt_nr_migratory;
554 unsigned long rt_nr_total; 592 unsigned long rt_nr_total;
555 int overloaded; 593 int overloaded;
556 struct plist_head pushable_tasks; 594 struct plist_head pushable_tasks;
557#endif /* CONFIG_SMP */ 595#endif /* CONFIG_SMP */
558 int rt_queued; 596 int rt_queued;
559 597
560 int rt_throttled; 598 int rt_throttled;
561 u64 rt_time; 599 u64 rt_time;
562 u64 rt_runtime; 600 u64 rt_runtime;
563 /* Nests inside the rq lock: */ 601 /* Nests inside the rq lock: */
564 raw_spinlock_t rt_runtime_lock; 602 raw_spinlock_t rt_runtime_lock;
565 603
566#ifdef CONFIG_RT_GROUP_SCHED 604#ifdef CONFIG_RT_GROUP_SCHED
567 unsigned long rt_nr_boosted; 605 unsigned long rt_nr_boosted;
568 606
569 struct rq *rq; 607 struct rq *rq;
570 struct task_group *tg; 608 struct task_group *tg;
571#endif 609#endif
572}; 610};
573 611
574/* Deadline class' related fields in a runqueue */ 612/* Deadline class' related fields in a runqueue */
575struct dl_rq { 613struct dl_rq {
576 /* runqueue is an rbtree, ordered by deadline */ 614 /* runqueue is an rbtree, ordered by deadline */
577 struct rb_root_cached root; 615 struct rb_root_cached root;
578 616
579 unsigned long dl_nr_running; 617 unsigned long dl_nr_running;
580 618
581#ifdef CONFIG_SMP 619#ifdef CONFIG_SMP
582 /* 620 /*
@@ -586,28 +624,28 @@ struct dl_rq {
586 * should migrate somewhere else. 624 * should migrate somewhere else.
587 */ 625 */
588 struct { 626 struct {
589 u64 curr; 627 u64 curr;
590 u64 next; 628 u64 next;
591 } earliest_dl; 629 } earliest_dl;
592 630
593 unsigned long dl_nr_migratory; 631 unsigned long dl_nr_migratory;
594 int overloaded; 632 int overloaded;
595 633
596 /* 634 /*
597 * Tasks on this rq that can be pushed away. They are kept in 635 * Tasks on this rq that can be pushed away. They are kept in
598 * an rb-tree, ordered by tasks' deadlines, with caching 636 * an rb-tree, ordered by tasks' deadlines, with caching
599 * of the leftmost (earliest deadline) element. 637 * of the leftmost (earliest deadline) element.
600 */ 638 */
601 struct rb_root_cached pushable_dl_tasks_root; 639 struct rb_root_cached pushable_dl_tasks_root;
602#else 640#else
603 struct dl_bw dl_bw; 641 struct dl_bw dl_bw;
604#endif 642#endif
605 /* 643 /*
606 * "Active utilization" for this runqueue: increased when a 644 * "Active utilization" for this runqueue: increased when a
607 * task wakes up (becomes TASK_RUNNING) and decreased when a 645 * task wakes up (becomes TASK_RUNNING) and decreased when a
608 * task blocks 646 * task blocks
609 */ 647 */
610 u64 running_bw; 648 u64 running_bw;
611 649
612 /* 650 /*
613 * Utilization of the tasks "assigned" to this runqueue (including 651 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +656,14 @@ struct dl_rq {
618 * This is needed to compute the "inactive utilization" for the 656 * This is needed to compute the "inactive utilization" for the
619 * runqueue (inactive utilization = this_bw - running_bw). 657 * runqueue (inactive utilization = this_bw - running_bw).
620 */ 658 */
621 u64 this_bw; 659 u64 this_bw;
622 u64 extra_bw; 660 u64 extra_bw;
623 661
624 /* 662 /*
625 * Inverse of the fraction of CPU utilization that can be reclaimed 663 * Inverse of the fraction of CPU utilization that can be reclaimed
626 * by the GRUB algorithm. 664 * by the GRUB algorithm.
627 */ 665 */
628 u64 bw_ratio; 666 u64 bw_ratio;
629}; 667};
630 668
631#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
638/* 676/*
639 * We add the notion of a root-domain which will be used to define per-domain 677 * We add the notion of a root-domain which will be used to define per-domain
640 * variables. Each exclusive cpuset essentially defines an island domain by 678 * variables. Each exclusive cpuset essentially defines an island domain by
641 * fully partitioning the member cpus from any other cpuset. Whenever a new 679 * fully partitioning the member CPUs from any other cpuset. Whenever a new
642 * exclusive cpuset is created, we also create and attach a new root-domain 680 * exclusive cpuset is created, we also create and attach a new root-domain
643 * object. 681 * object.
644 * 682 *
645 */ 683 */
646struct root_domain { 684struct root_domain {
647 atomic_t refcount; 685 atomic_t refcount;
648 atomic_t rto_count; 686 atomic_t rto_count;
649 struct rcu_head rcu; 687 struct rcu_head rcu;
650 cpumask_var_t span; 688 cpumask_var_t span;
651 cpumask_var_t online; 689 cpumask_var_t online;
652 690
653 /* Indicate more than one runnable task for any CPU */ 691 /* Indicate more than one runnable task for any CPU */
654 bool overload; 692 bool overload;
655 693
656 /* 694 /*
657 * The bit corresponding to a CPU gets set here if such CPU has more 695 * The bit corresponding to a CPU gets set here if such CPU has more
658 * than one runnable -deadline task (as it is below for RT tasks). 696 * than one runnable -deadline task (as it is below for RT tasks).
659 */ 697 */
660 cpumask_var_t dlo_mask; 698 cpumask_var_t dlo_mask;
661 atomic_t dlo_count; 699 atomic_t dlo_count;
662 struct dl_bw dl_bw; 700 struct dl_bw dl_bw;
663 struct cpudl cpudl; 701 struct cpudl cpudl;
664 702
665#ifdef HAVE_RT_PUSH_IPI 703#ifdef HAVE_RT_PUSH_IPI
666 /* 704 /*
667 * For IPI pull requests, loop across the rto_mask. 705 * For IPI pull requests, loop across the rto_mask.
668 */ 706 */
669 struct irq_work rto_push_work; 707 struct irq_work rto_push_work;
670 raw_spinlock_t rto_lock; 708 raw_spinlock_t rto_lock;
671 /* These are only updated and read within rto_lock */ 709 /* These are only updated and read within rto_lock */
672 int rto_loop; 710 int rto_loop;
673 int rto_cpu; 711 int rto_cpu;
674 /* These atomics are updated outside of a lock */ 712 /* These atomics are updated outside of a lock */
675 atomic_t rto_loop_next; 713 atomic_t rto_loop_next;
676 atomic_t rto_loop_start; 714 atomic_t rto_loop_start;
677#endif 715#endif
678 /* 716 /*
679 * The "RT overload" flag: it gets set if a CPU has more than 717 * The "RT overload" flag: it gets set if a CPU has more than
680 * one runnable RT task. 718 * one runnable RT task.
681 */ 719 */
682 cpumask_var_t rto_mask; 720 cpumask_var_t rto_mask;
683 struct cpupri cpupri; 721 struct cpupri cpupri;
684 722
685 unsigned long max_cpu_capacity; 723 unsigned long max_cpu_capacity;
686}; 724};
687 725
688extern struct root_domain def_root_domain; 726extern struct root_domain def_root_domain;
@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work);
708 */ 746 */
709struct rq { 747struct rq {
710 /* runqueue lock: */ 748 /* runqueue lock: */
711 raw_spinlock_t lock; 749 raw_spinlock_t lock;
712 750
713 /* 751 /*
714 * nr_running and cpu_load should be in the same cacheline because 752 * nr_running and cpu_load should be in the same cacheline because
715 * remote CPUs use both these fields when doing load calculation. 753 * remote CPUs use both these fields when doing load calculation.
716 */ 754 */
717 unsigned int nr_running; 755 unsigned int nr_running;
718#ifdef CONFIG_NUMA_BALANCING 756#ifdef CONFIG_NUMA_BALANCING
719 unsigned int nr_numa_running; 757 unsigned int nr_numa_running;
720 unsigned int nr_preferred_running; 758 unsigned int nr_preferred_running;
721#endif 759#endif
722 #define CPU_LOAD_IDX_MAX 5 760 #define CPU_LOAD_IDX_MAX 5
723 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 761 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
724#ifdef CONFIG_NO_HZ_COMMON 762#ifdef CONFIG_NO_HZ_COMMON
725#ifdef CONFIG_SMP 763#ifdef CONFIG_SMP
726 unsigned long last_load_update_tick; 764 unsigned long last_load_update_tick;
765 unsigned long last_blocked_load_update_tick;
766 unsigned int has_blocked_load;
727#endif /* CONFIG_SMP */ 767#endif /* CONFIG_SMP */
728 unsigned long nohz_flags; 768 unsigned int nohz_tick_stopped;
769 atomic_t nohz_flags;
729#endif /* CONFIG_NO_HZ_COMMON */ 770#endif /* CONFIG_NO_HZ_COMMON */
730#ifdef CONFIG_NO_HZ_FULL
731 unsigned long last_sched_tick;
732#endif
733 /* capture load from *all* tasks on this cpu: */
734 struct load_weight load;
735 unsigned long nr_load_updates;
736 u64 nr_switches;
737 771
738 struct cfs_rq cfs; 772 /* capture load from *all* tasks on this CPU: */
739 struct rt_rq rt; 773 struct load_weight load;
740 struct dl_rq dl; 774 unsigned long nr_load_updates;
775 u64 nr_switches;
776
777 struct cfs_rq cfs;
778 struct rt_rq rt;
779 struct dl_rq dl;
741 780
742#ifdef CONFIG_FAIR_GROUP_SCHED 781#ifdef CONFIG_FAIR_GROUP_SCHED
743 /* list of leaf cfs_rq on this cpu: */ 782 /* list of leaf cfs_rq on this CPU: */
744 struct list_head leaf_cfs_rq_list; 783 struct list_head leaf_cfs_rq_list;
745 struct list_head *tmp_alone_branch; 784 struct list_head *tmp_alone_branch;
746#endif /* CONFIG_FAIR_GROUP_SCHED */ 785#endif /* CONFIG_FAIR_GROUP_SCHED */
747 786
748 /* 787 /*
@@ -751,94 +790,98 @@ struct rq {
751 * one CPU and if it got migrated afterwards it may decrease 790 * one CPU and if it got migrated afterwards it may decrease
752 * it on another CPU. Always updated under the runqueue lock: 791 * it on another CPU. Always updated under the runqueue lock:
753 */ 792 */
754 unsigned long nr_uninterruptible; 793 unsigned long nr_uninterruptible;
755 794
756 struct task_struct *curr, *idle, *stop; 795 struct task_struct *curr;
757 unsigned long next_balance; 796 struct task_struct *idle;
758 struct mm_struct *prev_mm; 797 struct task_struct *stop;
798 unsigned long next_balance;
799 struct mm_struct *prev_mm;
759 800
760 unsigned int clock_update_flags; 801 unsigned int clock_update_flags;
761 u64 clock; 802 u64 clock;
762 u64 clock_task; 803 u64 clock_task;
763 804
764 atomic_t nr_iowait; 805 atomic_t nr_iowait;
765 806
766#ifdef CONFIG_SMP 807#ifdef CONFIG_SMP
767 struct root_domain *rd; 808 struct root_domain *rd;
768 struct sched_domain *sd; 809 struct sched_domain *sd;
769 810
770 unsigned long cpu_capacity; 811 unsigned long cpu_capacity;
771 unsigned long cpu_capacity_orig; 812 unsigned long cpu_capacity_orig;
772 813
773 struct callback_head *balance_callback; 814 struct callback_head *balance_callback;
815
816 unsigned char idle_balance;
774 817
775 unsigned char idle_balance;
776 /* For active balancing */ 818 /* For active balancing */
777 int active_balance; 819 int active_balance;
778 int push_cpu; 820 int push_cpu;
779 struct cpu_stop_work active_balance_work; 821 struct cpu_stop_work active_balance_work;
780 /* cpu of this runqueue: */ 822
781 int cpu; 823 /* CPU of this runqueue: */
782 int online; 824 int cpu;
825 int online;
783 826
784 struct list_head cfs_tasks; 827 struct list_head cfs_tasks;
785 828
786 u64 rt_avg; 829 u64 rt_avg;
787 u64 age_stamp; 830 u64 age_stamp;
788 u64 idle_stamp; 831 u64 idle_stamp;
789 u64 avg_idle; 832 u64 avg_idle;
790 833
791 /* This is used to determine avg_idle's max value */ 834 /* This is used to determine avg_idle's max value */
792 u64 max_idle_balance_cost; 835 u64 max_idle_balance_cost;
793#endif 836#endif
794 837
795#ifdef CONFIG_IRQ_TIME_ACCOUNTING 838#ifdef CONFIG_IRQ_TIME_ACCOUNTING
796 u64 prev_irq_time; 839 u64 prev_irq_time;
797#endif 840#endif
798#ifdef CONFIG_PARAVIRT 841#ifdef CONFIG_PARAVIRT
799 u64 prev_steal_time; 842 u64 prev_steal_time;
800#endif 843#endif
801#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 844#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
802 u64 prev_steal_time_rq; 845 u64 prev_steal_time_rq;
803#endif 846#endif
804 847
805 /* calc_load related fields */ 848 /* calc_load related fields */
806 unsigned long calc_load_update; 849 unsigned long calc_load_update;
807 long calc_load_active; 850 long calc_load_active;
808 851
809#ifdef CONFIG_SCHED_HRTICK 852#ifdef CONFIG_SCHED_HRTICK
810#ifdef CONFIG_SMP 853#ifdef CONFIG_SMP
811 int hrtick_csd_pending; 854 int hrtick_csd_pending;
812 call_single_data_t hrtick_csd; 855 call_single_data_t hrtick_csd;
813#endif 856#endif
814 struct hrtimer hrtick_timer; 857 struct hrtimer hrtick_timer;
815#endif 858#endif
816 859
817#ifdef CONFIG_SCHEDSTATS 860#ifdef CONFIG_SCHEDSTATS
818 /* latency stats */ 861 /* latency stats */
819 struct sched_info rq_sched_info; 862 struct sched_info rq_sched_info;
820 unsigned long long rq_cpu_time; 863 unsigned long long rq_cpu_time;
821 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 864 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
822 865
823 /* sys_sched_yield() stats */ 866 /* sys_sched_yield() stats */
824 unsigned int yld_count; 867 unsigned int yld_count;
825 868
826 /* schedule() stats */ 869 /* schedule() stats */
827 unsigned int sched_count; 870 unsigned int sched_count;
828 unsigned int sched_goidle; 871 unsigned int sched_goidle;
829 872
830 /* try_to_wake_up() stats */ 873 /* try_to_wake_up() stats */
831 unsigned int ttwu_count; 874 unsigned int ttwu_count;
832 unsigned int ttwu_local; 875 unsigned int ttwu_local;
833#endif 876#endif
834 877
835#ifdef CONFIG_SMP 878#ifdef CONFIG_SMP
836 struct llist_head wake_list; 879 struct llist_head wake_list;
837#endif 880#endif
838 881
839#ifdef CONFIG_CPU_IDLE 882#ifdef CONFIG_CPU_IDLE
840 /* Must be inspected within a rcu lock section */ 883 /* Must be inspected within a rcu lock section */
841 struct cpuidle_state *idle_state; 884 struct cpuidle_state *idle_state;
842#endif 885#endif
843}; 886};
844 887
@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
904 * one position though, because the next rq_unpin_lock() will shift it 947 * one position though, because the next rq_unpin_lock() will shift it
905 * back. 948 * back.
906 */ 949 */
907#define RQCF_REQ_SKIP 0x01 950#define RQCF_REQ_SKIP 0x01
908#define RQCF_ACT_SKIP 0x02 951#define RQCF_ACT_SKIP 0x02
909#define RQCF_UPDATED 0x04 952#define RQCF_UPDATED 0x04
910 953
911static inline void assert_clock_updated(struct rq *rq) 954static inline void assert_clock_updated(struct rq *rq)
912{ 955{
@@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void);
1059 1102
1060/** 1103/**
1061 * highest_flag_domain - Return highest sched_domain containing flag. 1104 * highest_flag_domain - Return highest sched_domain containing flag.
1062 * @cpu: The cpu whose highest level of sched domain is to 1105 * @cpu: The CPU whose highest level of sched domain is to
1063 * be returned. 1106 * be returned.
1064 * @flag: The flag to check for the highest sched_domain 1107 * @flag: The flag to check for the highest sched_domain
1065 * for the given cpu. 1108 * for the given CPU.
1066 * 1109 *
1067 * Returns the highest sched_domain of a cpu which contains the given flag. 1110 * Returns the highest sched_domain of a CPU which contains the given flag.
1068 */ 1111 */
1069static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1112static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1070{ 1113{
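
Per its kernel-doc above, highest_flag_domain() climbs the sched_domain hierarchy and returns the topmost level that still carries the requested flag, assuming the usual walk that stops at the first level missing it. A plain-C analog of that walk over a parent-linked list, with 'struct dom' standing in for struct sched_domain:

/* Simplified analog of the flag walk described above. */
#include <stdio.h>
#include <stddef.h>

struct dom {
        struct dom *parent;
        int flags;
        const char *name;
};

static struct dom *highest_flag_dom(struct dom *sd, int flag)
{
        struct dom *hsd = NULL;

        for (; sd; sd = sd->parent) {
                if (!(sd->flags & flag))
                        break;
                hsd = sd;
        }
        return hsd;
}

int main(void)
{
        struct dom numa = { NULL,  0x0, "NUMA" };
        struct dom mc   = { &numa, 0x1, "MC"   };
        struct dom smt  = { &mc,   0x1, "SMT"  };
        struct dom *hsd = highest_flag_dom(&smt, 0x1);

        printf("highest domain with flag: %s\n", hsd ? hsd->name : "none");
        return 0;
}
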
@@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
1099DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1142DECLARE_PER_CPU(struct sched_domain *, sd_asym);
1100 1143
1101struct sched_group_capacity { 1144struct sched_group_capacity {
1102 atomic_t ref; 1145 atomic_t ref;
1103 /* 1146 /*
1104 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1147 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1105 * for a single CPU. 1148 * for a single CPU.
1106 */ 1149 */
1107 unsigned long capacity; 1150 unsigned long capacity;
1108 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1151 unsigned long min_capacity; /* Min per-CPU capacity in group */
1109 unsigned long next_update; 1152 unsigned long next_update;
1110 int imbalance; /* XXX unrelated to capacity but shared group state */ 1153 int imbalance; /* XXX unrelated to capacity but shared group state */
1111 1154
1112#ifdef CONFIG_SCHED_DEBUG 1155#ifdef CONFIG_SCHED_DEBUG
1113 int id; 1156 int id;
1114#endif 1157#endif
1115 1158
1116 unsigned long cpumask[0]; /* balance mask */ 1159 unsigned long cpumask[0]; /* Balance mask */
1117}; 1160};
1118 1161
1119struct sched_group { 1162struct sched_group {
1120 struct sched_group *next; /* Must be a circular list */ 1163 struct sched_group *next; /* Must be a circular list */
1121 atomic_t ref; 1164 atomic_t ref;
1122 1165
1123 unsigned int group_weight; 1166 unsigned int group_weight;
1124 struct sched_group_capacity *sgc; 1167 struct sched_group_capacity *sgc;
1125 int asym_prefer_cpu; /* cpu of highest priority in group */ 1168 int asym_prefer_cpu; /* CPU of highest priority in group */
1126 1169
1127 /* 1170 /*
1128 * The CPUs this group covers. 1171 * The CPUs this group covers.
@@ -1131,7 +1174,7 @@ struct sched_group {
1131 * by attaching extra space to the end of the structure, 1174 * by attaching extra space to the end of the structure,
1132 * depending on how many CPUs the kernel has booted up with) 1175 * depending on how many CPUs the kernel has booted up with)
1133 */ 1176 */
1134 unsigned long cpumask[0]; 1177 unsigned long cpumask[0];
1135}; 1178};
1136 1179
1137static inline struct cpumask *sched_group_span(struct sched_group *sg) 1180static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1148} 1191}
1149 1192
1150/** 1193/**
1151 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 1194 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1152 * @group: The group whose first cpu is to be returned. 1195 * @group: The group whose first CPU is to be returned.
1153 */ 1196 */
1154static inline unsigned int group_first_cpu(struct sched_group *group) 1197static inline unsigned int group_first_cpu(struct sched_group *group)
1155{ 1198{
@@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1349 return p->on_rq == TASK_ON_RQ_MIGRATING; 1392 return p->on_rq == TASK_ON_RQ_MIGRATING;
1350} 1393}
1351 1394
1352#ifndef prepare_arch_switch
1353# define prepare_arch_switch(next) do { } while (0)
1354#endif
1355#ifndef finish_arch_post_lock_switch
1356# define finish_arch_post_lock_switch() do { } while (0)
1357#endif
1358
1359/* 1395/*
1360 * wake flags 1396 * wake flags
1361 */ 1397 */
1362#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1398#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1363#define WF_FORK 0x02 /* child wakeup after fork */ 1399#define WF_FORK 0x02 /* Child wakeup after fork */
1364#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 1400#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
1365 1401
1366/* 1402/*
1367 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1403 * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1372 * slice expiry etc. 1408 * slice expiry etc.
1373 */ 1409 */
1374 1410
1375#define WEIGHT_IDLEPRIO 3 1411#define WEIGHT_IDLEPRIO 3
1376#define WMULT_IDLEPRIO 1431655765 1412#define WMULT_IDLEPRIO 1431655765
1377 1413
1378extern const int sched_prio_to_weight[40]; 1414extern const int sched_prio_to_weight[40];
1379extern const u32 sched_prio_to_wmult[40]; 1415extern const u32 sched_prio_to_wmult[40];
1380 1416
1381/* 1417/*
1382 * {de,en}queue flags: 1418 * {de,en}queue flags:
@@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40];
1398 */ 1434 */
1399 1435
1400#define DEQUEUE_SLEEP 0x01 1436#define DEQUEUE_SLEEP 0x01
1401#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1437#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1402#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1438#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1403#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ 1439#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
1404 1440
1405#define ENQUEUE_WAKEUP 0x01 1441#define ENQUEUE_WAKEUP 0x01
1406#define ENQUEUE_RESTORE 0x02 1442#define ENQUEUE_RESTORE 0x02
@@ -1422,10 +1458,10 @@ struct sched_class {
1422 1458
1423 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1459 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1424 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1460 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1425 void (*yield_task) (struct rq *rq); 1461 void (*yield_task) (struct rq *rq);
1426 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1462 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1427 1463
1428 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1464 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1429 1465
1430 /* 1466 /*
1431 * It is the responsibility of the pick_next_task() method that will 1467 * It is the responsibility of the pick_next_task() method that will
@@ -1435,16 +1471,16 @@ struct sched_class {
1435 * May return RETRY_TASK when it finds a higher prio class has runnable 1471 * May return RETRY_TASK when it finds a higher prio class has runnable
1436 * tasks. 1472 * tasks.
1437 */ 1473 */
1438 struct task_struct * (*pick_next_task) (struct rq *rq, 1474 struct task_struct * (*pick_next_task)(struct rq *rq,
1439 struct task_struct *prev, 1475 struct task_struct *prev,
1440 struct rq_flags *rf); 1476 struct rq_flags *rf);
1441 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1477 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1442 1478
1443#ifdef CONFIG_SMP 1479#ifdef CONFIG_SMP
1444 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1480 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1445 void (*migrate_task_rq)(struct task_struct *p); 1481 void (*migrate_task_rq)(struct task_struct *p);
1446 1482
1447 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1483 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1448 1484
1449 void (*set_cpus_allowed)(struct task_struct *p, 1485 void (*set_cpus_allowed)(struct task_struct *p,
1450 const struct cpumask *newmask); 1486 const struct cpumask *newmask);
@@ -1453,31 +1489,31 @@ struct sched_class {
1453 void (*rq_offline)(struct rq *rq); 1489 void (*rq_offline)(struct rq *rq);
1454#endif 1490#endif
1455 1491
1456 void (*set_curr_task) (struct rq *rq); 1492 void (*set_curr_task)(struct rq *rq);
1457 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1493 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1458 void (*task_fork) (struct task_struct *p); 1494 void (*task_fork)(struct task_struct *p);
1459 void (*task_dead) (struct task_struct *p); 1495 void (*task_dead)(struct task_struct *p);
1460 1496
1461 /* 1497 /*
1462 * The switched_from() call is allowed to drop rq->lock, therefore we 1498 * The switched_from() call is allowed to drop rq->lock, therefore we
1463 * cannot assume the switched_from/switched_to pair is serialized by 1499 * cannot assume the switched_from/switched_to pair is serialized by
1464 * rq->lock. They are however serialized by p->pi_lock. 1500 * rq->lock. They are however serialized by p->pi_lock.
1465 */ 1501 */
1466 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1502 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1467 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1503 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1468 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1504 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1469 int oldprio); 1505 int oldprio);
1470 1506
1471 unsigned int (*get_rr_interval) (struct rq *rq, 1507 unsigned int (*get_rr_interval)(struct rq *rq,
1472 struct task_struct *task); 1508 struct task_struct *task);
1473 1509
1474 void (*update_curr) (struct rq *rq); 1510 void (*update_curr)(struct rq *rq);
1475 1511
1476#define TASK_SET_GROUP 0 1512#define TASK_SET_GROUP 0
1477#define TASK_MOVE_GROUP 1 1513#define TASK_MOVE_GROUP 1
1478 1514
1479#ifdef CONFIG_FAIR_GROUP_SCHED 1515#ifdef CONFIG_FAIR_GROUP_SCHED
1480 void (*task_change_group) (struct task_struct *p, int type); 1516 void (*task_change_group)(struct task_struct *p, int type);
1481#endif 1517#endif
1482}; 1518};
1483 1519
@@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq,
1526static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1562static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1527{ 1563{
1528 SCHED_WARN_ON(!rcu_read_lock_held()); 1564 SCHED_WARN_ON(!rcu_read_lock_held());
1565
1529 return rq->idle_state; 1566 return rq->idle_state;
1530} 1567}
1531#else 1568#else
@@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1564extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1601extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1565extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1602extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1566 1603
1567#define BW_SHIFT 20 1604#define BW_SHIFT 20
1568#define BW_UNIT (1 << BW_SHIFT) 1605#define BW_UNIT (1 << BW_SHIFT)
1569#define RATIO_SHIFT 8 1606#define RATIO_SHIFT 8
1570unsigned long to_ratio(u64 period, u64 runtime); 1607unsigned long to_ratio(u64 period, u64 runtime);
1571 1608
1572extern void init_entity_runnable_average(struct sched_entity *se); 1609extern void init_entity_runnable_average(struct sched_entity *se);
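
BW_SHIFT and BW_UNIT above fix the scale for bandwidth values, and to_ratio() converts a (period, runtime) pair into that scale. A minimal sketch of the core shift-and-divide arithmetic; the real helper lives elsewhere and additionally special-cases infinite runtime:

/* Sketch of the fixed-point conversion implied by BW_SHIFT/BW_UNIT:
 * ratio = runtime / period, scaled by 2^BW_SHIFT. */
#include <stdio.h>

#define BW_SHIFT        20
#define BW_UNIT         (1ULL << BW_SHIFT)

static unsigned long long ratio(unsigned long long period,
                                unsigned long long runtime)
{
        if (!period)
                return 0;
        return (runtime << BW_SHIFT) / period;
}

int main(void)
{
        /* 25ms of runtime every 100ms -> 0.25 of BW_UNIT. */
        printf("%.2f\n", (double)ratio(100000, 25000) / BW_UNIT);
        return 0;
}
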
@@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
1574 1611
1575#ifdef CONFIG_NO_HZ_FULL 1612#ifdef CONFIG_NO_HZ_FULL
1576extern bool sched_can_stop_tick(struct rq *rq); 1613extern bool sched_can_stop_tick(struct rq *rq);
1614extern int __init sched_tick_offload_init(void);
1577 1615
1578/* 1616/*
1579 * Tick may be needed by tasks in the runqueue depending on their policy and 1617 * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
1598 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 1636 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1599} 1637}
1600#else 1638#else
1639static inline int sched_tick_offload_init(void) { return 0; }
1601static inline void sched_update_tick_dependency(struct rq *rq) { } 1640static inline void sched_update_tick_dependency(struct rq *rq) { }
1602#endif 1641#endif
1603 1642
@@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
1624 sched_update_tick_dependency(rq); 1663 sched_update_tick_dependency(rq);
1625} 1664}
1626 1665
1627static inline void rq_last_tick_reset(struct rq *rq)
1628{
1629#ifdef CONFIG_NO_HZ_FULL
1630 rq->last_sched_tick = jiffies;
1631#endif
1632}
1633
1634extern void update_rq_clock(struct rq *rq); 1666extern void update_rq_clock(struct rq *rq);
1635 1667
1636extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1668extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1821/* 1853/*
1822 * Unfair double_lock_balance: Optimizes throughput at the expense of 1854 * Unfair double_lock_balance: Optimizes throughput at the expense of
1823 * latency by eliminating extra atomic operations when the locks are 1855 * latency by eliminating extra atomic operations when the locks are
1824 * already in proper order on entry. This favors lower cpu-ids and will 1856 * already in proper order on entry. This favors lower CPU-ids and will
1825 * grant the double lock to lower cpus over higher ids under contention, 1857 * grant the double lock to lower CPUs over higher ids under contention,
1826 * regardless of entry order into the function. 1858 * regardless of entry order into the function.
1827 */ 1859 */
1828static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1860static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1854static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1886static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1855{ 1887{
1856 if (unlikely(!irqs_disabled())) { 1888 if (unlikely(!irqs_disabled())) {
1857 /* printk() doesn't work good under rq->lock */ 1889 /* printk() doesn't work well under rq->lock */
1858 raw_spin_unlock(&this_rq->lock); 1890 raw_spin_unlock(&this_rq->lock);
1859 BUG_ON(1); 1891 BUG_ON(1);
1860 } 1892 }
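
The comment above describes the standard deadlock-avoidance rule for taking two runqueue locks: always acquire them in one fixed (lower-first) order, dropping and re-taking this_rq's lock whenever that order would otherwise be violated. The ordering idea in isolation, with pthread mutexes and pointer comparison standing in for the lower-CPU-id rule (an illustration, not the kernel implementation):

/* Lock-ordering sketch: acquire two locks in a fixed global order so
 * two CPUs can never each hold one lock and wait on the other. */
#include <pthread.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        static pthread_mutex_t l1 = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t l2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&l1, &l2);
        double_unlock(&l1, &l2);
        return 0;
}
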
@@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void);
2005extern void cfs_bandwidth_usage_dec(void); 2037extern void cfs_bandwidth_usage_dec(void);
2006 2038
2007#ifdef CONFIG_NO_HZ_COMMON 2039#ifdef CONFIG_NO_HZ_COMMON
2008enum rq_nohz_flag_bits { 2040#define NOHZ_BALANCE_KICK_BIT 0
2009 NOHZ_TICK_STOPPED, 2041#define NOHZ_STATS_KICK_BIT 1
2010 NOHZ_BALANCE_KICK, 2042
2011}; 2043#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
2044#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
2045
2046#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
2012 2047
2013#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 2048#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
2014 2049
2015extern void nohz_balance_exit_idle(unsigned int cpu); 2050extern void nohz_balance_exit_idle(struct rq *rq);
2016#else 2051#else
2017static inline void nohz_balance_exit_idle(unsigned int cpu) { } 2052static inline void nohz_balance_exit_idle(struct rq *rq) { }
2018#endif 2053#endif
2019 2054
2020 2055
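
The hunk above replaces the enum-based nohz flags with explicit bit masks stored in an atomic_t, so remote CPUs can set kick bits concurrently and the target CPU can test them against a single mask. A userspace sketch of the same set-and-test discipline using C11 atomics, for illustration only (the kernel uses its own atomic_* API):

/* Set-and-test discipline for the NOHZ kick bits above. */
#include <stdatomic.h>
#include <stdio.h>

#define NOHZ_BALANCE_KICK_BIT   0
#define NOHZ_STATS_KICK_BIT     1
#define NOHZ_BALANCE_KICK       (1U << NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK         (1U << NOHZ_STATS_KICK_BIT)
#define NOHZ_KICK_MASK          (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

int main(void)
{
        atomic_uint nohz_flags = 0;
        unsigned int pending;

        /* A remote CPU requests a balance kick... */
        atomic_fetch_or(&nohz_flags, NOHZ_BALANCE_KICK);

        /* ...and the idle CPU checks whether any kick is pending. */
        pending = atomic_load(&nohz_flags) & NOHZ_KICK_MASK;
        if (pending)
                printf("kick pending: 0x%x\n", pending);

        atomic_fetch_and(&nohz_flags, ~NOHZ_KICK_MASK);
        return 0;
}
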
@@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2113#endif /* CONFIG_CPU_FREQ */ 2148#endif /* CONFIG_CPU_FREQ */
2114 2149
2115#ifdef arch_scale_freq_capacity 2150#ifdef arch_scale_freq_capacity
2116#ifndef arch_scale_freq_invariant 2151# ifndef arch_scale_freq_invariant
2117#define arch_scale_freq_invariant() (true) 2152# define arch_scale_freq_invariant() true
2118#endif 2153# endif
2119#else /* arch_scale_freq_capacity */ 2154#else
2120#define arch_scale_freq_invariant() (false) 2155# define arch_scale_freq_invariant() false
2121#endif 2156#endif
2122 2157
2123#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2158#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2124
2125static inline unsigned long cpu_util_dl(struct rq *rq) 2159static inline unsigned long cpu_util_dl(struct rq *rq)
2126{ 2160{
2127 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2161 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
2129 2163
2130static inline unsigned long cpu_util_cfs(struct rq *rq) 2164static inline unsigned long cpu_util_cfs(struct rq *rq)
2131{ 2165{
2132 return rq->cfs.avg.util_avg; 2166 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
2133} 2167
2168 if (sched_feat(UTIL_EST)) {
2169 util = max_t(unsigned long, util,
2170 READ_ONCE(rq->cfs.avg.util_est.enqueued));
2171 }
2134 2172
2173 return util;
2174}
2135#endif 2175#endif
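
The new cpu_util_cfs() above clamps the PELT util_avg from below with the util_est "enqueued" estimate when the UTIL_EST feature is enabled, so, roughly speaking, a CPU whose utilization signal has decayed while a recently busy task was sleeping is not reported as nearly idle right after that task is enqueued again. The clamping step in isolation, with stub values in place of the rq fields:

/* The UTIL_EST clamp from cpu_util_cfs() above, with stub inputs. */
#include <stdio.h>

static unsigned long cpu_util_cfs_like(unsigned long util_avg,
                                       unsigned long util_est_enqueued,
                                       int util_est_enabled)
{
        unsigned long util = util_avg;

        if (util_est_enabled && util_est_enqueued > util)
                util = util_est_enqueued;

        return util;
}

int main(void)
{
        /* PELT has decayed to 120 while the just-enqueued work is
         * estimated at 400 -> the estimate wins while UTIL_EST is on. */
        printf("%lu\n", cpu_util_cfs_like(120, 400, 1));
        return 0;
}
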
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..ab112cbfd7c8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2 2/*
3#include <linux/slab.h> 3 * /proc/schedstat implementation
4#include <linux/fs.h> 4 */
5#include <linux/seq_file.h>
6#include <linux/proc_fs.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10/* 7/*
11 * bump this up when changing the output format or the meaning of an existing 8 * Current schedstat API version.
9 *
10 * Bump this up when changing the output format or the meaning of an existing
12 * format, so that tools can adapt (or abort) 11 * format, so that tools can adapt (or abort)
13 */ 12 */
14#define SCHEDSTAT_VERSION 15 13#define SCHEDSTAT_VERSION 15
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
78 * This iterator needs some explanation. 77 * This iterator needs some explanation.
79 * It returns 1 for the header position. 78 * It returns 1 for the header position.
80 * This means 2 is cpu 0. 79 * This means 2 is cpu 0.
81 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 80 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
82 * to use cpumask_* to iterate over the cpus. 81 * to use cpumask_* to iterate over the CPUs.
83 */ 82 */
84static void *schedstat_start(struct seq_file *file, loff_t *offset) 83static void *schedstat_start(struct seq_file *file, loff_t *offset)
85{ 84{
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
99 98
100 if (n < nr_cpu_ids) 99 if (n < nr_cpu_ids)
101 return (void *)(unsigned long)(n + 2); 100 return (void *)(unsigned long)(n + 2);
101
102 return NULL; 102 return NULL;
103} 103}
104 104
105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) 105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
106{ 106{
107 (*offset)++; 107 (*offset)++;
108
108 return schedstat_start(file, offset); 109 return schedstat_start(file, offset);
109} 110}
110 111
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
134static int __init proc_schedstat_init(void) 135static int __init proc_schedstat_init(void)
135{ 136{
136 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 137 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
138
137 return 0; 139 return 0;
138} 140}
139subsys_initcall(proc_schedstat_init); 141subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
30 if (rq) 30 if (rq)
31 rq->rq_sched_info.run_delay += delta; 31 rq->rq_sched_info.run_delay += delta;
32} 32}
33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
34#define __schedstat_inc(var) do { var++; } while (0) 34#define __schedstat_inc(var) do { var++; } while (0)
35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
36#define __schedstat_add(var, amt) do { var += (amt); } while (0) 36#define __schedstat_add(var, amt) do { var += (amt); } while (0)
37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) 37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
38#define __schedstat_set(var, val) do { var = (val); } while (0) 38#define __schedstat_set(var, val) do { var = (val); } while (0)
39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
40#define schedstat_val(var) (var) 40#define schedstat_val(var) (var)
41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) 41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
42 42
43#else /* !CONFIG_SCHEDSTATS */ 43#else /* !CONFIG_SCHEDSTATS: */
44static inline void 44static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
45rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 45static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
46{} 46static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
47static inline void 47# define schedstat_enabled() 0
48rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) 48# define __schedstat_inc(var) do { } while (0)
49{} 49# define schedstat_inc(var) do { } while (0)
50static inline void 50# define __schedstat_add(var, amt) do { } while (0)
51rq_sched_info_depart(struct rq *rq, unsigned long long delta) 51# define schedstat_add(var, amt) do { } while (0)
52{} 52# define __schedstat_set(var, val) do { } while (0)
53#define schedstat_enabled() 0 53# define schedstat_set(var, val) do { } while (0)
54#define __schedstat_inc(var) do { } while (0) 54# define schedstat_val(var) 0
55#define schedstat_inc(var) do { } while (0) 55# define schedstat_val_or_zero(var) 0
56#define __schedstat_add(var, amt) do { } while (0)
57#define schedstat_add(var, amt) do { } while (0)
58#define __schedstat_set(var, val) do { } while (0)
59#define schedstat_set(var, val) do { } while (0)
60#define schedstat_val(var) 0
61#define schedstat_val_or_zero(var) 0
62#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
63 57
64#ifdef CONFIG_SCHED_INFO 58#ifdef CONFIG_SCHED_INFO
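
The schedstat_*() macros above compile away entirely when CONFIG_SCHEDSTATS is off, and even when it is on they skip the update unless the sched_schedstats static branch is enabled. The same enabled-guard pattern, reduced to plain C with an ordinary flag standing in for the static key (a sketch of the pattern, not the kernel macros):

/* The guarded-update pattern behind schedstat_inc()/schedstat_add(). */
#include <stdio.h>
#include <stdbool.h>

static bool schedstats_enabled = true;

#define stat_enabled()          (schedstats_enabled)
#define stat_inc(var)           do { if (stat_enabled()) { (var)++; } } while (0)
#define stat_add(var, amt)      do { if (stat_enabled()) { (var) += (amt); } } while (0)

int main(void)
{
        unsigned int yld_count = 0;
        unsigned long long rq_cpu_time = 0;

        stat_inc(yld_count);
        stat_add(rq_cpu_time, 1500);

        printf("yld_count=%u rq_cpu_time=%llu\n", yld_count, rq_cpu_time);
        return 0;
}
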
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
69 63
70/* 64/*
71 * We are interested in knowing how long it was from the *first* time a 65 * We are interested in knowing how long it was from the *first* time a
72 * task was queued to the time that it finally hit a cpu, we call this routine 66 * task was queued to the time that it finally hit a CPU, we call this routine
73 * from dequeue_task() to account for possible rq->clock skew across cpus. The 67 * from dequeue_task() to account for possible rq->clock skew across CPUs. The
74 * delta taken on each cpu would annul the skew. 68 * delta taken on each CPU would annul the skew.
75 */ 69 */
76static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) 70static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
77{ 71{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
87} 81}
88 82
89/* 83/*
90 * Called when a task finally hits the cpu. We can now calculate how 84 * Called when a task finally hits the CPU. We can now calculate how
91 * long it was waiting to run. We also note when it began so that we 85 * long it was waiting to run. We also note when it began so that we
92 * can keep stats on how long its timeslice is. 86 * can keep stats on how long its timeslice is.
93 */ 87 */
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
112 */ 106 */
113static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 107static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
114{ 108{
115 if (unlikely(sched_info_on())) 109 if (unlikely(sched_info_on())) {
116 if (!t->sched_info.last_queued) 110 if (!t->sched_info.last_queued)
117 t->sched_info.last_queued = rq_clock(rq); 111 t->sched_info.last_queued = rq_clock(rq);
112 }
118} 113}
119 114
120/* 115/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
127 */ 122 */
128static inline void sched_info_depart(struct rq *rq, struct task_struct *t) 123static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
129{ 124{
130 unsigned long long delta = rq_clock(rq) - 125 unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
131 t->sched_info.last_arrival;
132 126
133 rq_sched_info_depart(rq, delta); 127 rq_sched_info_depart(rq, delta);
134 128
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
142 * the idle task.) We are only called when prev != next. 136 * the idle task.) We are only called when prev != next.
143 */ 137 */
144static inline void 138static inline void
145__sched_info_switch(struct rq *rq, 139__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
146 struct task_struct *prev, struct task_struct *next)
147{ 140{
148 /* 141 /*
149 * prev now departs the cpu. It's not interesting to record 142 * prev now departs the CPU. It's not interesting to record
150 * stats about how efficient we were at scheduling the idle 143 * stats about how efficient we were at scheduling the idle
151 * process, however. 144 * process, however.
152 */ 145 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
156 if (next != rq->idle) 149 if (next != rq->idle)
157 sched_info_arrive(rq, next); 150 sched_info_arrive(rq, next);
158} 151}
152
159static inline void 153static inline void
160sched_info_switch(struct rq *rq, 154sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
161 struct task_struct *prev, struct task_struct *next)
162{ 155{
163 if (unlikely(sched_info_on())) 156 if (unlikely(sched_info_on()))
164 __sched_info_switch(rq, prev, next); 157 __sched_info_switch(rq, prev, next);
165} 158}
166#else 159
167#define sched_info_queued(rq, t) do { } while (0) 160#else /* !CONFIG_SCHED_INFO: */
168#define sched_info_reset_dequeued(t) do { } while (0) 161# define sched_info_queued(rq, t) do { } while (0)
169#define sched_info_dequeued(rq, t) do { } while (0) 162# define sched_info_reset_dequeued(t) do { } while (0)
170#define sched_info_depart(rq, t) do { } while (0) 163# define sched_info_dequeued(rq, t) do { } while (0)
171#define sched_info_arrive(rq, next) do { } while (0) 164# define sched_info_depart(rq, t) do { } while (0)
172#define sched_info_switch(rq, t, next) do { } while (0) 165# define sched_info_arrive(rq, next) do { } while (0)
166# define sched_info_switch(rq, t, next) do { } while (0)
173#endif /* CONFIG_SCHED_INFO */ 167#endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/* 2/*
5 * stop-task scheduling class. 3 * stop-task scheduling class.
6 * 4 *
@@ -9,6 +7,7 @@
9 * 7 *
10 * See kernel/stop_machine.c 8 * See kernel/stop_machine.c
11 */ 9 */
10#include "sched.h"
12 11
13#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
14static int 13static int
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
75 cgroup_account_cputime(curr, delta_exec); 74 cgroup_account_cputime(curr, delta_exec);
76} 75}
77 76
77/*
78 * scheduler tick hitting a task of our scheduling class.
79 *
80 * NOTE: This function can be called remotely by the tick offload that
81 * goes along full dynticks. Therefore no local assumption can be made
82 * and everything must be accessed through the @rq and @curr passed in
83 * parameters.
84 */
78static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 85static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
79{ 86{
80} 87}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,6 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/sched/signal.h> 2/*
3#include <linux/swait.h> 3 * <linux/swait.h> (simple wait queues) implementation:
4 */
5#include "sched.h"
4 6
5void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 7void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
6 struct lock_class_key *key) 8 struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..64cc564f5255 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
2/* 2/*
3 * Scheduler topology setup/handling methods 3 * Scheduler topology setup/handling methods
4 */ 4 */
5#include <linux/sched.h>
6#include <linux/mutex.h>
7#include <linux/sched/isolation.h>
8
9#include "sched.h" 5#include "sched.h"
10 6
11DEFINE_MUTEX(sched_domains_mutex); 7DEFINE_MUTEX(sched_domains_mutex);
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
41 if (!(sd->flags & SD_LOAD_BALANCE)) { 37 if (!(sd->flags & SD_LOAD_BALANCE)) {
42 printk("does not load-balance\n"); 38 printk("does not load-balance\n");
43 if (sd->parent) 39 if (sd->parent)
44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
45 " has parent");
46 return -1; 41 return -1;
47 } 42 }
48 43
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
50 cpumask_pr_args(sched_domain_span(sd)), sd->name); 45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
51 46
52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
53 printk(KERN_ERR "ERROR: domain->span does not contain " 48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
54 "CPU%d\n", cpu);
55 } 49 }
56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) { 50 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
57 printk(KERN_ERR "ERROR: domain->groups does not contain" 51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58 " CPU%d\n", cpu);
59 } 52 }
60 53
61 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
115 108
116 if (sd->parent && 109 if (sd->parent &&
117 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
118 printk(KERN_ERR "ERROR: parent span is not a superset " 111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
119 "of domain->span\n");
120 return 0; 112 return 0;
121} 113}
122 114
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
595 * are not. 587 * are not.
596 * 588 *
597 * This leads to a few particularly weird cases where the sched_domain's are 589 * This leads to a few particularly weird cases where the sched_domain's are
598 * not of the same number for each cpu. Consider: 590 * not of the same number for each CPU. Consider:
599 * 591 *
600 * NUMA-2 0-3 0-3 592 * NUMA-2 0-3 0-3
601 * groups: {0-2},{1-3} {1-3},{0-2} 593 * groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +772,7 @@ fail:
780 * ^ ^ ^ ^ 772 * ^ ^ ^ ^
781 * `-' `-' 773 * `-' `-'
782 * 774 *
783 * The sched_domains are per-cpu and have a two way link (parent & child) and 775 * The sched_domains are per-CPU and have a two way link (parent & child) and
784 * denote the ever growing mask of CPUs belonging to that level of topology. 776 * denote the ever growing mask of CPUs belonging to that level of topology.
785 * 777 *
786 * Each sched_domain has a circular (double) linked list of sched_group's, each 778 * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1021 d->rd = alloc_rootdomain(); 1013 d->rd = alloc_rootdomain();
1022 if (!d->rd) 1014 if (!d->rd)
1023 return sa_sd; 1015 return sa_sd;
1016
1024 return sa_rootdomain; 1017 return sa_rootdomain;
1025} 1018}
1026 1019
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
1047} 1040}
1048 1041
1049#ifdef CONFIG_NUMA 1042#ifdef CONFIG_NUMA
1050static int sched_domains_numa_levels;
1051enum numa_topology_type sched_numa_topology_type; 1043enum numa_topology_type sched_numa_topology_type;
1052static int *sched_domains_numa_distance; 1044
1053int sched_max_numa_distance; 1045static int sched_domains_numa_levels;
1054static struct cpumask ***sched_domains_numa_masks; 1046static int sched_domains_curr_level;
1055static int sched_domains_curr_level; 1047
1048int sched_max_numa_distance;
1049static int *sched_domains_numa_distance;
1050static struct cpumask ***sched_domains_numa_masks;
1056#endif 1051#endif
1057 1052
1058/* 1053/*
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
1074 * SD_ASYM_PACKING - describes SMT quirks 1069 * SD_ASYM_PACKING - describes SMT quirks
1075 */ 1070 */
1076#define TOPOLOGY_SD_FLAGS \ 1071#define TOPOLOGY_SD_FLAGS \
1077 (SD_SHARE_CPUCAPACITY | \ 1072 (SD_SHARE_CPUCAPACITY | \
1078 SD_SHARE_PKG_RESOURCES | \ 1073 SD_SHARE_PKG_RESOURCES | \
1079 SD_NUMA | \ 1074 SD_NUMA | \
1080 SD_ASYM_PACKING | \ 1075 SD_ASYM_PACKING | \
1081 SD_ASYM_CPUCAPACITY | \ 1076 SD_ASYM_CPUCAPACITY | \
1082 SD_SHARE_POWERDOMAIN) 1077 SD_SHARE_POWERDOMAIN)
1083 1078
1084static struct sched_domain * 1079static struct sched_domain *
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
1628 pr_err(" the %s domain not a subset of the %s domain\n", 1623 pr_err(" the %s domain not a subset of the %s domain\n",
1629 child->name, sd->name); 1624 child->name, sd->name);
1630#endif 1625#endif
1631 /* Fixup, ensure @sd has at least @child cpus. */ 1626 /* Fixup, ensure @sd has at least @child CPUs. */
1632 cpumask_or(sched_domain_span(sd), 1627 cpumask_or(sched_domain_span(sd),
1633 sched_domain_span(sd), 1628 sched_domain_span(sd),
1634 sched_domain_span(child)); 1629 sched_domain_span(child));
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1720 ret = 0; 1715 ret = 0;
1721error: 1716error:
1722 __free_domain_allocs(&d, alloc_state, cpu_map); 1717 __free_domain_allocs(&d, alloc_state, cpu_map);
1718
1723 return ret; 1719 return ret;
1724} 1720}
1725 1721
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1824 return 1; 1820 return 1;
1825 1821
1826 tmp = SD_ATTR_INIT; 1822 tmp = SD_ATTR_INIT;
1823
1827 return !memcmp(cur ? (cur + idx_cur) : &tmp, 1824 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1828 new ? (new + idx_new) : &tmp, 1825 new ? (new + idx_new) : &tmp,
1829 sizeof(struct sched_domain_attr)); 1826 sizeof(struct sched_domain_attr));
@@ -1929,4 +1926,3 @@ match2:
1929 1926
1930 mutex_unlock(&sched_domains_mutex); 1927 mutex_unlock(&sched_domains_mutex);
1931} 1928}
1932
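
For context on the TOPOLOGY_SD_FLAGS mask kept above: each sched_domain_topology_level supplies an sd_flags() callback, and that callback is expected to return only flags from this mask. A minimal sketch of such a callback, assuming the SD_* flag values shown in the hunk; example_smt_flags() is an illustrative stand-in for the kernel's own cpu_smt_flags(), not part of this diff:

#include <linux/sched/topology.h>

/* Illustrative only: an SMT-style level whose siblings share capacity and caches. */
static inline int example_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
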
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include "sched.h"
7#include <linux/export.h>
8#include <linux/sched/signal.h>
9#include <linux/sched/debug.h>
10#include <linux/mm.h>
11#include <linux/wait.h>
12#include <linux/hash.h>
13#include <linux/kthread.h>
14 7
15void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) 8void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
16{ 9{
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
107 break; 100 break;
108 } 101 }
109 } 102 }
103
110 return nr_exclusive; 104 return nr_exclusive;
111} 105}
112 106
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
317 spin_unlock(&wq->lock); 311 spin_unlock(&wq->lock);
318 schedule(); 312 schedule();
319 spin_lock(&wq->lock); 313 spin_lock(&wq->lock);
314
320 return 0; 315 return 0;
321} 316}
322EXPORT_SYMBOL(do_wait_intr); 317EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
333 spin_unlock_irq(&wq->lock); 328 spin_unlock_irq(&wq->lock);
334 schedule(); 329 schedule();
335 spin_lock_irq(&wq->lock); 330 spin_lock_irq(&wq->lock);
331
336 return 0; 332 return 0;
337} 333}
338EXPORT_SYMBOL(do_wait_intr_irq); 334EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
378 374
379 if (ret) 375 if (ret)
380 list_del_init(&wq_entry->entry); 376 list_del_init(&wq_entry->entry);
377
381 return ret; 378 return ret;
382} 379}
383EXPORT_SYMBOL(autoremove_wake_function); 380EXPORT_SYMBOL(autoremove_wake_function);
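
The wait.c hunks above touch __wake_up_common() and autoremove_wake_function(); as a reminder of how that callback gets exercised, here is a minimal sketch of the classic open-coded wait loop, with example_wq, example_done and the example_* helpers as illustrative names (DEFINE_WAIT() installs autoremove_wake_function() as the entry's wake function):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_done;

static void example_sleep_until_done(void)
{
	DEFINE_WAIT(wait);		/* wake callback: autoremove_wake_function() */

	for (;;) {
		prepare_to_wait(&example_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (example_done)
			break;
		schedule();
	}
	finish_wait(&example_wq, &wait);
}

static void example_mark_done(void)
{
	example_done = true;
	wake_up(&example_wq);		/* list walk done by __wake_up_common() */
}
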
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..c67c6d24adc2 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
1/* 1/*
2 * The implementation of the wait_bit*() and related waiting APIs: 2 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 3 */
4#include <linux/wait_bit.h> 4#include "sched.h"
5#include <linux/sched/signal.h>
6#include <linux/sched/debug.h>
7#include <linux/hash.h>
8 5
9#define WAIT_TABLE_BITS 8 6#define WAIT_TABLE_BITS 8
10#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) 7#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
29 wait_bit->key.bit_nr != key->bit_nr || 26 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags)) 27 test_bit(key->bit_nr, key->flags))
31 return 0; 28 return 0;
32 else 29
33 return autoremove_wake_function(wq_entry, mode, sync, key); 30 return autoremove_wake_function(wq_entry, mode, sync, key);
34} 31}
35EXPORT_SYMBOL(wake_bit_function); 32EXPORT_SYMBOL(wake_bit_function);
36 33
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) 47 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51 ret = (*action)(&wbq_entry->key, mode); 48 ret = (*action)(&wbq_entry->key, mode);
52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); 49 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
50
53 finish_wait(wq_head, &wbq_entry->wq_entry); 51 finish_wait(wq_head, &wbq_entry->wq_entry);
52
54 return ret; 53 return ret;
55} 54}
56EXPORT_SYMBOL(__wait_on_bit); 55EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
73 DEFINE_WAIT_BIT(wq_entry, word, bit); 72 DEFINE_WAIT_BIT(wq_entry, word, bit);
74 73
75 wq_entry.key.timeout = jiffies + timeout; 74 wq_entry.key.timeout = jiffies + timeout;
75
76 return __wait_on_bit(wq_head, &wq_entry, action, mode); 76 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77} 77}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); 78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{ 121{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
123
123 if (waitqueue_active(wq_head)) 124 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key); 125 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125} 126}
@@ -148,108 +149,55 @@ void wake_up_bit(void *word, int bit)
148} 149}
149EXPORT_SYMBOL(wake_up_bit); 150EXPORT_SYMBOL(wake_up_bit);
150 151
151/* 152wait_queue_head_t *__var_waitqueue(void *p)
152 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
153 * index (we're keying off bit -1, but that would produce a horrible hash
154 * value).
155 */
156static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{ 153{
158 if (BITS_PER_LONG == 64) { 154 return bit_wait_table + hash_ptr(p, WAIT_TABLE_BITS);
159 unsigned long q = (unsigned long)p;
160 return bit_waitqueue((void *)(q & ~1), q & 1);
161 }
162 return bit_waitqueue(p, 0);
163} 155}
156EXPORT_SYMBOL(__var_waitqueue);
164 157
165static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, 158static int
166 void *arg) 159var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
160 int sync, void *arg)
167{ 161{
168 struct wait_bit_key *key = arg; 162 struct wait_bit_key *key = arg;
169 struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); 163 struct wait_bit_queue_entry *wbq_entry =
170 atomic_t *val = key->flags; 164 container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
171 165
172 if (wait_bit->key.flags != key->flags || 166 if (wbq_entry->key.flags != key->flags ||
173 wait_bit->key.bit_nr != key->bit_nr || 167 wbq_entry->key.bit_nr != key->bit_nr)
174 atomic_read(val) != 0)
175 return 0; 168 return 0;
176 return autoremove_wake_function(wq_entry, mode, sync, key);
177}
178 169
179/* 170 return autoremove_wake_function(wq_entry, mode, sync, key);
180 * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
181 * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero
182 * return codes halt waiting and return.
183 */
184static __sched
185int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
186 wait_atomic_t_action_f action, unsigned int mode)
187{
188 atomic_t *val;
189 int ret = 0;
190
191 do {
192 prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
193 val = wbq_entry->key.flags;
194 if (atomic_read(val) == 0)
195 break;
196 ret = (*action)(val, mode);
197 } while (!ret && atomic_read(val) != 0);
198 finish_wait(wq_head, &wbq_entry->wq_entry);
199 return ret;
200} 171}
201 172
202#define DEFINE_WAIT_ATOMIC_T(name, p) \ 173void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
203 struct wait_bit_queue_entry name = { \
204 .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \
205 .wq_entry = { \
206 .private = current, \
207 .func = wake_atomic_t_function, \
208 .entry = \
209 LIST_HEAD_INIT((name).wq_entry.entry), \
210 }, \
211 }
212
213__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
214 wait_atomic_t_action_f action,
215 unsigned int mode)
216{ 174{
217 struct wait_queue_head *wq_head = atomic_t_waitqueue(p); 175 *wbq_entry = (struct wait_bit_queue_entry){
218 DEFINE_WAIT_ATOMIC_T(wq_entry, p); 176 .key = {
219 177 .flags = (var),
220 return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); 178 .bit_nr = -1,
179 },
180 .wq_entry = {
181 .private = current,
182 .func = var_wake_function,
183 .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
184 },
185 };
221} 186}
222EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); 187EXPORT_SYMBOL(init_wait_var_entry);
223 188
224__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) 189void wake_up_var(void *var)
225{ 190{
226 schedule(); 191 __wake_up_bit(__var_waitqueue(var), var, -1);
227 if (signal_pending_state(mode, current))
228 return -EINTR;
229 return 0;
230} 192}
231EXPORT_SYMBOL(atomic_t_wait); 193EXPORT_SYMBOL(wake_up_var);
232
233/**
234 * wake_up_atomic_t - Wake up a waiter on a atomic_t
235 * @p: The atomic_t being waited on, a kernel virtual address
236 *
237 * Wake up anyone waiting for the atomic_t to go to zero.
238 *
239 * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
240 * check is done by the waiter's wake function, not the by the waker itself).
241 */
242void wake_up_atomic_t(atomic_t *p)
243{
244 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
245}
246EXPORT_SYMBOL(wake_up_atomic_t);
247 194
248__sched int bit_wait(struct wait_bit_key *word, int mode) 195__sched int bit_wait(struct wait_bit_key *word, int mode)
249{ 196{
250 schedule(); 197 schedule();
251 if (signal_pending_state(mode, current)) 198 if (signal_pending_state(mode, current))
252 return -EINTR; 199 return -EINTR;
200
253 return 0; 201 return 0;
254} 202}
255EXPORT_SYMBOL(bit_wait); 203EXPORT_SYMBOL(bit_wait);
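
The hunk above replaces the removed wait_on_atomic_t()/wake_up_atomic_t() machinery with generic "wait on a variable" helpers keyed by the variable's address. A hedged conversion sketch, assuming the wait_var_event() macro introduced alongside wake_up_var() in <linux/wait_bit.h>; example_put_ref(), example_wait_for_refs() and refs are illustrative names, not part of this diff:

#include <linux/atomic.h>
#include <linux/wait_bit.h>

/* Waker side: drop a reference and kick the hashed waitqueue for 'refs'. */
static void example_put_ref(atomic_t *refs)
{
	if (atomic_dec_and_test(refs))
		wake_up_var(refs);
}

/* Waiter side: sleep until the reference count reaches zero. */
static void example_wait_for_refs(atomic_t *refs)
{
	wait_var_event(refs, atomic_read(refs) == 0);
}
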
@@ -259,6 +207,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
259 io_schedule(); 207 io_schedule();
260 if (signal_pending_state(mode, current)) 208 if (signal_pending_state(mode, current))
261 return -EINTR; 209 return -EINTR;
210
262 return 0; 211 return 0;
263} 212}
264EXPORT_SYMBOL(bit_wait_io); 213EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +215,13 @@ EXPORT_SYMBOL(bit_wait_io);
266__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) 215__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
267{ 216{
268 unsigned long now = READ_ONCE(jiffies); 217 unsigned long now = READ_ONCE(jiffies);
218
269 if (time_after_eq(now, word->timeout)) 219 if (time_after_eq(now, word->timeout))
270 return -EAGAIN; 220 return -EAGAIN;
271 schedule_timeout(word->timeout - now); 221 schedule_timeout(word->timeout - now);
272 if (signal_pending_state(mode, current)) 222 if (signal_pending_state(mode, current))
273 return -EINTR; 223 return -EINTR;
224
274 return 0; 225 return 0;
275} 226}
276EXPORT_SYMBOL_GPL(bit_wait_timeout); 227EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +229,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
278__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 229__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
279{ 230{
280 unsigned long now = READ_ONCE(jiffies); 231 unsigned long now = READ_ONCE(jiffies);
232
281 if (time_after_eq(now, word->timeout)) 233 if (time_after_eq(now, word->timeout))
282 return -EAGAIN; 234 return -EAGAIN;
283 io_schedule_timeout(word->timeout - now); 235 io_schedule_timeout(word->timeout - now);
284 if (signal_pending_state(mode, current)) 236 if (signal_pending_state(mode, current))
285 return -EINTR; 237 return -EINTR;
238
286 return 0; 239 return 0;
287} 240}
288EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 241EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
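
bit_wait() and friends shown above are the stock "action" callbacks used by the wait_on_bit() family; a small hypothetical flag-bit user (EXAMPLE_BIT_BUSY, example_flags and the example_* helpers are illustrative names) pairs them like this:

#include <linux/wait_bit.h>
#include <linux/sched.h>
#include <linux/bitops.h>

#define EXAMPLE_BIT_BUSY	0

static unsigned long example_flags;

/* Sleeps via bit_wait() until EXAMPLE_BIT_BUSY clears; returns 0 or -EINTR. */
static int example_wait_not_busy(void)
{
	return wait_on_bit(&example_flags, EXAMPLE_BIT_BUSY, TASK_INTERRUPTIBLE);
}

/* Clear the bit, then wake waiters hashed onto its bit waitqueue. */
static void example_clear_busy(void)
{
	clear_bit(EXAMPLE_BIT_BUSY, &example_flags);
	smp_mb__after_atomic();	/* order clear_bit() vs. the waitqueue_active() check */
	wake_up_bit(&example_flags, EXAMPLE_BIT_BUSY);
}
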