author     Linus Torvalds <torvalds@linux-foundation.org>  2018-04-02 14:49:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-04-02 14:49:41 -0400
commit     46e0d28bdb8e6d00e27a0fe9e1d15df6098f0ffb (patch)
tree       d5cb66fbd85b3d5c3220aacd2d9a60f9a515903a
parent     86bbbebac1933e6e95e8234c4f7d220c5ddd38bc (diff)
parent     b720342849fe685310fca01748a32730a6eca5aa (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - NUMA balancing improvements (Mel Gorman)

   - Further load tracking improvements (Patrick Bellasi)

   - Various NOHZ balancing cleanups and optimizations (Peter Zijlstra)

   - Improve blocked load handling, in particular we can now reduce and
     eventually stop periodic load updates on 'very idle' CPUs. (Vincent
     Guittot)

   - On isolated CPUs offload the final 1Hz scheduler tick as well, plus
     related cleanups and reorganization. (Frederic Weisbecker)

   - Core scheduler code cleanups (Ingo Molnar)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched/core: Update preempt_notifier_key to modern API
  sched/cpufreq: Rate limits for SCHED_DEADLINE
  sched/fair: Update util_est only on util_avg updates
  sched/cpufreq/schedutil: Use util_est for OPP selection
  sched/fair: Use util_est in LB and WU paths
  sched/fair: Add util_est on top of PELT
  sched/core: Remove TASK_ALL
  sched/completions: Use bool in try_wait_for_completion()
  sched/fair: Update blocked load when newly idle
  sched/fair: Move idle_balance()
  sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks
  sched/fair: Move rebalance_domains()
  sched/nohz: Optimize nohz_idle_balance()
  sched/fair: Reduce the periodic update duration
  sched/nohz: Stop NOHZ stats when decayed
  sched/cpufreq: Provide migration hint
  sched/nohz: Clean up nohz enter/exit
  sched/fair: Update blocked load from NEWIDLE
  sched/fair: Add NOHZ stats balancing
  sched/fair: Restructure nohz_balance_kick()
  ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt11
-rw-r--r--include/linux/sched.h30
-rw-r--r--include/linux/sched/cpufreq.h5
-rw-r--r--include/linux/sched/deadline.h6
-rw-r--r--include/linux/sched/isolation.h1
-rw-r--r--include/linux/sched/nohz.h6
-rw-r--r--include/linux/tick.h4
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/autogroup.c21
-rw-r--r--kernel/sched/autogroup.h12
-rw-r--r--kernel/sched/clock.c36
-rw-r--r--kernel/sched/completion.c11
-rw-r--r--kernel/sched/core.c186
-rw-r--r--kernel/sched/cpuacct.c33
-rw-r--r--kernel/sched/cpudeadline.c23
-rw-r--r--kernel/sched/cpudeadline.h29
-rw-r--r--kernel/sched/cpufreq.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c221
-rw-r--r--kernel/sched/cpupri.c15
-rw-r--r--kernel/sched/cpupri.h25
-rw-r--r--kernel/sched/cputime.c58
-rw-r--r--kernel/sched/deadline.c82
-rw-r--r--kernel/sched/debug.c103
-rw-r--r--kernel/sched/fair.c1415
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/idle.c142
-rw-r--r--kernel/sched/idle_task.c110
-rw-r--r--kernel/sched/isolation.c14
-rw-r--r--kernel/sched/loadavg.c34
-rw-r--r--kernel/sched/membarrier.c27
-rw-r--r--kernel/sched/rt.c60
-rw-r--r--kernel/sched/sched.h650
-rw-r--r--kernel/sched/stats.c20
-rw-r--r--kernel/sched/stats.h86
-rw-r--r--kernel/sched/stop_task.c11
-rw-r--r--kernel/sched/swait.c6
-rw-r--r--kernel/sched/topology.c46
-rw-r--r--kernel/sched/wait.c13
-rw-r--r--kernel/sched/wait_bit.c23
-rw-r--r--kernel/time/tick-sched.c22
-rw-r--r--kernel/workqueue.c3
41 files changed, 2082 insertions, 1529 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1d1d53f85ddd..50b9837e985b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1766,6 +1766,17 @@
1766 1766
1767 nohz 1767 nohz
1768 Disable the tick when a single task runs. 1768 Disable the tick when a single task runs.
1769
1770 A residual 1Hz tick is offloaded to workqueues, which you
1771 need to affine to housekeeping through the global
1772 workqueue's affinity configured via the
1773 /sys/devices/virtual/workqueue/cpumask sysfs file, or
1774 by using the 'domain' flag described below.
1775
1776 NOTE: by default the global workqueue runs on all CPUs,
1777 so to protect individual CPUs the 'cpumask' file has to
1778 be configured manually after bootup.
1779
1769 domain 1780 domain
1770 Isolate from the general SMP balancing and scheduling 1781 Isolate from the general SMP balancing and scheduling
1771 algorithms. Note that performing domain isolation this way 1782 algorithms. Note that performing domain isolation this way
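[Editor's note] The manual cpumask configuration described in the hunk above can be scripted from userspace; below is a minimal illustrative C snippet. The sysfs path is the one documented above; the mask value 0x3 (CPUs 0-1) is only an example for a small machine and must be adjusted per system.

    /* Illustrative only: confine the global workqueue to housekeeping CPUs 0-1. */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/devices/virtual/workqueue/cpumask", "w");

            if (!f) {
                    perror("open workqueue cpumask");
                    return 1;
            }
            /* Hex CPU mask; 0x3 = CPUs 0 and 1 (example value, adjust per system). */
            if (fprintf(f, "3\n") < 0) {
                    perror("write workqueue cpumask");
                    fclose(f);
                    return 1;
            }
            return fclose(f) == 0 ? 0 : 1;
    }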
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b161ef8a902e..f228c6033832 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -93,7 +93,6 @@ struct task_group;
93 93
94/* Convenience macros for the sake of wake_up(): */ 94/* Convenience macros for the sake of wake_up(): */
95#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 95#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
96#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
97 96
98/* get_task_state(): */ 97/* get_task_state(): */
99#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ 98#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
@@ -275,6 +274,34 @@ struct load_weight {
275 u32 inv_weight; 274 u32 inv_weight;
276}; 275};
277 276
277/**
278 * struct util_est - Estimation utilization of FAIR tasks
279 * @enqueued: instantaneous estimated utilization of a task/cpu
280 * @ewma: the Exponential Weighted Moving Average (EWMA)
281 * utilization of a task
282 *
283 * Support data structure to track an Exponential Weighted Moving Average
284 * (EWMA) of a FAIR task's utilization. New samples are added to the moving
285 * average each time a task completes an activation. Sample's weight is chosen
286 * so that the EWMA will be relatively insensitive to transient changes to the
287 * task's workload.
288 *
289 * The enqueued attribute has a slightly different meaning for tasks and cpus:
290 * - task: the task's util_avg at last task dequeue time
291 * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
292 * Thus, the util_est.enqueued of a task represents the contribution on the
293 * estimated utilization of the CPU where that task is currently enqueued.
294 *
295 * Only for tasks we track a moving average of the past instantaneous
296 * estimated utilization. This allows to absorb sporadic drops in utilization
297 * of an otherwise almost periodic task.
298 */
299struct util_est {
300 unsigned int enqueued;
301 unsigned int ewma;
302#define UTIL_EST_WEIGHT_SHIFT 2
303};
304
278/* 305/*
279 * The load_avg/util_avg accumulates an infinite geometric series 306 * The load_avg/util_avg accumulates an infinite geometric series
280 * (see __update_load_avg() in kernel/sched/fair.c). 307 * (see __update_load_avg() in kernel/sched/fair.c).
@@ -336,6 +363,7 @@ struct sched_avg {
336 unsigned long load_avg; 363 unsigned long load_avg;
337 unsigned long runnable_load_avg; 364 unsigned long runnable_load_avg;
338 unsigned long util_avg; 365 unsigned long util_avg;
366 struct util_est util_est;
339}; 367};
340 368
341struct sched_statistics { 369struct sched_statistics {
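[Editor's note] As a clarifying aside on the util_est kerneldoc above: with UTIL_EST_WEIGHT_SHIFT == 2 each new activation sample carries a 1/4 weight in the moving average, which is what makes the EWMA insensitive to transient dips. A minimal standalone sketch of such a step follows; util_est_ewma_step is a hypothetical name and the real helper in kernel/sched/fair.c differs in detail (clamping, enqueued handling, etc.).

    #define UTIL_EST_WEIGHT_SHIFT   2

    /* Illustrative EWMA step: ewma = ewma + (sample - ewma) / 4 */
    static unsigned int util_est_ewma_step(unsigned int ewma, unsigned int sample)
    {
            long acc = ewma;

            acc <<= UTIL_EST_WEIGHT_SHIFT;          /* ewma * 4          */
            acc  += (long)sample - (long)ewma;      /* + (sample - ewma) */
            acc >>= UTIL_EST_WEIGHT_SHIFT;          /* / 4               */

            return (unsigned int)acc;
    }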
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 0b55834efd46..59667444669f 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -8,9 +8,8 @@
8 * Interface between cpufreq drivers and the scheduler: 8 * Interface between cpufreq drivers and the scheduler:
9 */ 9 */
10 10
11#define SCHED_CPUFREQ_RT (1U << 0) 11#define SCHED_CPUFREQ_IOWAIT (1U << 0)
12#define SCHED_CPUFREQ_DL (1U << 1) 12#define SCHED_CPUFREQ_MIGRATION (1U << 1)
13#define SCHED_CPUFREQ_IOWAIT (1U << 2)
14 13
15#ifdef CONFIG_CPU_FREQ 14#ifdef CONFIG_CPU_FREQ
16struct update_util_data { 15struct update_util_data {
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index a5bc8728ead7..0cb034331cbb 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,8 +1,4 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_SCHED_DEADLINE_H
3#define _LINUX_SCHED_DEADLINE_H
4
5#include <linux/sched.h>
6 2
7/* 3/*
8 * SCHED_DEADLINE tasks has negative priorities, reflecting 4 * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
28{ 24{
29 return (s64)(a - b) < 0; 25 return (s64)(a - b) < 0;
30} 26}
31
32#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431c8060..4a6582c27dea 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags {
12 HK_FLAG_SCHED = (1 << 3), 12 HK_FLAG_SCHED = (1 << 3),
13 HK_FLAG_TICK = (1 << 4), 13 HK_FLAG_TICK = (1 << 4),
14 HK_FLAG_DOMAIN = (1 << 5), 14 HK_FLAG_DOMAIN = (1 << 5),
15 HK_FLAG_WQ = (1 << 6),
15}; 16};
16 17
17#ifdef CONFIG_CPU_ISOLATION 18#ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 3d3a97d9399d..b36f4cf38111 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }
16 16
17#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 17#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
18extern void nohz_balance_enter_idle(int cpu); 18extern void nohz_balance_enter_idle(int cpu);
19extern void set_cpu_sd_state_idle(void);
20extern int get_nohz_timer_target(void); 19extern int get_nohz_timer_target(void);
21#else 20#else
22static inline void nohz_balance_enter_idle(int cpu) { } 21static inline void nohz_balance_enter_idle(int cpu) { }
23static inline void set_cpu_sd_state_idle(void) { }
24#endif 22#endif
25 23
26#ifdef CONFIG_NO_HZ_COMMON 24#ifdef CONFIG_NO_HZ_COMMON
@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu);
37static inline void wake_up_nohz_cpu(int cpu) { } 35static inline void wake_up_nohz_cpu(int cpu) { }
38#endif 36#endif
39 37
40#ifdef CONFIG_NO_HZ_FULL
41extern u64 scheduler_tick_max_deferment(void);
42#endif
43
44#endif /* _LINUX_SCHED_NOHZ_H */ 38#endif /* _LINUX_SCHED_NOHZ_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7cc35921218e..7f8c9a127f5a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -113,7 +113,8 @@ enum tick_dep_bits {
113 113
114#ifdef CONFIG_NO_HZ_COMMON 114#ifdef CONFIG_NO_HZ_COMMON
115extern bool tick_nohz_enabled; 115extern bool tick_nohz_enabled;
116extern int tick_nohz_tick_stopped(void); 116extern bool tick_nohz_tick_stopped(void);
117extern bool tick_nohz_tick_stopped_cpu(int cpu);
117extern void tick_nohz_idle_enter(void); 118extern void tick_nohz_idle_enter(void);
118extern void tick_nohz_idle_exit(void); 119extern void tick_nohz_idle_exit(void);
119extern void tick_nohz_irq_exit(void); 120extern void tick_nohz_irq_exit(void);
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
125#else /* !CONFIG_NO_HZ_COMMON */ 126#else /* !CONFIG_NO_HZ_COMMON */
126#define tick_nohz_enabled (0) 127#define tick_nohz_enabled (0)
127static inline int tick_nohz_tick_stopped(void) { return 0; } 128static inline int tick_nohz_tick_stopped(void) { return 0; }
129static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
128static inline void tick_nohz_idle_enter(void) { } 130static inline void tick_nohz_idle_enter(void) { }
129static inline void tick_nohz_idle_exit(void) { } 131static inline void tick_nohz_idle_exit(void) { }
130 132
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
17endif 17endif
18 18
19obj-y += core.o loadavg.o clock.o cputime.o 19obj-y += core.o loadavg.o clock.o cputime.o
20obj-y += idle_task.o fair.o rt.o deadline.o 20obj-y += idle.o fair.o rt.o deadline.o
21obj-y += wait.o wait_bit.o swait.o completion.o idle.o 21obj-y += wait.o wait_bit.o swait.o completion.o
22
22obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o 23obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
23obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 24obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
24obj-$(CONFIG_SCHEDSTATS) += stats.o 25obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/proc_fs.h> 2/*
3#include <linux/seq_file.h> 3 * Auto-group scheduling implementation:
4#include <linux/utsname.h> 4 */
5#include <linux/security.h>
6#include <linux/export.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 7unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
168 autogroup_kref_put(prev); 165 autogroup_kref_put(prev);
169} 166}
170 167
171/* Allocates GFP_KERNEL, cannot be called under any spinlock */ 168/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
172void sched_autogroup_create_attach(struct task_struct *p) 169void sched_autogroup_create_attach(struct task_struct *p)
173{ 170{
174 struct autogroup *ag = autogroup_create(); 171 struct autogroup *ag = autogroup_create();
175 172
176 autogroup_move_group(p, ag); 173 autogroup_move_group(p, ag);
177 /* drop extra reference added by autogroup_create() */ 174
175 /* Drop extra reference added by autogroup_create(): */
178 autogroup_kref_put(ag); 176 autogroup_kref_put(ag);
179} 177}
180EXPORT_SYMBOL(sched_autogroup_create_attach); 178EXPORT_SYMBOL(sched_autogroup_create_attach);
181 179
182/* Cannot be called under siglock. Currently has no users */ 180/* Cannot be called under siglock. Currently has no users: */
183void sched_autogroup_detach(struct task_struct *p) 181void sched_autogroup_detach(struct task_struct *p)
184{ 182{
185 autogroup_move_group(p, &autogroup_default); 183 autogroup_move_group(p, &autogroup_default);
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
202 200
203 return 1; 201 return 1;
204} 202}
205
206__setup("noautogroup", setup_autogroup); 203__setup("noautogroup", setup_autogroup);
207 204
208#ifdef CONFIG_PROC_FS 205#ifdef CONFIG_PROC_FS
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
224 if (nice < 0 && !can_nice(current, nice)) 221 if (nice < 0 && !can_nice(current, nice))
225 return -EPERM; 222 return -EPERM;
226 223
227 /* this is a heavy operation taking global locks.. */ 224 /* This is a heavy operation, taking global locks.. */
228 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) 225 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
229 return -EAGAIN; 226 return -EAGAIN;
230 227
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 264
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 265 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 266}
270#endif /* CONFIG_SCHED_DEBUG */ 267#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifdef CONFIG_SCHED_AUTOGROUP 2#ifdef CONFIG_SCHED_AUTOGROUP
3 3
4#include <linux/kref.h>
5#include <linux/rwsem.h>
6#include <linux/sched/autogroup.h>
7
8struct autogroup { 4struct autogroup {
9 /* 5 /*
10 * reference doesn't mean how many thread attach to this 6 * Reference doesn't mean how many threads attach to this
11 * autogroup now. It just stands for the number of task 7 * autogroup now. It just stands for the number of tasks
12 * could use this autogroup. 8 * which could use this autogroup.
13 */ 9 */
14 struct kref kref; 10 struct kref kref;
15 struct task_group *tg; 11 struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
56 return tg; 52 return tg;
57} 53}
58 54
59#ifdef CONFIG_SCHED_DEBUG
60static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 55static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
61{ 56{
62 return 0; 57 return 0;
63} 58}
64#endif
65 59
66#endif /* CONFIG_SCHED_AUTOGROUP */ 60#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * sched_clock for unstable cpu clocks 2 * sched_clock() for unstable CPU clocks
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
5 * 5 *
@@ -11,7 +11,7 @@
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * 13 *
14 * What: 14 * What this file implements:
15 * 15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution 16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i) 17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current CPU.
30 * 30 *
31 * sched_clock_cpu(i) 31 * sched_clock_cpu(i)
32 * 32 *
33 * How: 33 * How it is implemented:
34 * 34 *
35 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the 36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
52 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
53 * 53 *
54 */ 54 */
55#include <linux/spinlock.h> 55#include "sched.h"
56#include <linux/hardirq.h>
57#include <linux/export.h>
58#include <linux/percpu.h>
59#include <linux/ktime.h>
60#include <linux/sched.h>
61#include <linux/nmi.h>
62#include <linux/sched/clock.h>
63#include <linux/static_key.h>
64#include <linux/workqueue.h>
65#include <linux/compiler.h>
66#include <linux/tick.h>
67#include <linux/init.h>
68 56
69/* 57/*
70 * Scheduler clock - returns current time in nanosec units. 58 * Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ again:
302 * cmpxchg64 below only protects one readout. 290 * cmpxchg64 below only protects one readout.
303 * 291 *
304 * We must reread via sched_clock_local() in the retry case on 292 * We must reread via sched_clock_local() in the retry case on
305 * 32bit as an NMI could use sched_clock_local() via the 293 * 32-bit kernels as an NMI could use sched_clock_local() via the
306 * tracer and hit between the readout of 294 * tracer and hit between the readout of
307 * the low32bit and the high 32bit portion. 295 * the low 32-bit and the high 32-bit portion.
308 */ 296 */
309 this_clock = sched_clock_local(my_scd); 297 this_clock = sched_clock_local(my_scd);
310 /* 298 /*
311 * We must enforce atomic readout on 32bit, otherwise the 299 * We must enforce atomic readout on 32-bit, otherwise the
312 * update on the remote cpu can hit inbetween the readout of 300 * update on the remote CPU can hit inbetween the readout of
313 * the low32bit and the high 32bit portion. 301 * the low 32-bit and the high 32-bit portion.
314 */ 302 */
315 remote_clock = cmpxchg64(&scd->clock, 0, 0); 303 remote_clock = cmpxchg64(&scd->clock, 0, 0);
316#else 304#else
317 /* 305 /*
318 * On 64bit the read of [my]scd->clock is atomic versus the 306 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
319 * update, so we can avoid the above 32bit dance. 307 * update, so we can avoid the above 32-bit dance.
320 */ 308 */
321 sched_clock_local(my_scd); 309 sched_clock_local(my_scd);
322again: 310again:
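[Editor's note] For context on the cpu_clock()/local_clock() semantics discussed in the header comment above, here is a small illustrative kernel-style snippet; local_clock() is the documented interface, while the function name and message are hypothetical.

    #include <linux/types.h>
    #include <linux/printk.h>
    #include <linux/sched/clock.h>

    /* Illustrative: time a section with the fast, NMI-safe local clock. */
    static void example_time_section(void)
    {
            u64 t0 = local_clock();

            /* ... work being measured ... */

            pr_info("section took %llu ns\n",
                    (unsigned long long)(local_clock() - t0));
    }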
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..e426b0cb9ac6 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
11 * typically be used for exclusion which gives rise to priority inversion. 11 * typically be used for exclusion which gives rise to priority inversion.
12 * Waiting for completion is a typically sync point, but not an exclusion point. 12 * Waiting for completion is a typically sync point, but not an exclusion point.
13 */ 13 */
14 14#include "sched.h"
15#include <linux/sched/signal.h>
16#include <linux/sched/debug.h>
17#include <linux/completion.h>
18 15
19/** 16/**
20 * complete: - signals a single thread waiting on this completion 17 * complete: - signals a single thread waiting on this completion
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
283bool try_wait_for_completion(struct completion *x) 280bool try_wait_for_completion(struct completion *x)
284{ 281{
285 unsigned long flags; 282 unsigned long flags;
286 int ret = 1; 283 bool ret = true;
287 284
288 /* 285 /*
289 * Since x->done will need to be locked only 286 * Since x->done will need to be locked only
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x)
292 * return early in the blocking case. 289 * return early in the blocking case.
293 */ 290 */
294 if (!READ_ONCE(x->done)) 291 if (!READ_ONCE(x->done))
295 return 0; 292 return false;
296 293
297 spin_lock_irqsave(&x->wait.lock, flags); 294 spin_lock_irqsave(&x->wait.lock, flags);
298 if (!x->done) 295 if (!x->done)
299 ret = 0; 296 ret = false;
300 else if (x->done != UINT_MAX) 297 else if (x->done != UINT_MAX)
301 x->done--; 298 x->done--;
302 spin_unlock_irqrestore(&x->wait.lock, flags); 299 spin_unlock_irqrestore(&x->wait.lock, flags);
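[Editor's note] A brief usage sketch for the bool-returning try_wait_for_completion() shown above; complete() and try_wait_for_completion() are the real API, the surrounding names are hypothetical.

    #include <linux/completion.h>
    #include <linux/errno.h>

    static DECLARE_COMPLETION(example_setup_done);

    /* Producer side: signal one waiter once setup has finished. */
    static void example_producer(void)
    {
            complete(&example_setup_done);
    }

    /* Consumer side: non-blocking check, consuming one completion if available. */
    static int example_consumer_poll(void)
    {
            if (!try_wait_for_completion(&example_setup_done))
                    return -EAGAIN;         /* not done yet, caller may retry */

            return 0;
    }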
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c94895bc5a2c..de440456f15c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 */ 7 */
8#include <linux/sched.h> 8#include "sched.h"
9#include <linux/sched/clock.h>
10#include <uapi/linux/sched/types.h>
11#include <linux/sched/loadavg.h>
12#include <linux/sched/hotplug.h>
13#include <linux/wait_bit.h>
14#include <linux/cpuset.h>
15#include <linux/delayacct.h>
16#include <linux/init_task.h>
17#include <linux/context_tracking.h>
18#include <linux/rcupdate_wait.h>
19#include <linux/compat.h>
20
21#include <linux/blkdev.h>
22#include <linux/kprobes.h>
23#include <linux/mmu_context.h>
24#include <linux/module.h>
25#include <linux/nmi.h>
26#include <linux/prefetch.h>
27#include <linux/profile.h>
28#include <linux/security.h>
29#include <linux/syscalls.h>
30#include <linux/sched/isolation.h>
31 9
32#include <asm/switch_to.h> 10#include <asm/switch_to.h>
33#include <asm/tlb.h> 11#include <asm/tlb.h>
34#ifdef CONFIG_PARAVIRT
35#include <asm/paravirt.h>
36#endif
37 12
38#include "sched.h"
39#include "../workqueue_internal.h" 13#include "../workqueue_internal.h"
40#include "../smpboot.h" 14#include "../smpboot.h"
41 15
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
135 * [L] ->on_rq 109 * [L] ->on_rq
136 * RELEASE (rq->lock) 110 * RELEASE (rq->lock)
137 * 111 *
138 * If we observe the old cpu in task_rq_lock, the acquire of 112 * If we observe the old CPU in task_rq_lock, the acquire of
139 * the old rq->lock will fully serialize against the stores. 113 * the old rq->lock will fully serialize against the stores.
140 * 114 *
141 * If we observe the new CPU in task_rq_lock, the acquire will 115 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
333} 307}
334#endif /* CONFIG_SMP */ 308#endif /* CONFIG_SMP */
335 309
336static void init_rq_hrtick(struct rq *rq) 310static void hrtick_rq_init(struct rq *rq)
337{ 311{
338#ifdef CONFIG_SMP 312#ifdef CONFIG_SMP
339 rq->hrtick_csd_pending = 0; 313 rq->hrtick_csd_pending = 0;
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
351{ 325{
352} 326}
353 327
354static inline void init_rq_hrtick(struct rq *rq) 328static inline void hrtick_rq_init(struct rq *rq)
355{ 329{
356} 330}
357#endif /* CONFIG_SCHED_HRTICK */ 331#endif /* CONFIG_SCHED_HRTICK */
@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void)
609{ 583{
610 int cpu = smp_processor_id(); 584 int cpu = smp_processor_id();
611 585
612 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 586 if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
613 return false; 587 return false;
614 588
615 if (idle_cpu(cpu) && !need_resched()) 589 if (idle_cpu(cpu) && !need_resched())
@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void)
619 * We can't run Idle Load Balance on this CPU for this time so we 593 * We can't run Idle Load Balance on this CPU for this time so we
620 * cancel it and clear NOHZ_BALANCE_KICK 594 * cancel it and clear NOHZ_BALANCE_KICK
621 */ 595 */
622 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 596 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
623 return false; 597 return false;
624} 598}
625 599
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
1457 * 1431 *
1458 * - cpu_active must be a subset of cpu_online 1432 * - cpu_active must be a subset of cpu_online
1459 * 1433 *
1460 * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, 1434 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
1461 * see __set_cpus_allowed_ptr(). At this point the newly online 1435 * see __set_cpus_allowed_ptr(). At this point the newly online
1462 * CPU isn't yet part of the sched domains, and balancing will not 1436 * CPU isn't yet part of the sched domains, and balancing will not
1463 * see it. 1437 * see it.
@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p)
2488 2462
2489#ifdef CONFIG_PREEMPT_NOTIFIERS 2463#ifdef CONFIG_PREEMPT_NOTIFIERS
2490 2464
2491static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; 2465static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
2492 2466
2493void preempt_notifier_inc(void) 2467void preempt_notifier_inc(void)
2494{ 2468{
2495 static_key_slow_inc(&preempt_notifier_key); 2469 static_branch_inc(&preempt_notifier_key);
2496} 2470}
2497EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2471EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2498 2472
2499void preempt_notifier_dec(void) 2473void preempt_notifier_dec(void)
2500{ 2474{
2501 static_key_slow_dec(&preempt_notifier_key); 2475 static_branch_dec(&preempt_notifier_key);
2502} 2476}
2503EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2477EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2504 2478
@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2508 */ 2482 */
2509void preempt_notifier_register(struct preempt_notifier *notifier) 2483void preempt_notifier_register(struct preempt_notifier *notifier)
2510{ 2484{
2511 if (!static_key_false(&preempt_notifier_key)) 2485 if (!static_branch_unlikely(&preempt_notifier_key))
2512 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2486 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2513 2487
2514 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2488 hlist_add_head(&notifier->link, &current->preempt_notifiers);
@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2537 2511
2538static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2512static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2539{ 2513{
2540 if (static_key_false(&preempt_notifier_key)) 2514 if (static_branch_unlikely(&preempt_notifier_key))
2541 __fire_sched_in_preempt_notifiers(curr); 2515 __fire_sched_in_preempt_notifiers(curr);
2542} 2516}
2543 2517
@@ -2555,7 +2529,7 @@ static __always_inline void
2555fire_sched_out_preempt_notifiers(struct task_struct *curr, 2529fire_sched_out_preempt_notifiers(struct task_struct *curr,
2556 struct task_struct *next) 2530 struct task_struct *next)
2557{ 2531{
2558 if (static_key_false(&preempt_notifier_key)) 2532 if (static_branch_unlikely(&preempt_notifier_key))
2559 __fire_sched_out_preempt_notifiers(curr, next); 2533 __fire_sched_out_preempt_notifiers(curr, next);
2560} 2534}
2561 2535
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
2629 raw_spin_unlock_irq(&rq->lock); 2603 raw_spin_unlock_irq(&rq->lock);
2630} 2604}
2631 2605
2606/*
2607 * NOP if the arch has not defined these:
2608 */
2609
2610#ifndef prepare_arch_switch
2611# define prepare_arch_switch(next) do { } while (0)
2612#endif
2613
2614#ifndef finish_arch_post_lock_switch
2615# define finish_arch_post_lock_switch() do { } while (0)
2616#endif
2617
2632/** 2618/**
2633 * prepare_task_switch - prepare to switch tasks 2619 * prepare_task_switch - prepare to switch tasks
2634 * @rq: the runqueue preparing to switch 2620 * @rq: the runqueue preparing to switch
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3037 3023
3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3024#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3039 /* 3025 /*
3040 * 64-bit doesn't need locks to atomically read a 64bit value. 3026 * 64-bit doesn't need locks to atomically read a 64-bit value.
3041 * So we have a optimization chance when the task's delta_exec is 0. 3027 * So we have a optimization chance when the task's delta_exec is 0.
3042 * Reading ->on_cpu is racy, but this is ok. 3028 * Reading ->on_cpu is racy, but this is ok.
3043 * 3029 *
@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
3096 rq->idle_balance = idle_cpu(cpu); 3082 rq->idle_balance = idle_cpu(cpu);
3097 trigger_load_balance(rq); 3083 trigger_load_balance(rq);
3098#endif 3084#endif
3099 rq_last_tick_reset(rq);
3100} 3085}
3101 3086
3102#ifdef CONFIG_NO_HZ_FULL 3087#ifdef CONFIG_NO_HZ_FULL
3103/** 3088
3104 * scheduler_tick_max_deferment 3089struct tick_work {
3105 * 3090 int cpu;
3106 * Keep at least one tick per second when a single 3091 struct delayed_work work;
3107 * active task is running because the scheduler doesn't 3092};
3108 * yet completely support full dynticks environment. 3093
3109 * 3094static struct tick_work __percpu *tick_work_cpu;
3110 * This makes sure that uptime, CFS vruntime, load 3095
3111 * balancing, etc... continue to move forward, even 3096static void sched_tick_remote(struct work_struct *work)
3112 * with a very low granularity.
3113 *
3114 * Return: Maximum deferment in nanoseconds.
3115 */
3116u64 scheduler_tick_max_deferment(void)
3117{ 3097{
3118 struct rq *rq = this_rq(); 3098 struct delayed_work *dwork = to_delayed_work(work);
3119 unsigned long next, now = READ_ONCE(jiffies); 3099 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3100 int cpu = twork->cpu;
3101 struct rq *rq = cpu_rq(cpu);
3102 struct rq_flags rf;
3120 3103
3121 next = rq->last_sched_tick + HZ; 3104 /*
3105 * Handle the tick only if it appears the remote CPU is running in full
3106 * dynticks mode. The check is racy by nature, but missing a tick or
3107 * having one too much is no big deal because the scheduler tick updates
3108 * statistics and checks timeslices in a time-independent way, regardless
3109 * of when exactly it is running.
3110 */
3111 if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
3112 struct task_struct *curr;
3113 u64 delta;
3122 3114
3123 if (time_before_eq(next, now)) 3115 rq_lock_irq(rq, &rf);
3124 return 0; 3116 update_rq_clock(rq);
3117 curr = rq->curr;
3118 delta = rq_clock_task(rq) - curr->se.exec_start;
3125 3119
3126 return jiffies_to_nsecs(next - now); 3120 /*
3121 * Make sure the next tick runs within a reasonable
3122 * amount of time.
3123 */
3124 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3125 curr->sched_class->task_tick(rq, curr, 0);
3126 rq_unlock_irq(rq, &rf);
3127 }
3128
3129 /*
3130 * Run the remote tick once per second (1Hz). This arbitrary
3131 * frequency is large enough to avoid overload but short enough
3132 * to keep scheduler internal stats reasonably up to date.
3133 */
3134 queue_delayed_work(system_unbound_wq, dwork, HZ);
3127} 3135}
3136
3137static void sched_tick_start(int cpu)
3138{
3139 struct tick_work *twork;
3140
3141 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3142 return;
3143
3144 WARN_ON_ONCE(!tick_work_cpu);
3145
3146 twork = per_cpu_ptr(tick_work_cpu, cpu);
3147 twork->cpu = cpu;
3148 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3149 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3150}
3151
3152#ifdef CONFIG_HOTPLUG_CPU
3153static void sched_tick_stop(int cpu)
3154{
3155 struct tick_work *twork;
3156
3157 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3158 return;
3159
3160 WARN_ON_ONCE(!tick_work_cpu);
3161
3162 twork = per_cpu_ptr(tick_work_cpu, cpu);
3163 cancel_delayed_work_sync(&twork->work);
3164}
3165#endif /* CONFIG_HOTPLUG_CPU */
3166
3167int __init sched_tick_offload_init(void)
3168{
3169 tick_work_cpu = alloc_percpu(struct tick_work);
3170 BUG_ON(!tick_work_cpu);
3171
3172 return 0;
3173}
3174
3175#else /* !CONFIG_NO_HZ_FULL */
3176static inline void sched_tick_start(int cpu) { }
3177static inline void sched_tick_stop(int cpu) { }
3128#endif 3178#endif
3129 3179
3130#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3180#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu)
5786{ 5836{
5787 set_cpu_rq_start_time(cpu); 5837 set_cpu_rq_start_time(cpu);
5788 sched_rq_cpu_starting(cpu); 5838 sched_rq_cpu_starting(cpu);
5839 sched_tick_start(cpu);
5789 return 0; 5840 return 0;
5790} 5841}
5791 5842
@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu)
5797 5848
5798 /* Handle pending wakeups and then migrate everything off */ 5849 /* Handle pending wakeups and then migrate everything off */
5799 sched_ttwu_pending(); 5850 sched_ttwu_pending();
5851 sched_tick_stop(cpu);
5800 5852
5801 rq_lock_irqsave(rq, &rf); 5853 rq_lock_irqsave(rq, &rf);
5802 if (rq->rd) { 5854 if (rq->rd) {
@@ -5809,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu)
5809 5861
5810 calc_load_migrate(rq); 5862 calc_load_migrate(rq);
5811 update_max_interval(); 5863 update_max_interval();
5812 nohz_balance_exit_idle(cpu); 5864 nohz_balance_exit_idle(rq);
5813 hrtick_clear(rq); 5865 hrtick_clear(rq);
5814 return 0; 5866 return 0;
5815} 5867}
@@ -6022,13 +6074,11 @@ void __init sched_init(void)
6022 rq_attach_root(rq, &def_root_domain); 6074 rq_attach_root(rq, &def_root_domain);
6023#ifdef CONFIG_NO_HZ_COMMON 6075#ifdef CONFIG_NO_HZ_COMMON
6024 rq->last_load_update_tick = jiffies; 6076 rq->last_load_update_tick = jiffies;
6025 rq->nohz_flags = 0; 6077 rq->last_blocked_load_update_tick = jiffies;
6026#endif 6078 atomic_set(&rq->nohz_flags, 0);
6027#ifdef CONFIG_NO_HZ_FULL
6028 rq->last_sched_tick = 0;
6029#endif 6079#endif
6030#endif /* CONFIG_SMP */ 6080#endif /* CONFIG_SMP */
6031 init_rq_hrtick(rq); 6081 hrtick_rq_init(rq);
6032 atomic_set(&rq->nr_iowait, 0); 6082 atomic_set(&rq->nr_iowait, 0);
6033 } 6083 }
6034 6084
@@ -7027,3 +7077,5 @@ const u32 sched_prio_to_wmult[40] = {
7027 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7077 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
7028 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7078 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
7029}; 7079};
7080
7081#undef CREATE_TRACE_POINTS
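[Editor's note] The preempt_notifier_key hunks above migrate from the old static_key calls to the modern static-branch API; a minimal hedged sketch of that pattern follows. The jump-label primitives are the real API, while example_feature_key and the helper functions are hypothetical.

    #include <linux/jump_label.h>
    #include <linux/printk.h>

    static DEFINE_STATIC_KEY_FALSE(example_feature_key);

    static void example_feature_enable(void)
    {
            static_branch_inc(&example_feature_key);        /* patches the branch in */
    }

    static void example_feature_disable(void)
    {
            static_branch_dec(&example_feature_key);
    }

    static void example_hot_path(void)
    {
            /* Compiles to a not-taken jump until the key is enabled: */
            if (static_branch_unlikely(&example_feature_key))
                    pr_info("slow-path work for the enabled feature\n");
    }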
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/cgroup.h>
3#include <linux/slab.h>
4#include <linux/percpu.h>
5#include <linux/spinlock.h>
6#include <linux/cpumask.h>
7#include <linux/seq_file.h>
8#include <linux/rcupdate.h>
9#include <linux/kernel_stat.h>
10#include <linux/err.h>
11
12#include "sched.h"
13
14/* 2/*
15 * CPU accounting code for task groups. 3 * CPU accounting code for task groups.
16 * 4 *
17 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 5 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
18 * (balbir@in.ibm.com). 6 * (balbir@in.ibm.com).
19 */ 7 */
8#include "sched.h"
20 9
21/* Time spent by the tasks of the cpu accounting group executing in ... */ 10/* Time spent by the tasks of the CPU accounting group executing in ... */
22enum cpuacct_stat_index { 11enum cpuacct_stat_index {
23 CPUACCT_STAT_USER, /* ... user mode */ 12 CPUACCT_STAT_USER, /* ... user mode */
24 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 13 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -35,12 +24,12 @@ struct cpuacct_usage {
35 u64 usages[CPUACCT_STAT_NSTATS]; 24 u64 usages[CPUACCT_STAT_NSTATS];
36}; 25};
37 26
38/* track cpu usage of a group of tasks and its child groups */ 27/* track CPU usage of a group of tasks and its child groups */
39struct cpuacct { 28struct cpuacct {
40 struct cgroup_subsys_state css; 29 struct cgroup_subsys_state css;
41 /* cpuusage holds pointer to a u64-type object on every cpu */ 30 /* cpuusage holds pointer to a u64-type object on every CPU */
42 struct cpuacct_usage __percpu *cpuusage; 31 struct cpuacct_usage __percpu *cpuusage;
43 struct kernel_cpustat __percpu *cpustat; 32 struct kernel_cpustat __percpu *cpustat;
44}; 33};
45 34
46static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) 35static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
48 return css ? container_of(css, struct cpuacct, css) : NULL; 37 return css ? container_of(css, struct cpuacct, css) : NULL;
49} 38}
50 39
51/* return cpu accounting group to which this task belongs */ 40/* Return CPU accounting group to which this task belongs */
52static inline struct cpuacct *task_ca(struct task_struct *tsk) 41static inline struct cpuacct *task_ca(struct task_struct *tsk)
53{ 42{
54 return css_ca(task_css(tsk, cpuacct_cgrp_id)); 43 return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
65 .cpuusage = &root_cpuacct_cpuusage, 54 .cpuusage = &root_cpuacct_cpuusage,
66}; 55};
67 56
68/* create a new cpu accounting group */ 57/* Create a new CPU accounting group */
69static struct cgroup_subsys_state * 58static struct cgroup_subsys_state *
70cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) 59cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
71{ 60{
@@ -96,7 +85,7 @@ out:
96 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
97} 86}
98 87
99/* destroy an existing cpu accounting group */ 88/* Destroy an existing CPU accounting group */
100static void cpuacct_css_free(struct cgroup_subsys_state *css) 89static void cpuacct_css_free(struct cgroup_subsys_state *css)
101{ 90{
102 struct cpuacct *ca = css_ca(css); 91 struct cpuacct *ca = css_ca(css);
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
162#endif 151#endif
163} 152}
164 153
165/* return total cpu usage (in nanoseconds) of a group */ 154/* Return total CPU usage (in nanoseconds) of a group */
166static u64 __cpuusage_read(struct cgroup_subsys_state *css, 155static u64 __cpuusage_read(struct cgroup_subsys_state *css,
167 enum cpuacct_stat_index index) 156 enum cpuacct_stat_index index)
168{ 157{
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13 13#include "sched.h"
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include <linux/slab.h>
17#include "cpudeadline.h"
18 14
19static inline int parent(int i) 15static inline int parent(int i)
20{ 16{
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
42 return; 38 return;
43 39
44 /* adapted from lib/prio_heap.c */ 40 /* adapted from lib/prio_heap.c */
45 while(1) { 41 while (1) {
46 u64 largest_dl; 42 u64 largest_dl;
43
47 l = left_child(idx); 44 l = left_child(idx);
48 r = right_child(idx); 45 r = right_child(idx);
49 largest = idx; 46 largest = idx;
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
131 return 1; 128 return 1;
132 } else { 129 } else {
133 int best_cpu = cpudl_maximum(cp); 130 int best_cpu = cpudl_maximum(cp);
131
134 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 132 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
135 133
136 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 134 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
145} 143}
146 144
147/* 145/*
148 * cpudl_clear - remove a cpu from the cpudl max-heap 146 * cpudl_clear - remove a CPU from the cpudl max-heap
149 * @cp: the cpudl max-heap context 147 * @cp: the cpudl max-heap context
150 * @cpu: the target cpu 148 * @cpu: the target CPU
151 * 149 *
152 * Notes: assumes cpu_rq(cpu)->lock is locked 150 * Notes: assumes cpu_rq(cpu)->lock is locked
153 * 151 *
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
186/* 184/*
187 * cpudl_set - update the cpudl max-heap 185 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context 186 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu 187 * @cpu: the target CPU
190 * @dl: the new earliest deadline for this cpu 188 * @dl: the new earliest deadline for this CPU
191 * 189 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked 190 * Notes: assumes cpu_rq(cpu)->lock is locked
193 * 191 *
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
205 old_idx = cp->elements[cpu].idx; 203 old_idx = cp->elements[cpu].idx;
206 if (old_idx == IDX_INVALID) { 204 if (old_idx == IDX_INVALID) {
207 int new_idx = cp->size++; 205 int new_idx = cp->size++;
206
208 cp->elements[new_idx].dl = dl; 207 cp->elements[new_idx].dl = dl;
209 cp->elements[new_idx].cpu = cpu; 208 cp->elements[new_idx].cpu = cpu;
210 cp->elements[cpu].idx = new_idx; 209 cp->elements[cpu].idx = new_idx;
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
221/* 220/*
222 * cpudl_set_freecpu - Set the cpudl.free_cpus 221 * cpudl_set_freecpu - Set the cpudl.free_cpus
223 * @cp: the cpudl max-heap context 222 * @cp: the cpudl max-heap context
224 * @cpu: rd attached cpu 223 * @cpu: rd attached CPU
225 */ 224 */
226void cpudl_set_freecpu(struct cpudl *cp, int cpu) 225void cpudl_set_freecpu(struct cpudl *cp, int cpu)
227{ 226{
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
231/* 230/*
232 * cpudl_clear_freecpu - Clear the cpudl.free_cpus 231 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
233 * @cp: the cpudl max-heap context 232 * @cp: the cpudl max-heap context
234 * @cpu: rd attached cpu 233 * @cpu: rd attached CPU
235 */ 234 */
236void cpudl_clear_freecpu(struct cpudl *cp, int cpu) 235void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
237{ 236{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUDL_H
3#define _LINUX_CPUDL_H
4 2
5#include <linux/sched.h> 3#define IDX_INVALID -1
6#include <linux/sched/deadline.h>
7
8#define IDX_INVALID -1
9 4
10struct cpudl_item { 5struct cpudl_item {
11 u64 dl; 6 u64 dl;
12 int cpu; 7 int cpu;
13 int idx; 8 int idx;
14}; 9};
15 10
16struct cpudl { 11struct cpudl {
17 raw_spinlock_t lock; 12 raw_spinlock_t lock;
18 int size; 13 int size;
19 cpumask_var_t free_cpus; 14 cpumask_var_t free_cpus;
20 struct cpudl_item *elements; 15 struct cpudl_item *elements;
21}; 16};
22 17
23
24#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
25int cpudl_find(struct cpudl *cp, struct task_struct *p, 19int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
26 struct cpumask *later_mask);
27void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 20void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
28void cpudl_clear(struct cpudl *cp, int cpu); 21void cpudl_clear(struct cpudl *cp, int cpu);
29int cpudl_init(struct cpudl *cp); 22int cpudl_init(struct cpudl *cp);
30void cpudl_set_freecpu(struct cpudl *cp, int cpu); 23void cpudl_set_freecpu(struct cpudl *cp, int cpu);
31void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 24void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
32void cpudl_cleanup(struct cpudl *cp); 25void cpudl_cleanup(struct cpudl *cp);
33#endif /* CONFIG_SMP */ 26#endif /* CONFIG_SMP */
34
35#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11
12#include "sched.h" 11#include "sched.h"
13 12
14DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 13DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..2b124811947d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,56 @@
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h>
15#include <linux/kthread.h>
16#include <uapi/linux/sched/types.h>
17#include <linux/slab.h>
18#include <trace/events/power.h>
19
20#include "sched.h" 14#include "sched.h"
21 15
16#include <trace/events/power.h>
17
22struct sugov_tunables { 18struct sugov_tunables {
23 struct gov_attr_set attr_set; 19 struct gov_attr_set attr_set;
24 unsigned int rate_limit_us; 20 unsigned int rate_limit_us;
25}; 21};
26 22
27struct sugov_policy { 23struct sugov_policy {
28 struct cpufreq_policy *policy; 24 struct cpufreq_policy *policy;
29 25
30 struct sugov_tunables *tunables; 26 struct sugov_tunables *tunables;
31 struct list_head tunables_hook; 27 struct list_head tunables_hook;
32 28
33 raw_spinlock_t update_lock; /* For shared policies */ 29 raw_spinlock_t update_lock; /* For shared policies */
34 u64 last_freq_update_time; 30 u64 last_freq_update_time;
35 s64 freq_update_delay_ns; 31 s64 freq_update_delay_ns;
36 unsigned int next_freq; 32 unsigned int next_freq;
37 unsigned int cached_raw_freq; 33 unsigned int cached_raw_freq;
38 34
39 /* The next fields are only needed if fast switch cannot be used. */ 35 /* The next fields are only needed if fast switch cannot be used: */
40 struct irq_work irq_work; 36 struct irq_work irq_work;
41 struct kthread_work work; 37 struct kthread_work work;
42 struct mutex work_lock; 38 struct mutex work_lock;
43 struct kthread_worker worker; 39 struct kthread_worker worker;
44 struct task_struct *thread; 40 struct task_struct *thread;
45 bool work_in_progress; 41 bool work_in_progress;
46 42
47 bool need_freq_update; 43 bool need_freq_update;
48}; 44};
49 45
50struct sugov_cpu { 46struct sugov_cpu {
51 struct update_util_data update_util; 47 struct update_util_data update_util;
52 struct sugov_policy *sg_policy; 48 struct sugov_policy *sg_policy;
53 unsigned int cpu; 49 unsigned int cpu;
54 50
55 bool iowait_boost_pending; 51 bool iowait_boost_pending;
56 unsigned int iowait_boost; 52 unsigned int iowait_boost;
57 unsigned int iowait_boost_max; 53 unsigned int iowait_boost_max;
58 u64 last_update; 54 u64 last_update;
59 55
60 /* The fields below are only needed when sharing a policy. */ 56 /* The fields below are only needed when sharing a policy: */
61 unsigned long util_cfs; 57 unsigned long util_cfs;
62 unsigned long util_dl; 58 unsigned long util_dl;
63 unsigned long max; 59 unsigned long max;
64 unsigned int flags;
65 60
66 /* The field below is for single-CPU policies only. */ 61 /* The field below is for single-CPU policies only: */
67#ifdef CONFIG_NO_HZ_COMMON 62#ifdef CONFIG_NO_HZ_COMMON
68 unsigned long saved_idle_calls; 63 unsigned long saved_idle_calls;
69#endif 64#endif
70}; 65};
71 66
@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
79 74
80 /* 75 /*
81 * Since cpufreq_update_util() is called with rq->lock held for 76 * Since cpufreq_update_util() is called with rq->lock held for
82 * the @target_cpu, our per-cpu data is fully serialized. 77 * the @target_cpu, our per-CPU data is fully serialized.
83 * 78 *
84 * However, drivers cannot in general deal with cross-cpu 79 * However, drivers cannot in general deal with cross-CPU
85 * requests, so while get_next_freq() will work, our 80 * requests, so while get_next_freq() will work, our
86 * sugov_update_commit() call may not for the fast switching platforms. 81 * sugov_update_commit() call may not for the fast switching platforms.
87 * 82 *
@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
111 } 106 }
112 107
113 delta_ns = time - sg_policy->last_freq_update_time; 108 delta_ns = time - sg_policy->last_freq_update_time;
109
114 return delta_ns >= sg_policy->freq_update_delay_ns; 110 return delta_ns >= sg_policy->freq_update_delay_ns;
115} 111}
116 112
@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
186 182
187static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) 183static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
188{ 184{
185 struct rq *rq = cpu_rq(sg_cpu->cpu);
186 unsigned long util;
187
188 if (rq->rt.rt_nr_running) {
189 util = sg_cpu->max;
190 } else {
191 util = sg_cpu->util_dl;
192 if (rq->cfs.h_nr_running)
193 util += sg_cpu->util_cfs;
194 }
195
189 /* 196 /*
190 * Ideally we would like to set util_dl as min/guaranteed freq and 197 * Ideally we would like to set util_dl as min/guaranteed freq and
191 * util_cfs + util_dl as requested freq. However, cpufreq is not yet 198 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
192 * ready for such an interface. So, we only do the latter for now. 199 * ready for such an interface. So, we only do the latter for now.
193 */ 200 */
194 return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); 201 return min(util, sg_cpu->max);
195} 202}
196 203
197static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) 204static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
198{ 205{
199 if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { 206 if (flags & SCHED_CPUFREQ_IOWAIT) {
200 if (sg_cpu->iowait_boost_pending) 207 if (sg_cpu->iowait_boost_pending)
201 return; 208 return;
202 209
@@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
260static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } 267static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
261#endif /* CONFIG_NO_HZ_COMMON */ 268#endif /* CONFIG_NO_HZ_COMMON */
262 269
270/*
271 * Make sugov_should_update_freq() ignore the rate limit when DL
272 * has increased the utilization.
273 */
274static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
275{
276 if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl)
277 sg_policy->need_freq_update = true;
278}
279
263static void sugov_update_single(struct update_util_data *hook, u64 time, 280static void sugov_update_single(struct update_util_data *hook, u64 time,
264 unsigned int flags) 281 unsigned int flags)
265{ 282{
266 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 283 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
267 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 284 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
268 struct cpufreq_policy *policy = sg_policy->policy;
269 unsigned long util, max; 285 unsigned long util, max;
270 unsigned int next_f; 286 unsigned int next_f;
271 bool busy; 287 bool busy;
272 288
273 sugov_set_iowait_boost(sg_cpu, time); 289 sugov_set_iowait_boost(sg_cpu, time, flags);
274 sg_cpu->last_update = time; 290 sg_cpu->last_update = time;
275 291
292 ignore_dl_rate_limit(sg_cpu, sg_policy);
293
276 if (!sugov_should_update_freq(sg_policy, time)) 294 if (!sugov_should_update_freq(sg_policy, time))
277 return; 295 return;
278 296
279 busy = sugov_cpu_is_busy(sg_cpu); 297 busy = sugov_cpu_is_busy(sg_cpu);
280 298
281 if (flags & SCHED_CPUFREQ_RT) { 299 sugov_get_util(sg_cpu);
282 next_f = policy->cpuinfo.max_freq; 300 max = sg_cpu->max;
283 } else { 301 util = sugov_aggregate_util(sg_cpu);
284 sugov_get_util(sg_cpu); 302 sugov_iowait_boost(sg_cpu, &util, &max);
285 max = sg_cpu->max; 303 next_f = get_next_freq(sg_policy, util, max);
286 util = sugov_aggregate_util(sg_cpu); 304 /*
287 sugov_iowait_boost(sg_cpu, &util, &max); 305 * Do not reduce the frequency if the CPU has not been idle
288 next_f = get_next_freq(sg_policy, util, max); 306 * recently, as the reduction is likely to be premature then.
289 /* 307 */
290 * Do not reduce the frequency if the CPU has not been idle 308 if (busy && next_f < sg_policy->next_freq) {
291 * recently, as the reduction is likely to be premature then. 309 next_f = sg_policy->next_freq;
292 */
293 if (busy && next_f < sg_policy->next_freq) {
294 next_f = sg_policy->next_freq;
295 310
296 /* Reset cached freq as next_freq has changed */ 311 /* Reset cached freq as next_freq has changed */
297 sg_policy->cached_raw_freq = 0; 312 sg_policy->cached_raw_freq = 0;
298 }
299 } 313 }
314
300 sugov_update_commit(sg_policy, time, next_f); 315 sugov_update_commit(sg_policy, time, next_f);
301} 316}
302 317
@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
312 unsigned long j_util, j_max; 327 unsigned long j_util, j_max;
313 s64 delta_ns; 328 s64 delta_ns;
314 329
330 sugov_get_util(j_sg_cpu);
331
315 /* 332 /*
316 * If the CFS CPU utilization was last updated before the 333 * If the CFS CPU utilization was last updated before the
317 * previous frequency update and the time elapsed between the 334 * previous frequency update and the time elapsed between the
@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
325 if (delta_ns > TICK_NSEC) { 342 if (delta_ns > TICK_NSEC) {
326 j_sg_cpu->iowait_boost = 0; 343 j_sg_cpu->iowait_boost = 0;
327 j_sg_cpu->iowait_boost_pending = false; 344 j_sg_cpu->iowait_boost_pending = false;
328 j_sg_cpu->util_cfs = 0;
329 if (j_sg_cpu->util_dl == 0)
330 continue;
331 } 345 }
332 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
333 return policy->cpuinfo.max_freq;
334 346
335 j_max = j_sg_cpu->max; 347 j_max = j_sg_cpu->max;
336 j_util = sugov_aggregate_util(j_sg_cpu); 348 j_util = sugov_aggregate_util(j_sg_cpu);
349 sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
337 if (j_util * max > j_max * util) { 350 if (j_util * max > j_max * util) {
338 util = j_util; 351 util = j_util;
339 max = j_max; 352 max = j_max;
340 } 353 }
341
342 sugov_iowait_boost(j_sg_cpu, &util, &max);
343 } 354 }
344 355
345 return get_next_freq(sg_policy, util, max); 356 return get_next_freq(sg_policy, util, max);
346} 357}
347 358
348static void sugov_update_shared(struct update_util_data *hook, u64 time, 359static void
349 unsigned int flags) 360sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
350{ 361{
351 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 362 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
352 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 363 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
354 365
355 raw_spin_lock(&sg_policy->update_lock); 366 raw_spin_lock(&sg_policy->update_lock);
356 367
357 sugov_get_util(sg_cpu); 368 sugov_set_iowait_boost(sg_cpu, time, flags);
358 sg_cpu->flags = flags;
359
360 sugov_set_iowait_boost(sg_cpu, time);
361 sg_cpu->last_update = time; 369 sg_cpu->last_update = time;
362 370
363 if (sugov_should_update_freq(sg_policy, time)) { 371 ignore_dl_rate_limit(sg_cpu, sg_policy);
364 if (flags & SCHED_CPUFREQ_RT)
365 next_f = sg_policy->policy->cpuinfo.max_freq;
366 else
367 next_f = sugov_next_freq_shared(sg_cpu, time);
368 372
373 if (sugov_should_update_freq(sg_policy, time)) {
374 next_f = sugov_next_freq_shared(sg_cpu, time);
369 sugov_update_commit(sg_policy, time, next_f); 375 sugov_update_commit(sg_policy, time, next_f);
370 } 376 }
371 377
@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
423 return sprintf(buf, "%u\n", tunables->rate_limit_us); 429 return sprintf(buf, "%u\n", tunables->rate_limit_us);
424} 430}
425 431
426static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, 432static ssize_t
427 size_t count) 433rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
428{ 434{
429 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 435 struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
430 struct sugov_policy *sg_policy; 436 struct sugov_policy *sg_policy;
@@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
479{ 485{
480 struct task_struct *thread; 486 struct task_struct *thread;
481 struct sched_attr attr = { 487 struct sched_attr attr = {
482 .size = sizeof(struct sched_attr), 488 .size = sizeof(struct sched_attr),
483 .sched_policy = SCHED_DEADLINE, 489 .sched_policy = SCHED_DEADLINE,
484 .sched_flags = SCHED_FLAG_SUGOV, 490 .sched_flags = SCHED_FLAG_SUGOV,
485 .sched_nice = 0, 491 .sched_nice = 0,
486 .sched_priority = 0, 492 .sched_priority = 0,
487 /* 493 /*
488 * Fake (unused) bandwidth; workaround to "fix" 494 * Fake (unused) bandwidth; workaround to "fix"
489 * priority inheritance. 495 * priority inheritance.
@@ -663,21 +669,20 @@ static int sugov_start(struct cpufreq_policy *policy)
663 struct sugov_policy *sg_policy = policy->governor_data; 669 struct sugov_policy *sg_policy = policy->governor_data;
664 unsigned int cpu; 670 unsigned int cpu;
665 671
666 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; 672 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
667 sg_policy->last_freq_update_time = 0; 673 sg_policy->last_freq_update_time = 0;
668 sg_policy->next_freq = UINT_MAX; 674 sg_policy->next_freq = UINT_MAX;
669 sg_policy->work_in_progress = false; 675 sg_policy->work_in_progress = false;
670 sg_policy->need_freq_update = false; 676 sg_policy->need_freq_update = false;
671 sg_policy->cached_raw_freq = 0; 677 sg_policy->cached_raw_freq = 0;
672 678
673 for_each_cpu(cpu, policy->cpus) { 679 for_each_cpu(cpu, policy->cpus) {
674 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); 680 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
675 681
676 memset(sg_cpu, 0, sizeof(*sg_cpu)); 682 memset(sg_cpu, 0, sizeof(*sg_cpu));
677 sg_cpu->cpu = cpu; 683 sg_cpu->cpu = cpu;
678 sg_cpu->sg_policy = sg_policy; 684 sg_cpu->sg_policy = sg_policy;
679 sg_cpu->flags = 0; 685 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
680 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
681 } 686 }
682 687
683 for_each_cpu(cpu, policy->cpus) { 688 for_each_cpu(cpu, policy->cpus) {
@@ -721,14 +726,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
721} 726}
722 727
723static struct cpufreq_governor schedutil_gov = { 728static struct cpufreq_governor schedutil_gov = {
724 .name = "schedutil", 729 .name = "schedutil",
725 .owner = THIS_MODULE, 730 .owner = THIS_MODULE,
726 .dynamic_switching = true, 731 .dynamic_switching = true,
727 .init = sugov_init, 732 .init = sugov_init,
728 .exit = sugov_exit, 733 .exit = sugov_exit,
729 .start = sugov_start, 734 .start = sugov_start,
730 .stop = sugov_stop, 735 .stop = sugov_stop,
731 .limits = sugov_limits, 736 .limits = sugov_limits,
732}; 737};
733 738
734#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 739#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for CPUs
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,12 +26,7 @@
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29#include "sched.h"
30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
33#include <linux/slab.h>
34#include "cpupri.h"
35 30
36/* Convert between a 140 based task->prio, and our 102 based cpupri */ 31/* Convert between a 140 based task->prio, and our 102 based cpupri */
37static int convert_prio(int prio) 32static int convert_prio(int prio)
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
128} 123}
129 124
130/** 125/**
131 * cpupri_set - update the cpu priority setting 126 * cpupri_set - update the CPU priority setting
132 * @cp: The cpupri context 127 * @cp: The cpupri context
133 * @cpu: The target cpu 128 * @cpu: The target CPU
134 * @newpri: The priority (INVALID-RT99) to assign to this CPU 129 * @newpri: The priority (INVALID-RT99) to assign to this CPU
135 * 130 *
136 * Note: Assumes cpu_rq(cpu)->lock is locked 131 * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
151 return; 146 return;
152 147
153 /* 148 /*
154 * If the cpu was currently mapped to a different value, we 149 * If the CPU was currently mapped to a different value, we
155 * need to map it to the new value then remove the old value. 150 * need to map it to the new value then remove the old value.
156 * Note, we must add the new value first, otherwise we risk the 151 * Note, we must add the new value first, otherwise we risk the
157 * cpu being missed by the priority loop in cpupri_find. 152 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,25 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUPRI_H
3#define _LINUX_CPUPRI_H
4
5#include <linux/sched.h>
6 2
7#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 3#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
8 4
9#define CPUPRI_INVALID -1 5#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 6#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1 7#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */ 8/* values 2-101 are RT priorities 0-99 */
13 9
14struct cpupri_vec { 10struct cpupri_vec {
15 atomic_t count; 11 atomic_t count;
16 cpumask_var_t mask; 12 cpumask_var_t mask;
17}; 13};
18 14
19struct cpupri { 15struct cpupri {
20 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 16 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
21 int *cpu_to_pri; 17 int *cpu_to_pri;
22}; 18};
23 19
24#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
25int cpupri_find(struct cpupri *cp, 21int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
26 struct task_struct *p, struct cpumask *lowest_mask);
27void cpupri_set(struct cpupri *cp, int cpu, int pri); 22void cpupri_set(struct cpupri *cp, int cpu, int pri);
28int cpupri_init(struct cpupri *cp); 23int cpupri_init(struct cpupri *cp);
29void cpupri_cleanup(struct cpupri *cp); 24void cpupri_cleanup(struct cpupri *cp);
30#endif 25#endif
31
32#endif /* _LINUX_CPUPRI_H */
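The comment at the top of the cpupri.c diff above describes a two-dimensional bitmap: one CPU mask per priority level, so routing a task is a scan over at most CPUPRI_NR_PRIORITIES levels with an O(1) mask test each, independent of the number of tasks. A toy model under the assumption of at most 64 CPUs and a plain integer mask (toy_cpupri and toy_find are illustrative, not the kernel's cpumask-based structures shown above):

#define TOY_NR_LEVELS	102	/* mirrors CPUPRI_NR_PRIORITIES */

struct toy_cpupri {
	unsigned long long cpus_at[TOY_NR_LEVELS];	/* bit c set: CPU c runs at this level */
};

/*
 * Find the lowest-priority level below 'pri' that still has a CPU the task
 * may run on; return that level and the eligible CPUs in *lowest_mask.
 */
static int toy_find(const struct toy_cpupri *cp, int pri,
		    unsigned long long affinity, unsigned long long *lowest_mask)
{
	int level;

	for (level = 0; level < pri; level++) {
		unsigned long long eligible = cp->cpus_at[level] & affinity;

		if (eligible) {
			*lowest_mask = eligible;
			return level;
		}
	}
	return -1;	/* every allowed CPU already runs at >= pri */
}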
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
1#include <linux/export.h> 1/*
2#include <linux/sched.h> 2 * Simple CPU accounting cgroup controller
3#include <linux/tsacct_kern.h> 3 */
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
7#include <linux/sched/cputime.h>
8#include "sched.h" 4#include "sched.h"
9 5
10#ifdef CONFIG_IRQ_TIME_ACCOUNTING 6#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
113} 109}
114 110
115/* 111/*
116 * Account user cpu time to a process. 112 * Account user CPU time to a process.
117 * @p: the process that the cpu time gets accounted to 113 * @p: the process that the CPU time gets accounted to
118 * @cputime: the cpu time spent in user space since the last update 114 * @cputime: the CPU time spent in user space since the last update
119 */ 115 */
120void account_user_time(struct task_struct *p, u64 cputime) 116void account_user_time(struct task_struct *p, u64 cputime)
121{ 117{
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
135} 131}
136 132
137/* 133/*
138 * Account guest cpu time to a process. 134 * Account guest CPU time to a process.
139 * @p: the process that the cpu time gets accounted to 135 * @p: the process that the CPU time gets accounted to
140 * @cputime: the cpu time spent in virtual machine since the last update 136 * @cputime: the CPU time spent in virtual machine since the last update
141 */ 137 */
142void account_guest_time(struct task_struct *p, u64 cputime) 138void account_guest_time(struct task_struct *p, u64 cputime)
143{ 139{
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
159} 155}
160 156
161/* 157/*
162 * Account system cpu time to a process and desired cpustat field 158 * Account system CPU time to a process and desired cpustat field
163 * @p: the process that the cpu time gets accounted to 159 * @p: the process that the CPU time gets accounted to
164 * @cputime: the cpu time spent in kernel space since the last update 160 * @cputime: the CPU time spent in kernel space since the last update
165 * @index: pointer to cpustat field that has to be updated 161 * @index: pointer to cpustat field that has to be updated
166 */ 162 */
167void account_system_index_time(struct task_struct *p, 163void account_system_index_time(struct task_struct *p,
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
179} 175}
180 176
181/* 177/*
182 * Account system cpu time to a process. 178 * Account system CPU time to a process.
183 * @p: the process that the cpu time gets accounted to 179 * @p: the process that the CPU time gets accounted to
184 * @hardirq_offset: the offset to subtract from hardirq_count() 180 * @hardirq_offset: the offset to subtract from hardirq_count()
185 * @cputime: the cpu time spent in kernel space since the last update 181 * @cputime: the CPU time spent in kernel space since the last update
186 */ 182 */
187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 183void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
188{ 184{
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
205 201
206/* 202/*
207 * Account for involuntary wait time. 203 * Account for involuntary wait time.
208 * @cputime: the cpu time spent in involuntary wait 204 * @cputime: the CPU time spent in involuntary wait
209 */ 205 */
210void account_steal_time(u64 cputime) 206void account_steal_time(u64 cputime)
211{ 207{
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
216 212
217/* 213/*
218 * Account for idle time. 214 * Account for idle time.
219 * @cputime: the cpu time spent in idle wait 215 * @cputime: the CPU time spent in idle wait
220 */ 216 */
221void account_idle_time(u64 cputime) 217void account_idle_time(u64 cputime)
222{ 218{
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
338#ifdef CONFIG_IRQ_TIME_ACCOUNTING 334#ifdef CONFIG_IRQ_TIME_ACCOUNTING
339/* 335/*
340 * Account a tick to a process and cpustat 336 * Account a tick to a process and cpustat
341 * @p: the process that the cpu time gets accounted to 337 * @p: the process that the CPU time gets accounted to
342 * @user_tick: is the tick from userspace 338 * @user_tick: is the tick from userspace
343 * @rq: the pointer to rq 339 * @rq: the pointer to rq
344 * 340 *
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
400 irqtime_account_process_tick(current, 0, rq, ticks); 396 irqtime_account_process_tick(current, 0, rq, ticks);
401} 397}
402#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 398#else /* CONFIG_IRQ_TIME_ACCOUNTING */
403static inline void irqtime_account_idle_ticks(int ticks) {} 399static inline void irqtime_account_idle_ticks(int ticks) { }
404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 400static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
405 struct rq *rq, int nr_ticks) {} 401 struct rq *rq, int nr_ticks) { }
406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 402#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
407 403
408/* 404/*
409 * Use precise platform statistics if available: 405 * Use precise platform statistics if available:
410 */ 406 */
411#ifdef CONFIG_VIRT_CPU_ACCOUNTING 407#ifdef CONFIG_VIRT_CPU_ACCOUNTING
412 408# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
413#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
414void vtime_common_task_switch(struct task_struct *prev) 409void vtime_common_task_switch(struct task_struct *prev)
415{ 410{
416 if (is_idle_task(prev)) 411 if (is_idle_task(prev))
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
421 vtime_flush(prev); 416 vtime_flush(prev);
422 arch_vtime_task_switch(prev); 417 arch_vtime_task_switch(prev);
423} 418}
424#endif 419# endif
425
426#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 420#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
427 421
428 422
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
469 *ut = cputime.utime; 463 *ut = cputime.utime;
470 *st = cputime.stime; 464 *st = cputime.stime;
471} 465}
472#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 466
467#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
468
473/* 469/*
474 * Account a single tick of cpu time. 470 * Account a single tick of CPU time.
475 * @p: the process that the cpu time gets accounted to 471 * @p: the process that the CPU time gets accounted to
476 * @user_tick: indicates if the tick is a user or a system tick 472 * @user_tick: indicates if the tick is a user or a system tick
477 */ 473 */
478void account_process_tick(struct task_struct *p, int user_tick) 474void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..d1c7bf7c7e5b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19 19
20#include <linux/slab.h>
21#include <uapi/linux/sched/types.h>
22
23struct dl_bandwidth def_dl_bandwidth; 20struct dl_bandwidth def_dl_bandwidth;
24 21
25static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) 22static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
87 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ 84 SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
88 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); 85 SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
89 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 86 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
90 cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); 87 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
91} 88}
92 89
93static inline 90static inline
@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
101 if (dl_rq->running_bw > old) 98 if (dl_rq->running_bw > old)
102 dl_rq->running_bw = 0; 99 dl_rq->running_bw = 0;
103 /* kick cpufreq (see the comment in kernel/sched/sched.h). */ 100 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
104 cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); 101 cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
105} 102}
106 103
107static inline 104static inline
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
514static void push_dl_tasks(struct rq *); 511static void push_dl_tasks(struct rq *);
515static void pull_dl_task(struct rq *); 512static void pull_dl_task(struct rq *);
516 513
517static inline void queue_push_tasks(struct rq *rq) 514static inline void deadline_queue_push_tasks(struct rq *rq)
518{ 515{
519 if (!has_pushable_dl_tasks(rq)) 516 if (!has_pushable_dl_tasks(rq))
520 return; 517 return;
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
522 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); 519 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
523} 520}
524 521
525static inline void queue_pull_task(struct rq *rq) 522static inline void deadline_queue_pull_task(struct rq *rq)
526{ 523{
527 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); 524 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
528} 525}
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
539 536
540 /* 537 /*
541 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
542 * online cpu. 539 * online CPU:
543 */ 540 */
544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
545 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
546 /* 543 /*
547 * Fail to find any suitable cpu. 544 * Failed to find any suitable CPU.
548 * The task will never come back! 545 * The task will never come back!
549 */ 546 */
550 BUG_ON(dl_bandwidth_enabled()); 547 BUG_ON(dl_bandwidth_enabled());
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
597{ 594{
598} 595}
599 596
600static inline void queue_push_tasks(struct rq *rq) 597static inline void deadline_queue_push_tasks(struct rq *rq)
601{ 598{
602} 599}
603 600
604static inline void queue_pull_task(struct rq *rq) 601static inline void deadline_queue_pull_task(struct rq *rq)
605{ 602{
606} 603}
607#endif /* CONFIG_SMP */ 604#endif /* CONFIG_SMP */
608 605
609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 606static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 607static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 608static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
612 int flags);
613 609
614/* 610/*
615 * We are being explicitly informed that a new instance is starting, 611 * We are being explicitly informed that a new instance is starting,
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1763 if (hrtick_enabled(rq)) 1759 if (hrtick_enabled(rq))
1764 start_hrtick_dl(rq, p); 1760 start_hrtick_dl(rq, p);
1765 1761
1766 queue_push_tasks(rq); 1762 deadline_queue_push_tasks(rq);
1767 1763
1768 return p; 1764 return p;
1769} 1765}
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1776 enqueue_pushable_dl_task(rq, p); 1772 enqueue_pushable_dl_task(rq, p);
1777} 1773}
1778 1774
1775/*
1776 * scheduler tick hitting a task of our scheduling class.
1777 *
1778 * NOTE: This function can be called remotely by the tick offload that
1779 * goes along with full dynticks. Therefore no local assumption can be made
1780 * and everything must be accessed through the @rq and @curr passed in
1781 * parameters.
1782 */
1779static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) 1783static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1780{ 1784{
1781 update_curr_dl(rq); 1785 update_curr_dl(rq);
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
1865 1869
1866 /* 1870 /*
1867 * We have to consider system topology and task affinity 1871 * We have to consider system topology and task affinity
1868 * first, then we can look for a suitable cpu. 1872 * first, then we can look for a suitable CPU.
1869 */ 1873 */
1870 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) 1874 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
1871 return -1; 1875 return -1;
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
1879 * Now we check how well this matches with task's 1883 * Now we check how well this matches with task's
1880 * affinity and system topology. 1884 * affinity and system topology.
1881 * 1885 *
1882 * The last cpu where the task ran is our first 1886 * The last CPU where the task ran is our first
1883 * guess, since it is most likely cache-hot there. 1887 * guess, since it is most likely cache-hot there.
1884 */ 1888 */
1885 if (cpumask_test_cpu(cpu, later_mask)) 1889 if (cpumask_test_cpu(cpu, later_mask))
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
1909 best_cpu = cpumask_first_and(later_mask, 1913 best_cpu = cpumask_first_and(later_mask,
1910 sched_domain_span(sd)); 1914 sched_domain_span(sd));
1911 /* 1915 /*
1912 * Last chance: if a cpu being in both later_mask 1916 * Last chance: if a CPU being in both later_mask
1913 * and current sd span is valid, that becomes our 1917 * and current sd span is valid, that becomes our
1914 * choice. Of course, the latest possible cpu is 1918 * choice. Of course, the latest possible CPU is
1915 * already under consideration through later_mask. 1919 * already under consideration through later_mask.
1916 */ 1920 */
1917 if (best_cpu < nr_cpu_ids) { 1921 if (best_cpu < nr_cpu_ids) {
@@ -2067,7 +2071,7 @@ retry:
2067 if (task == next_task) { 2071 if (task == next_task) {
2068 /* 2072 /*
2069 * The task is still there. We don't try 2073 * The task is still there. We don't try
2070 * again, some other cpu will pull it when ready. 2074 * again, some other CPU will pull it when ready.
2071 */ 2075 */
2072 goto out; 2076 goto out;
2073 } 2077 }
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
2300 /* 2304 /*
2301 * Since this might be the only -deadline task on the rq, 2305 * Since this might be the only -deadline task on the rq,
2302 * this is the right place to try to pull some other one 2306 * this is the right place to try to pull some other one
2303 * from an overloaded cpu, if any. 2307 * from an overloaded CPU, if any.
2304 */ 2308 */
2305 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) 2309 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2306 return; 2310 return;
2307 2311
2308 queue_pull_task(rq); 2312 deadline_queue_pull_task(rq);
2309} 2313}
2310 2314
2311/* 2315/*
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
2327 if (rq->curr != p) { 2331 if (rq->curr != p) {
2328#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2329 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 2333 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
2330 queue_push_tasks(rq); 2334 deadline_queue_push_tasks(rq);
2331#endif 2335#endif
2332 if (dl_task(rq->curr)) 2336 if (dl_task(rq->curr))
2333 check_preempt_curr_dl(rq, p, 0); 2337 check_preempt_curr_dl(rq, p, 0);
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
2352 * or lowering its prio, so... 2356 * or lowering its prio, so...
2353 */ 2357 */
2354 if (!rq->dl.overloaded) 2358 if (!rq->dl.overloaded)
2355 queue_pull_task(rq); 2359 deadline_queue_pull_task(rq);
2356 2360
2357 /* 2361 /*
2358 * If we now have an earlier deadline task than p, 2362 * If we now have an earlier deadline task than p,
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
2626{ 2630{
2627 struct sched_dl_entity *dl_se = &p->dl; 2631 struct sched_dl_entity *dl_se = &p->dl;
2628 2632
2629 dl_se->dl_runtime = 0; 2633 dl_se->dl_runtime = 0;
2630 dl_se->dl_deadline = 0; 2634 dl_se->dl_deadline = 0;
2631 dl_se->dl_period = 0; 2635 dl_se->dl_period = 0;
2632 dl_se->flags = 0; 2636 dl_se->flags = 0;
2633 dl_se->dl_bw = 0; 2637 dl_se->dl_bw = 0;
2634 dl_se->dl_density = 0; 2638 dl_se->dl_density = 0;
2635 2639
2636 dl_se->dl_throttled = 0; 2640 dl_se->dl_throttled = 0;
2637 dl_se->dl_yielded = 0; 2641 dl_se->dl_yielded = 0;
2638 dl_se->dl_non_contending = 0; 2642 dl_se->dl_non_contending = 0;
2639 dl_se->dl_overrun = 0; 2643 dl_se->dl_overrun = 0;
2640} 2644}
2641 2645
2642bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 2646bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2655#ifdef CONFIG_SMP 2659#ifdef CONFIG_SMP
2656int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) 2660int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2657{ 2661{
2658 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 2662 unsigned int dest_cpu;
2659 cs_cpus_allowed);
2660 struct dl_bw *dl_b; 2663 struct dl_bw *dl_b;
2661 bool overflow; 2664 bool overflow;
2662 int cpus, ret; 2665 int cpus, ret;
2663 unsigned long flags; 2666 unsigned long flags;
2664 2667
2668 dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
2669
2665 rcu_read_lock_sched(); 2670 rcu_read_lock_sched();
2666 dl_b = dl_bw_of(dest_cpu); 2671 dl_b = dl_bw_of(dest_cpu);
2667 raw_spin_lock_irqsave(&dl_b->lock, flags); 2672 raw_spin_lock_irqsave(&dl_b->lock, flags);
2668 cpus = dl_bw_cpus(dest_cpu); 2673 cpus = dl_bw_cpus(dest_cpu);
2669 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 2674 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2670 if (overflow) 2675 if (overflow) {
2671 ret = -EBUSY; 2676 ret = -EBUSY;
2672 else { 2677 } else {
2673 /* 2678 /*
2674 * We reserve space for this task in the destination 2679 * We reserve space for this task in the destination
2675 * root_domain, as we can't fail after this point. 2680 * root_domain, as we can't fail after this point.
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
2681 } 2686 }
2682 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2687 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2683 rcu_read_unlock_sched(); 2688 rcu_read_unlock_sched();
2689
2684 return ret; 2690 return ret;
2685} 2691}
2686 2692
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2701 ret = 0; 2707 ret = 0;
2702 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 2708 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2703 rcu_read_unlock_sched(); 2709 rcu_read_unlock_sched();
2710
2704 return ret; 2711 return ret;
2705} 2712}
2706 2713
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
2718 overflow = __dl_overflow(dl_b, cpus, 0, 0); 2725 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2719 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2726 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2720 rcu_read_unlock_sched(); 2727 rcu_read_unlock_sched();
2728
2721 return overflow; 2729 return overflow;
2722} 2730}
2723#endif 2731#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 72c401b3b15c..15b10e210a6b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * kernel/sched/debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree and other debugging details
5 * 5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 * 7 *
@@ -9,16 +9,6 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched/mm.h>
15#include <linux/sched/task.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/utsname.h>
19#include <linux/mempolicy.h>
20#include <linux/debugfs.h>
21
22#include "sched.h" 12#include "sched.h"
23 13
24static DEFINE_SPINLOCK(sched_debug_lock); 14static DEFINE_SPINLOCK(sched_debug_lock);
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
274 if (table == NULL) 264 if (table == NULL)
275 return NULL; 265 return NULL;
276 266
277 set_table_entry(&table[0], "min_interval", &sd->min_interval, 267 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
278 sizeof(long), 0644, proc_doulongvec_minmax, false); 268 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
279 set_table_entry(&table[1], "max_interval", &sd->max_interval, 269 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
280 sizeof(long), 0644, proc_doulongvec_minmax, false); 270 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
281 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 271 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
282 sizeof(int), 0644, proc_dointvec_minmax, true); 272 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
283 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 273 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
284 sizeof(int), 0644, proc_dointvec_minmax, true); 274 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
285 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 275 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
286 sizeof(int), 0644, proc_dointvec_minmax, true); 276 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 277 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
288 sizeof(int), 0644, proc_dointvec_minmax, true); 278 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
289 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 279 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
290 sizeof(int), 0644, proc_dointvec_minmax, true);
291 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
292 sizeof(int), 0644, proc_dointvec_minmax, false);
293 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
294 sizeof(int), 0644, proc_dointvec_minmax, false);
295 set_table_entry(&table[9], "cache_nice_tries",
296 &sd->cache_nice_tries,
297 sizeof(int), 0644, proc_dointvec_minmax, false);
298 set_table_entry(&table[10], "flags", &sd->flags,
299 sizeof(int), 0644, proc_dointvec_minmax, false);
300 set_table_entry(&table[11], "max_newidle_lb_cost",
301 &sd->max_newidle_lb_cost,
302 sizeof(long), 0644, proc_doulongvec_minmax, false);
303 set_table_entry(&table[12], "name", sd->name,
304 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
305 /* &table[13] is terminator */ 280 /* &table[13] is terminator */
306 281
307 return table; 282 return table;
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
332 return table; 307 return table;
333} 308}
334 309
335static cpumask_var_t sd_sysctl_cpus; 310static cpumask_var_t sd_sysctl_cpus;
336static struct ctl_table_header *sd_sysctl_header; 311static struct ctl_table_header *sd_sysctl_header;
337 312
338void register_sched_domain_sysctl(void) 313void register_sched_domain_sysctl(void)
339{ 314{
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
413{ 388{
414 struct sched_entity *se = tg->se[cpu]; 389 struct sched_entity *se = tg->se[cpu];
415 390
416#define P(F) \ 391#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
417 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 392#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
418#define P_SCHEDSTAT(F) \ 393#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
419 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) 394#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
420#define PN(F) \
421 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
422#define PN_SCHEDSTAT(F) \
423 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
424 395
425 if (!se) 396 if (!se)
426 return; 397 return;
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
428 PN(se->exec_start); 399 PN(se->exec_start);
429 PN(se->vruntime); 400 PN(se->vruntime);
430 PN(se->sum_exec_runtime); 401 PN(se->sum_exec_runtime);
402
431 if (schedstat_enabled()) { 403 if (schedstat_enabled()) {
432 PN_SCHEDSTAT(se->statistics.wait_start); 404 PN_SCHEDSTAT(se->statistics.wait_start);
433 PN_SCHEDSTAT(se->statistics.sleep_start); 405 PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
440 PN_SCHEDSTAT(se->statistics.wait_sum); 412 PN_SCHEDSTAT(se->statistics.wait_sum);
441 P_SCHEDSTAT(se->statistics.wait_count); 413 P_SCHEDSTAT(se->statistics.wait_count);
442 } 414 }
415
443 P(se->load.weight); 416 P(se->load.weight);
444 P(se->runnable_weight); 417 P(se->runnable_weight);
445#ifdef CONFIG_SMP 418#ifdef CONFIG_SMP
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
464 return group_path; 437 return group_path;
465 438
466 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 439 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
440
467 return group_path; 441 return group_path;
468} 442}
469#endif 443#endif
@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
569 cfs_rq->avg.runnable_load_avg); 543 cfs_rq->avg.runnable_load_avg);
570 SEQ_printf(m, " .%-30s: %lu\n", "util_avg", 544 SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
571 cfs_rq->avg.util_avg); 545 cfs_rq->avg.util_avg);
546 SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
547 cfs_rq->avg.util_est.enqueued);
572 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", 548 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
573 cfs_rq->removed.load_avg); 549 cfs_rq->removed.load_avg);
574 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", 550 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void)
804/* 780/*
805 * This iterator needs some explanation. 781 * This iterator needs some explanation.
806 * It returns 1 for the header position. 782 * It returns 1 for the header position.
807 * This means 2 is cpu 0. 783 * This means 2 is CPU 0.
808 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 784 * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
809 * to use cpumask_* to iterate over the cpus. 785 * to use cpumask_* to iterate over the CPUs.
810 */ 786 */
811static void *sched_debug_start(struct seq_file *file, loff_t *offset) 787static void *sched_debug_start(struct seq_file *file, loff_t *offset)
812{ 788{
@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
826 802
827 if (n < nr_cpu_ids) 803 if (n < nr_cpu_ids)
828 return (void *)(unsigned long)(n + 2); 804 return (void *)(unsigned long)(n + 2);
805
829 return NULL; 806 return NULL;
830} 807}
831 808
@@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
840} 817}
841 818
842static const struct seq_operations sched_debug_sops = { 819static const struct seq_operations sched_debug_sops = {
843 .start = sched_debug_start, 820 .start = sched_debug_start,
844 .next = sched_debug_next, 821 .next = sched_debug_next,
845 .stop = sched_debug_stop, 822 .stop = sched_debug_stop,
846 .show = sched_debug_show, 823 .show = sched_debug_show,
847}; 824};
848 825
849static int sched_debug_release(struct inode *inode, struct file *file) 826static int sched_debug_release(struct inode *inode, struct file *file)
@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void)
881 858
882__initcall(init_sched_debug_procfs); 859__initcall(init_sched_debug_procfs);
883 860
884#define __P(F) \ 861#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
885 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 862#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
886#define P(F) \ 863#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
887 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 864#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
888#define __PN(F) \
889 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
890#define PN(F) \
891 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
892 865
893 866
894#ifdef CONFIG_NUMA_BALANCING 867#ifdef CONFIG_NUMA_BALANCING
@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
1023 P(se.avg.runnable_load_avg); 996 P(se.avg.runnable_load_avg);
1024 P(se.avg.util_avg); 997 P(se.avg.util_avg);
1025 P(se.avg.last_update_time); 998 P(se.avg.last_update_time);
999 P(se.avg.util_est.ewma);
1000 P(se.avg.util_est.enqueued);
1026#endif 1001#endif
1027 P(policy); 1002 P(policy);
1028 P(prio); 1003 P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..0951d1c58d2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,10 @@
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */ 22 */
23 23#include "sched.h"
24#include <linux/sched/mm.h>
25#include <linux/sched/topology.h>
26
27#include <linux/latencytop.h>
28#include <linux/cpumask.h>
29#include <linux/cpuidle.h>
30#include <linux/slab.h>
31#include <linux/profile.h>
32#include <linux/interrupt.h>
33#include <linux/mempolicy.h>
34#include <linux/migrate.h>
35#include <linux/task_work.h>
36#include <linux/sched/isolation.h>
37 24
38#include <trace/events/sched.h> 25#include <trace/events/sched.h>
39 26
40#include "sched.h"
41
42/* 27/*
43 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
44 * 29 *
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
103 88
104#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
105/* 90/*
106 * For asym packing, by default the lower numbered cpu has higher priority. 91 * For asym packing, by default the lower numbered CPU has higher priority.
107 */ 92 */
108int __weak arch_asym_cpu_priority(int cpu) 93int __weak arch_asym_cpu_priority(int cpu)
109{ 94{
@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
787 * For !fair tasks do: 772 * For !fair tasks do:
788 * 773 *
789 update_cfs_rq_load_avg(now, cfs_rq); 774 update_cfs_rq_load_avg(now, cfs_rq);
790 attach_entity_load_avg(cfs_rq, se); 775 attach_entity_load_avg(cfs_rq, se, 0);
791 switched_from_fair(rq, p); 776 switched_from_fair(rq, p);
792 * 777 *
793 * such that the next switched_to_fair() has the 778 * such that the next switched_to_fair() has the
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
1181} 1166}
1182 1167
1183/* 1168/*
1184 * The averaged statistics, shared & private, memory & cpu, 1169 * The averaged statistics, shared & private, memory & CPU,
1185 * occupy the first half of the array. The second half of the 1170 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the 1171 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement. 1172 * first set by task_numa_placement.
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
1587 * be incurred if the tasks were swapped. 1572 * be incurred if the tasks were swapped.
1588 */ 1573 */
1589 if (cur) { 1574 if (cur) {
1590 /* Skip this swap candidate if cannot move to the source cpu */ 1575 /* Skip this swap candidate if cannot move to the source CPU: */
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1576 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock; 1577 goto unlock;
1593 1578
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
1631 goto balance; 1616 goto balance;
1632 } 1617 }
1633 1618
1634 /* Balance doesn't matter much if we're running a task per cpu */ 1619 /* Balance doesn't matter much if we're running a task per CPU: */
1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1620 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1) 1621 dst_rq->nr_running == 1)
1637 goto assign; 1622 goto assign;
@@ -1676,7 +1661,7 @@ balance:
1676 */ 1661 */
1677 if (!cur) { 1662 if (!cur) {
1678 /* 1663 /*
1679 * select_idle_siblings() uses a per-cpu cpumask that 1664 * select_idle_siblings() uses a per-CPU cpumask that
1680 * can be used from IRQ context. 1665 * can be used from IRQ context.
1681 */ 1666 */
1682 local_irq_disable(); 1667 local_irq_disable();
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
1869static void numa_migrate_preferred(struct task_struct *p) 1854static void numa_migrate_preferred(struct task_struct *p)
1870{ 1855{
1871 unsigned long interval = HZ; 1856 unsigned long interval = HZ;
1857 unsigned long numa_migrate_retry;
1872 1858
1873 /* This task has no NUMA fault statistics yet */ 1859 /* This task has no NUMA fault statistics yet */
1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1860 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
1876 1862
1877 /* Periodically retry migrating the task to the preferred node */ 1863 /* Periodically retry migrating the task to the preferred node */
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1864 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval; 1865 numa_migrate_retry = jiffies + interval;
1866
1867 /*
1868 * Check that the new retry threshold is after the current one. If
1869 * the retry is in the future, it implies that wake_affine has
1870 * temporarily asked NUMA balancing to back off from placement.
1871 */
1872 if (numa_migrate_retry > p->numa_migrate_retry)
1873 return;
1874
1875 /* Safe to try placing the task on the preferred node */
1876 p->numa_migrate_retry = numa_migrate_retry;
1880 1877
1881 /* Success if task is already running on preferred CPU */ 1878 /* Success if task is already running on preferred CPU */
1882 if (task_node(p) == p->numa_preferred_nid) 1879 if (task_node(p) == p->numa_preferred_nid)
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
2823} 2820}
2824 2821
2825#ifdef CONFIG_FAIR_GROUP_SCHED 2822#ifdef CONFIG_FAIR_GROUP_SCHED
2826# ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2827/* 2824/*
2828 * All this does is approximate the hierarchical proportion which includes that 2825 * All this does is approximate the hierarchical proportion which includes that
2829 * global sum we all love to hate. 2826 * global sum we all love to hate.
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2974 2971
2975 return clamp_t(long, runnable, MIN_SHARES, shares); 2972 return clamp_t(long, runnable, MIN_SHARES, shares);
2976} 2973}
2977# endif /* CONFIG_SMP */ 2974#endif /* CONFIG_SMP */
2978 2975
2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2976static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2980 2977
@@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se)
3012} 3009}
3013#endif /* CONFIG_FAIR_GROUP_SCHED */ 3010#endif /* CONFIG_FAIR_GROUP_SCHED */
3014 3011
3015static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 3012static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3016{ 3013{
3017 struct rq *rq = rq_of(cfs_rq); 3014 struct rq *rq = rq_of(cfs_rq);
3018 3015
3019 if (&rq->cfs == cfs_rq) { 3016 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3020 /* 3017 /*
3021 * There are a few boundary cases this might miss but it should 3018 * There are a few boundary cases this might miss but it should
3022 * get called often enough that that should (hopefully) not be 3019 * get called often enough that that should (hopefully) not be
@@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3031 * 3028 *
3032 * See cpu_util(). 3029 * See cpu_util().
3033 */ 3030 */
3034 cpufreq_update_util(rq, 0); 3031 cpufreq_update_util(rq, flags);
3035 } 3032 }
3036} 3033}
3037 3034
@@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
3246} 3243}
3247 3244
3248/* 3245/*
3246 * When a task is dequeued, its estimated utilization should not be updated if
3247 * its util_avg has not been updated at least once.
3248 * This flag is used to synchronize util_avg updates with util_est updates.
3249 * We map this information into the LSB of the utilization saved at
3250 * dequeue time (i.e. util_est.dequeued).
3251 */
3252#define UTIL_AVG_UNCHANGED 0x1
3253
3254static inline void cfs_se_util_change(struct sched_avg *avg)
3255{
3256 unsigned int enqueued;
3257
3258 if (!sched_feat(UTIL_EST))
3259 return;
3260
3261 /* Avoid the store if the flag has already been set */
3262 enqueued = avg->util_est.enqueued;
3263 if (!(enqueued & UTIL_AVG_UNCHANGED))
3264 return;
3265
3266 /* Reset flag to report util_avg has been updated */
3267 enqueued &= ~UTIL_AVG_UNCHANGED;
3268 WRITE_ONCE(avg->util_est.enqueued, enqueued);
3269}
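The UTIL_AVG_UNCHANGED convention introduced above reuses the least significant bit of util_est.enqueued as a "util_avg not updated since enqueue" marker. A minimal sketch of the bit manipulation, with standalone helpers whose names are invented for illustration:

#define UTIL_AVG_UNCHANGED	0x1U

/* Set at enqueue time: the stored estimate carries the 'unchanged' marker. */
static unsigned int util_est_mark_enqueued(unsigned int est)
{
	return est | UTIL_AVG_UNCHANGED;
}

/* Cleared on the first PELT update after enqueue (cfs_se_util_change() above). */
static unsigned int util_est_mark_updated(unsigned int est)
{
	return est & ~UTIL_AVG_UNCHANGED;
}

/* Dequeue-time test: skip the util_est update if the bit is still set. */
static int util_avg_unchanged(unsigned int est)
{
	return est & UTIL_AVG_UNCHANGED;
}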
3270
3271/*
3249 * sched_entity: 3272 * sched_entity:
3250 * 3273 *
3251 * task: 3274 * task:
@@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
3296 cfs_rq->curr == se)) { 3319 cfs_rq->curr == se)) {
3297 3320
3298 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 3321 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3322 cfs_se_util_change(&se->avg);
3299 return 1; 3323 return 1;
3300 } 3324 }
3301 3325
@@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3350} 3374}
3351 3375
3352/* 3376/*
3353 * Called within set_task_rq() right before setting a task's cpu. The 3377 * Called within set_task_rq() right before setting a task's CPU. The
3354 * caller only guarantees p->pi_lock is held; no other assumptions, 3378 * caller only guarantees p->pi_lock is held; no other assumptions,
3355 * including the state of rq->lock, should be made. 3379 * including the state of rq->lock, should be made.
3356 */ 3380 */
@@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3529 3553
3530 /* 3554 /*
3531 * runnable_sum can't be lower than running_sum 3555 * runnable_sum can't be lower than running_sum
3532 * As running sum is scaled with cpu capacity whereas the runnable sum 3556 * As running sum is scaled with CPU capacity whereas the runnable sum
3533 * is not, we rescale running_sum first 3557 * is not, we rescale running_sum first
3534 */ 3558 */
3535 running_sum = se->avg.util_sum / 3559 running_sum = se->avg.util_sum /
@@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3689#endif 3713#endif
3690 3714
3691 if (decayed) 3715 if (decayed)
3692 cfs_rq_util_change(cfs_rq); 3716 cfs_rq_util_change(cfs_rq, 0);
3693 3717
3694 return decayed; 3718 return decayed;
3695} 3719}
@@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3702 * Must call update_cfs_rq_load_avg() before this, since we rely on 3726 * Must call update_cfs_rq_load_avg() before this, since we rely on
3703 * cfs_rq->avg.last_update_time being current. 3727 * cfs_rq->avg.last_update_time being current.
3704 */ 3728 */
3705static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3729static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3706{ 3730{
3707 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; 3731 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3708 3732
@@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3738 3762
3739 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3763 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3740 3764
3741 cfs_rq_util_change(cfs_rq); 3765 cfs_rq_util_change(cfs_rq, flags);
3742} 3766}
3743 3767
3744/** 3768/**
@@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3757 3781
3758 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3782 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3759 3783
3760 cfs_rq_util_change(cfs_rq); 3784 cfs_rq_util_change(cfs_rq, 0);
3761} 3785}
3762 3786
3763/* 3787/*
@@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3787 3811
3788 if (!se->avg.last_update_time && (flags & DO_ATTACH)) { 3812 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3789 3813
3790 attach_entity_load_avg(cfs_rq, se); 3814 /*
3815 * DO_ATTACH means we're here from enqueue_entity().
3816 * !last_update_time means we've passed through
3817 * migrate_task_rq_fair() indicating we migrated.
3818 *
3819 * IOW we're enqueueing a task on a new CPU.
3820 */
3821 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3791 update_tg_load_avg(cfs_rq, 0); 3822 update_tg_load_avg(cfs_rq, 0);
3792 3823
3793 } else if (decayed && (flags & UPDATE_TG)) 3824 } else if (decayed && (flags & UPDATE_TG))
@@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3869 3900
3870static int idle_balance(struct rq *this_rq, struct rq_flags *rf); 3901static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3871 3902
3903static inline unsigned long task_util(struct task_struct *p)
3904{
3905 return READ_ONCE(p->se.avg.util_avg);
3906}
3907
3908static inline unsigned long _task_util_est(struct task_struct *p)
3909{
3910 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3911
3912 return max(ue.ewma, ue.enqueued);
3913}
3914
3915static inline unsigned long task_util_est(struct task_struct *p)
3916{
3917 return max(task_util(p), _task_util_est(p));
3918}
3919
3920static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3921 struct task_struct *p)
3922{
3923 unsigned int enqueued;
3924
3925 if (!sched_feat(UTIL_EST))
3926 return;
3927
3928 /* Update root cfs_rq's estimated utilization */
3929 enqueued = cfs_rq->avg.util_est.enqueued;
3930 enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3931 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3932}
3933
3934/*
3935 * Check if a (signed) value is within a specified (unsigned) margin,
3936 * based on the observation that:
3937 *
3938 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3939 *
3940 * NOTE: this only works when value + margin < INT_MAX.
3941 */
3942static inline bool within_margin(int value, int margin)
3943{
3944 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3945}
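The within_margin() identity above, abs(x) < y == (unsigned)(x + y - 1) < (2*y - 1), holds for integer x and positive y as long as x + y stays well below INT_MAX. A quick userspace harness (not kernel code) that exercises it against the naive test:

#include <assert.h>
#include <stdlib.h>

static int within(int value, int margin)
{
	/* same single unsigned comparison as within_margin() above */
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	int v;

	for (v = -200; v <= 200; v++)
		assert(within(v, 100) == (abs(v) < 100));
	return 0;
}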
3946
3947static void
3948util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3949{
3950 long last_ewma_diff;
3951 struct util_est ue;
3952
3953 if (!sched_feat(UTIL_EST))
3954 return;
3955
3956 /*
3957 * Update root cfs_rq's estimated utilization
3958 *
3959 * If *p is the last task then the root cfs_rq's estimated utilization
3960 * of a CPU is 0 by definition.
3961 */
3962 ue.enqueued = 0;
3963 if (cfs_rq->nr_running) {
3964 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3965 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3966 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3967 }
3968 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3969
3970 /*
3971 * Skip update of task's estimated utilization when the task has not
3972 * yet completed an activation, e.g. being migrated.
3973 */
3974 if (!task_sleep)
3975 return;
3976
3977 /*
3978 * If the PELT values haven't changed since enqueue time,
3979 * skip the util_est update.
3980 */
3981 ue = p->se.avg.util_est;
3982 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3983 return;
3984
3985 /*
3986 * Skip update of task's estimated utilization when its EWMA is
3987 * already within ~1% of its last activation value.
3988 */
3989 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3990 last_ewma_diff = ue.enqueued - ue.ewma;
3991 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3992 return;
3993
3994 /*
3995 * Update Task's estimated utilization
3996 *
3997 * When *p completes an activation we can consolidate another sample
3998 * of the task size. This is done by storing the current PELT value
3999 * as ue.enqueued and by using this value to update the Exponential
4000 * Weighted Moving Average (EWMA):
4001 *
4002 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4003 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4004 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4005 * = w * ( last_ewma_diff ) + ewma(t-1)
4006 * = w * (last_ewma_diff + ewma(t-1) / w)
4007 *
4008 * Where 'w' is the weight of new samples, which is configured to be
4009 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4010 */
4011 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4012 ue.ewma += last_ewma_diff;
4013 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
4014 WRITE_ONCE(p->se.avg.util_est, ue);
4015}
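The shift sequence at the end of util_est_dequeue() implements ewma' = ewma + (sample - ewma) / 4 without a division, w = 1/4 corresponding to UTIL_EST_WEIGHT_SHIFT. A standalone restatement of that single step (ewma_step() is an illustrative name; the shift value of 2 is assumed, matching the w = 0.25 stated in the comment):

#define WEIGHT_SHIFT	2	/* stands in for UTIL_EST_WEIGHT_SHIFT, w = 1/4 */

static unsigned long ewma_step(unsigned long ewma, unsigned long sample)
{
	long last_ewma_diff = (long)sample - (long)ewma;

	ewma <<= WEIGHT_SHIFT;		/* ewma * 4          */
	ewma  += last_ewma_diff;	/* + (sample - ewma) */
	ewma >>= WEIGHT_SHIFT;		/* / 4               */
	return ewma;
}

Feeding a constant sample repeatedly converges geometrically: starting from 0 with sample 1024, successive steps give 256, 448, 592, ..., approaching 1024.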
4016
3872#else /* CONFIG_SMP */ 4017#else /* CONFIG_SMP */
3873 4018
3874static inline int 4019static inline int
@@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3883 4028
3884static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 4029static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3885{ 4030{
3886 cfs_rq_util_change(cfs_rq); 4031 cfs_rq_util_change(cfs_rq, 0);
3887} 4032}
3888 4033
3889static inline void remove_entity_load_avg(struct sched_entity *se) {} 4034static inline void remove_entity_load_avg(struct sched_entity *se) {}
3890 4035
3891static inline void 4036static inline void
3892attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 4037attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3893static inline void 4038static inline void
3894detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 4039detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3895 4040
@@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3898 return 0; 4043 return 0;
3899} 4044}
3900 4045
4046static inline void
4047util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4048
4049static inline void
4050util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
4051 bool task_sleep) {}
4052
3901#endif /* CONFIG_SMP */ 4053#endif /* CONFIG_SMP */
3902 4054
3903static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 4055static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4676 if (!se) 4828 if (!se)
4677 add_nr_running(rq, task_delta); 4829 add_nr_running(rq, task_delta);
4678 4830
4679 /* determine whether we need to wake up potentially idle cpu */ 4831 /* Determine whether we need to wake up potentially idle CPU: */
4680 if (rq->curr == rq->idle && rq->cfs.nr_running) 4832 if (rq->curr == rq->idle && rq->cfs.nr_running)
4681 resched_curr(rq); 4833 resched_curr(rq);
4682} 4834}
@@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5041} 5193}
5042 5194
5043/* 5195/*
5044 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5196 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5045 * 5197 *
5046 * The race is harmless, since modifying bandwidth settings of unhooked group 5198 * The race is harmless, since modifying bandwidth settings of unhooked group
5047 * bits doesn't do much. 5199 * bits doesn't do much.
@@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5086 */ 5238 */
5087 cfs_rq->runtime_remaining = 1; 5239 cfs_rq->runtime_remaining = 1;
5088 /* 5240 /*
5089 * Offline rq is schedulable till cpu is completely disabled 5241 * Offline rq is schedulable till CPU is completely disabled
5090 * in take_cpu_down(), so we prevent new cfs throttling here. 5242 * in take_cpu_down(), so we prevent new cfs throttling here.
5091 */ 5243 */
5092 cfs_rq->runtime_enabled = 0; 5244 cfs_rq->runtime_enabled = 0;
@@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5245 if (!se) 5397 if (!se)
5246 add_nr_running(rq, 1); 5398 add_nr_running(rq, 1);
5247 5399
5400 util_est_enqueue(&rq->cfs, p);
5248 hrtick_update(rq); 5401 hrtick_update(rq);
5249} 5402}
5250 5403
@@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5304 if (!se) 5457 if (!se)
5305 sub_nr_running(rq, 1); 5458 sub_nr_running(rq, 1);
5306 5459
5460 util_est_dequeue(&rq->cfs, p, task_sleep);
5307 hrtick_update(rq); 5461 hrtick_update(rq);
5308} 5462}
5309 5463
@@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5323 * 5477 *
5324 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5478 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5325 * 5479 *
5326 * If a cpu misses updates for n ticks (as it was idle) and update gets 5480 * If a CPU misses updates for n ticks (as it was idle) and update gets
5327 * called on the n+1-th tick when cpu may be busy, then we have: 5481 * called on the n+1-th tick when CPU may be busy, then we have:
5328 * 5482 *
5329 * load_n = (1 - 1/2^i)^n * load_0 5483 * load_n = (1 - 1/2^i)^n * load_0
5330 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5484 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
@@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5379 } 5533 }
5380 return load; 5534 return load;
5381} 5535}
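
As a quick illustration of the decay formula quoted above, the sketch below models the idle case where cur_load is 0, so each missed tick multiplies the load by (1 - 1/2^i). It is only a model of the equation; the kernel's decay_load_missed() uses precomputed tables, and decay_once() plus the numbers here are invented.

#include <stdio.h>

/* One update with cur_load == 0: load' = (1 - 1/2^i) * load */
static unsigned long decay_once(unsigned long load, int idx)
{
        return load - (load >> idx);
}

int main(void)
{
        unsigned long load = 1024;
        int idx = 2;            /* i = 2 -> multiply by 3/4 per missed tick */

        for (int tick = 1; tick <= 5; tick++) {
                load = decay_once(load, idx);
                printf("after %d missed ticks: load = %lu\n", tick, load);
        }
        return 0;
}
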
5536
5537static struct {
5538 cpumask_var_t idle_cpus_mask;
5539 atomic_t nr_cpus;
5540 int has_blocked; /* Idle CPUs have blocked load */
5541 unsigned long next_balance; /* in jiffy units */
5542 unsigned long next_blocked; /* Next update of blocked load in jiffies */
5543} nohz ____cacheline_aligned;
5544
5382#endif /* CONFIG_NO_HZ_COMMON */ 5545#endif /* CONFIG_NO_HZ_COMMON */
5383 5546
5384/** 5547/**
@@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
5468#ifdef CONFIG_NO_HZ_COMMON 5631#ifdef CONFIG_NO_HZ_COMMON
5469/* 5632/*
5470 * There is no sane way to deal with nohz on smp when using jiffies because the 5633 * There is no sane way to deal with nohz on smp when using jiffies because the
5471 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5634 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5472 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5635 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5473 * 5636 *
5474 * Therefore we need to avoid the delta approach from the regular tick when 5637 * Therefore we need to avoid the delta approach from the regular tick when
@@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq)
5579} 5742}
5580 5743
5581/* 5744/*
5582 * Return a low guess at the load of a migration-source cpu weighted 5745 * Return a low guess at the load of a migration-source CPU weighted
5583 * according to the scheduling class and "nice" value. 5746 * according to the scheduling class and "nice" value.
5584 * 5747 *
5585 * We want to under-estimate the load of migration sources, to 5748 * We want to under-estimate the load of migration sources, to
@@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type)
5597} 5760}
5598 5761
5599/* 5762/*
5600 * Return a high guess at the load of a migration-target cpu weighted 5763 * Return a high guess at the load of a migration-target CPU weighted
5601 * according to the scheduling class and "nice" value. 5764 * according to the scheduling class and "nice" value.
5602 */ 5765 */
5603static unsigned long target_load(int cpu, int type) 5766static unsigned long target_load(int cpu, int type)
@@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5724 unsigned long task_load; 5887 unsigned long task_load;
5725 5888
5726 this_eff_load = target_load(this_cpu, sd->wake_idx); 5889 this_eff_load = target_load(this_cpu, sd->wake_idx);
5727 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5728 5890
5729 if (sync) { 5891 if (sync) {
5730 unsigned long current_load = task_h_load(current); 5892 unsigned long current_load = task_h_load(current);
@@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5742 this_eff_load *= 100; 5904 this_eff_load *= 100;
5743 this_eff_load *= capacity_of(prev_cpu); 5905 this_eff_load *= capacity_of(prev_cpu);
5744 5906
5907 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5745 prev_eff_load -= task_load; 5908 prev_eff_load -= task_load;
5746 if (sched_feat(WA_BIAS)) 5909 if (sched_feat(WA_BIAS))
5747 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5910 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5748 prev_eff_load *= capacity_of(this_cpu); 5911 prev_eff_load *= capacity_of(this_cpu);
5749 5912
5750 return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; 5913 /*
5914 * If sync, adjust the weight of prev_eff_load such that when
5915 * prev_eff == this_eff, select_idle_sibling() will consider
5916 * stacking the wakee on top of the waker if no other CPU is
5917 * idle.
5918 */
5919 if (sync)
5920 prev_eff_load += 1;
5921
5922 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5923}
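
A rough stand-alone model of the comparison above may help: the waking CPU's load is scaled by 100 and the previous CPU's by 100 + (imbalance_pct - 100)/2 (the WA_BIAS bias), each cross-multiplied by the other CPU's capacity, with the sync +1 acting as a tie-break. WA_BIAS is assumed enabled and the task-load add/subtract steps are omitted; this_cpu_wins() and all numbers are purely illustrative.

#include <stdio.h>

/*
 * Return 1 if the waking CPU ("this") should win, 0 to keep prev_cpu.
 * Cross-multiplying by the other side's capacity avoids divisions,
 * like the kernel code; the WA_BIAS-style bias is applied to prev.
 */
static int this_cpu_wins(unsigned long this_load, unsigned long this_cap,
                         unsigned long prev_load, unsigned long prev_cap,
                         unsigned int imbalance_pct, int sync)
{
        unsigned long this_eff = this_load * 100 * prev_cap;
        unsigned long prev_eff = prev_load *
                        (100 + (imbalance_pct - 100) / 2) * this_cap;

        if (sync)               /* break ties toward stacking on the waker */
                prev_eff += 1;

        return this_eff < prev_eff;
}

int main(void)
{
        /* equal capacities, prev a bit busier: the waking CPU wins */
        printf("%d\n", this_cpu_wins(200, 1024, 300, 1024, 117, 0));
        /* waking CPU clearly busier: keep the previous CPU */
        printf("%d\n", this_cpu_wins(500, 1024, 300, 1024, 117, 0));
        /* perfectly balanced sync wakeup: the tie-break picks the waker */
        printf("%d\n", this_cpu_wins(0, 1024, 0, 1024, 117, 1));
        return 0;
}
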
5924
5925#ifdef CONFIG_NUMA_BALANCING
5926static void
5927update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5928{
5929 unsigned long interval;
5930
5931 if (!static_branch_likely(&sched_numa_balancing))
5932 return;
5933
5934 /* If balancing has no preference then continue gathering data */
5935 if (p->numa_preferred_nid == -1)
5936 return;
5937
5938 /*
5939 * If the wakeup is not affecting locality then it is neutral from
5940 * the perspective of NUMA balancing, so continue gathering data.
5941 */
5942 if (cpu_to_node(prev_cpu) == cpu_to_node(target))
5943 return;
5944
5945 /*
5946 * Temporarily prevent NUMA balancing from trying to place waker/wakee after
5947 * wakee has been moved by wake_affine. This will potentially allow
5948 * related tasks to converge and update their data placement. The
5949 * 4 * numa_scan_period is to allow the two-pass filter to migrate
5950 * hot data to the waker's node.
5951 */
5952 interval = max(sysctl_numa_balancing_scan_delay,
5953 p->numa_scan_period << 2);
5954 p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5955
5956 interval = max(sysctl_numa_balancing_scan_delay,
5957 current->numa_scan_period << 2);
5958 current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5751} 5959}
5960#else
5961static void
5962update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5963{
5964}
5965#endif
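
Purely as an illustration of the back-off computed above, here is the same arithmetic in user space with milliseconds standing in for jiffies; the sysctl value, scan period and helper below are made-up numbers, not kernel defaults.

#include <stdio.h>

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

int main(void)
{
        unsigned long scan_delay_ms  = 1000;  /* sysctl_numa_balancing_scan_delay */
        unsigned long scan_period_ms = 250;   /* p->numa_scan_period              */
        unsigned long now_ms         = 50000; /* stand-in for jiffies             */

        /* back off NUMA placement for at least four scan periods */
        unsigned long interval = max_ul(scan_delay_ms, scan_period_ms << 2);

        printf("numa_migrate_retry at t = %lu ms\n", now_ms + interval);
        return 0;
}
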
5752 5966
5753static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5967static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5754 int prev_cpu, int sync) 5968 int this_cpu, int prev_cpu, int sync)
5755{ 5969{
5756 int this_cpu = smp_processor_id();
5757 int target = nr_cpumask_bits; 5970 int target = nr_cpumask_bits;
5758 5971
5759 if (sched_feat(WA_IDLE)) 5972 if (sched_feat(WA_IDLE))
@@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5766 if (target == nr_cpumask_bits) 5979 if (target == nr_cpumask_bits)
5767 return prev_cpu; 5980 return prev_cpu;
5768 5981
5982 update_wa_numa_placement(p, prev_cpu, target);
5769 schedstat_inc(sd->ttwu_move_affine); 5983 schedstat_inc(sd->ttwu_move_affine);
5770 schedstat_inc(p->se.statistics.nr_wakeups_affine); 5984 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5771 return target; 5985 return target;
5772} 5986}
5773 5987
5774static inline unsigned long task_util(struct task_struct *p);
5775static unsigned long cpu_util_wake(int cpu, struct task_struct *p); 5988static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
5776 5989
5777static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) 5990static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5826 max_spare_cap = 0; 6039 max_spare_cap = 0;
5827 6040
5828 for_each_cpu(i, sched_group_span(group)) { 6041 for_each_cpu(i, sched_group_span(group)) {
5829 /* Bias balancing toward cpus of our domain */ 6042 /* Bias balancing toward CPUs of our domain */
5830 if (local_group) 6043 if (local_group)
5831 load = source_load(i, load_idx); 6044 load = source_load(i, load_idx);
5832 else 6045 else
@@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5856 if (min_runnable_load > (runnable_load + imbalance)) { 6069 if (min_runnable_load > (runnable_load + imbalance)) {
5857 /* 6070 /*
5858 * The runnable load is significantly smaller 6071 * The runnable load is significantly smaller
5859 * so we can pick this new cpu 6072 * so we can pick this new CPU:
5860 */ 6073 */
5861 min_runnable_load = runnable_load; 6074 min_runnable_load = runnable_load;
5862 min_avg_load = avg_load; 6075 min_avg_load = avg_load;
@@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5865 (100*min_avg_load > imbalance_scale*avg_load)) { 6078 (100*min_avg_load > imbalance_scale*avg_load)) {
5866 /* 6079 /*
5867 * The runnable loads are close so take the 6080 * The runnable loads are close so take the
5868 * blocked load into account through avg_load. 6081 * blocked load into account through avg_load:
5869 */ 6082 */
5870 min_avg_load = avg_load; 6083 min_avg_load = avg_load;
5871 idlest = group; 6084 idlest = group;
@@ -5903,6 +6116,18 @@ skip_spare:
5903 if (!idlest) 6116 if (!idlest)
5904 return NULL; 6117 return NULL;
5905 6118
6119 /*
6120 * When comparing groups across NUMA domains, it's possible for the
6121 * local domain to be very lightly loaded relative to the remote
6122 * domains, but "imbalance" skews the comparison, making remote CPUs
6123 * look much more favourable. When considering cross-domain, add
6124 * imbalance to the runnable load on the remote node and consider
6125 * staying local.
6126 */
6127 if ((sd->flags & SD_NUMA) &&
6128 min_runnable_load + imbalance >= this_runnable_load)
6129 return NULL;
6130
5906 if (min_runnable_load > (this_runnable_load + imbalance)) 6131 if (min_runnable_load > (this_runnable_load + imbalance))
5907 return NULL; 6132 return NULL;
5908 6133
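
A toy model of the new NUMA check: the local group is kept whenever the remote group's minimum runnable load plus the imbalance allowance is not strictly below the local runnable load. stay_local_numa() and the loads below are invented for illustration.

#include <stdio.h>
#include <stdbool.h>

static bool stay_local_numa(unsigned long remote_min_runnable,
                            unsigned long local_runnable,
                            unsigned long imbalance)
{
        /* mirrors: min_runnable_load + imbalance >= this_runnable_load */
        return remote_min_runnable + imbalance >= local_runnable;
}

int main(void)
{
        /* remote is lighter, but not by more than the imbalance margin */
        printf("stay local: %d\n", stay_local_numa(900, 1000, 200));
        /* remote is much lighter: worth spilling across nodes */
        printf("stay local: %d\n", stay_local_numa(100, 1000, 200));
        return 0;
}
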
@@ -5914,7 +6139,7 @@ skip_spare:
5914} 6139}
5915 6140
5916/* 6141/*
5917 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 6142 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5918 */ 6143 */
5919static int 6144static int
5920find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 6145find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
5992 6217
5993 new_cpu = find_idlest_group_cpu(group, p, cpu); 6218 new_cpu = find_idlest_group_cpu(group, p, cpu);
5994 if (new_cpu == cpu) { 6219 if (new_cpu == cpu) {
5995 /* Now try balancing at a lower domain level of cpu */ 6220 /* Now try balancing at a lower domain level of 'cpu': */
5996 sd = sd->child; 6221 sd = sd->child;
5997 continue; 6222 continue;
5998 } 6223 }
5999 6224
6000 /* Now try balancing at a lower domain level of new_cpu */ 6225 /* Now try balancing at a lower domain level of 'new_cpu': */
6001 cpu = new_cpu; 6226 cpu = new_cpu;
6002 weight = sd->span_weight; 6227 weight = sd->span_weight;
6003 sd = NULL; 6228 sd = NULL;
@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6007 if (tmp->flags & sd_flag) 6232 if (tmp->flags & sd_flag)
6008 sd = tmp; 6233 sd = tmp;
6009 } 6234 }
6010 /* while loop will break here if sd == NULL */
6011 } 6235 }
6012 6236
6013 return new_cpu; 6237 return new_cpu;
@@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6203 return target; 6427 return target;
6204 6428
6205 /* 6429 /*
6206 * If the previous cpu is cache affine and idle, don't be stupid. 6430 * If the previous CPU is cache affine and idle, don't be stupid:
6207 */ 6431 */
6208 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6432 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6209 return prev; 6433 return prev;
6210 6434
6211 /* Check a recently used CPU as a potential idle candidate */ 6435 /* Check a recently used CPU as a potential idle candidate: */
6212 recent_used_cpu = p->recent_used_cpu; 6436 recent_used_cpu = p->recent_used_cpu;
6213 if (recent_used_cpu != prev && 6437 if (recent_used_cpu != prev &&
6214 recent_used_cpu != target && 6438 recent_used_cpu != target &&
@@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6217 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6441 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6218 /* 6442 /*
6219 * Replace recent_used_cpu with prev as it is a potential 6443 * Replace recent_used_cpu with prev as it is a potential
6220 * candidate for the next wake. 6444 * candidate for the next wake:
6221 */ 6445 */
6222 p->recent_used_cpu = prev; 6446 p->recent_used_cpu = prev;
6223 return recent_used_cpu; 6447 return recent_used_cpu;
@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6242 return target; 6466 return target;
6243} 6467}
6244 6468
6245/* 6469/**
6246 * cpu_util returns the amount of capacity of a CPU that is used by CFS 6470 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
6247 * tasks. The unit of the return value must be the one of capacity so we can 6471 * @cpu: the CPU to get the utilization of
6248 * compare the utilization with the capacity of the CPU that is available for 6472 *
6249 * CFS task (ie cpu_capacity). 6473 * The unit of the return value must be the one of capacity so we can compare
6474 * the utilization with the capacity of the CPU that is available for CFS task
6475 * (ie cpu_capacity).
6250 * 6476 *
6251 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the 6477 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6252 * recent utilization of currently non-runnable tasks on a CPU. It represents 6478 * recent utilization of currently non-runnable tasks on a CPU. It represents
@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6257 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is 6483 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6258 * the running time on this CPU scaled by capacity_curr. 6484 * the running time on this CPU scaled by capacity_curr.
6259 * 6485 *
6486 * The estimated utilization of a CPU is defined to be the maximum between its
6487 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6488 * currently RUNNABLE on that CPU.
6489 * This allows us to properly represent the expected utilization of a CPU which
6490 * has just picked up a big task after a long sleep period. At the same time,
6491 * however, it preserves the benefits of the "blocked utilization" in
6492 * describing the potential for other tasks waking up on the same CPU.
6493 *
6260 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even 6494 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6261 * higher than capacity_orig because of unfortunate rounding in 6495 * higher than capacity_orig because of unfortunate rounding in
6262 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until 6496 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6267 * available capacity. We allow utilization to overshoot capacity_curr (but not 6501 * available capacity. We allow utilization to overshoot capacity_curr (but not
6268 * capacity_orig) as it useful for predicting the capacity required after task 6502 * capacity_orig) as it useful for predicting the capacity required after task
6269 * migrations (scheduler-driven DVFS). 6503 * migrations (scheduler-driven DVFS).
6504 *
6505 * Return: the (estimated) utilization for the specified CPU
6270 */ 6506 */
6271static unsigned long cpu_util(int cpu) 6507static inline unsigned long cpu_util(int cpu)
6272{ 6508{
6273 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; 6509 struct cfs_rq *cfs_rq;
6274 unsigned long capacity = capacity_orig_of(cpu); 6510 unsigned int util;
6275 6511
6276 return (util >= capacity) ? capacity : util; 6512 cfs_rq = &cpu_rq(cpu)->cfs;
6277} 6513 util = READ_ONCE(cfs_rq->avg.util_avg);
6278 6514
6279static inline unsigned long task_util(struct task_struct *p) 6515 if (sched_feat(UTIL_EST))
6280{ 6516 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6281 return p->se.avg.util_avg; 6517
6518 return min_t(unsigned long, util, capacity_orig_of(cpu));
6282} 6519}
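
The estimated-utilization read above boils down to max(util_avg, util_est.enqueued) clamped to the CPU's original capacity. A compact user-space sketch follows; cpu_util_model() and the values are illustrative only.

#include <stdio.h>

static unsigned long cpu_util_model(unsigned long util_avg,
                                    unsigned long util_est_enqueued,
                                    unsigned long capacity_orig)
{
        unsigned long util = util_avg;

        /* UTIL_EST: trust the enqueued estimate when it is higher */
        if (util_est_enqueued > util)
                util = util_est_enqueued;

        /* never report more than the CPU can provide */
        return util < capacity_orig ? util : capacity_orig;
}

int main(void)
{
        /* big task just woken: util_avg still low, util_est already high */
        printf("%lu\n", cpu_util_model(120, 600, 1024));
        /* decayed blocked utilization dominates */
        printf("%lu\n", cpu_util_model(700, 300, 1024));
        return 0;
}
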
6283 6520
6284/* 6521/*
6285 * cpu_util_wake: Compute cpu utilization with any contributions from 6522 * cpu_util_wake: Compute CPU utilization with any contributions from
6286 * the waking task p removed. 6523 * the waking task p removed.
6287 */ 6524 */
6288static unsigned long cpu_util_wake(int cpu, struct task_struct *p) 6525static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
6289{ 6526{
6290 unsigned long util, capacity; 6527 struct cfs_rq *cfs_rq;
6528 unsigned int util;
6291 6529
6292 /* Task has no contribution or is new */ 6530 /* Task has no contribution or is new */
6293 if (cpu != task_cpu(p) || !p->se.avg.last_update_time) 6531 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6294 return cpu_util(cpu); 6532 return cpu_util(cpu);
6295 6533
6296 capacity = capacity_orig_of(cpu); 6534 cfs_rq = &cpu_rq(cpu)->cfs;
6297 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); 6535 util = READ_ONCE(cfs_rq->avg.util_avg);
6298 6536
6299 return (util >= capacity) ? capacity : util; 6537 /* Discount task's blocked util from CPU's util */
6538 util -= min_t(unsigned int, util, task_util(p));
6539
6540 /*
6541 * Covered cases:
6542 *
6543 * a) if *p is the only task sleeping on this CPU, then:
6544 * cpu_util (== task_util) > util_est (== 0)
6545 * and thus we return:
6546 * cpu_util_wake = (cpu_util - task_util) = 0
6547 *
6548 * b) if other tasks are SLEEPING on this CPU, which is now exiting
6549 * IDLE, then:
6550 * cpu_util >= task_util
6551 * cpu_util > util_est (== 0)
6552 * and thus we discount *p's blocked utilization to return:
6553 * cpu_util_wake = (cpu_util - task_util) >= 0
6554 *
6555 * c) if other tasks are RUNNABLE on that CPU and
6556 * util_est > cpu_util
6557 * then we use util_est since it returns a more restrictive
6558 * estimation of the spare capacity on that CPU, by just
6559 * considering the expected utilization of tasks already
6560 * runnable on that CPU.
6561 *
6562 * Cases a) and b) are covered by the above code, while case c) is
6563 * covered by the following code when estimated utilization is
6564 * enabled.
6565 */
6566 if (sched_feat(UTIL_EST))
6567 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6568
6569 /*
6570 * Utilization (estimated) can exceed the CPU capacity, thus let's
6571 * clamp to the maximum CPU capacity to ensure consistency with
6572 * the cpu_util call.
6573 */
6574 return min_t(unsigned long, util, capacity_orig_of(cpu));
6300} 6575}
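
The three cases in the comment can be exercised with a small model: discount the waking task's utilization from util_avg, then, with UTIL_EST, take the max with the enqueued estimate and clamp. cpu_util_wake_model() and the numbers below are invented for illustration.

#include <stdio.h>

static unsigned long cpu_util_wake_model(unsigned long util_avg,
                                         unsigned long util_est_enqueued,
                                         unsigned long task_util,
                                         unsigned long capacity_orig)
{
        /* discount the waking task's blocked contribution */
        unsigned long util = util_avg -
                        (task_util < util_avg ? task_util : util_avg);

        /* case c): runnable tasks may make util_est more restrictive */
        if (util_est_enqueued > util)
                util = util_est_enqueued;

        return util < capacity_orig ? util : capacity_orig;
}

int main(void)
{
        /* a) *p was the only sleeper: everything is discounted */
        printf("%lu\n", cpu_util_wake_model(300, 0, 300, 1024));
        /* b) other sleepers remain: only *p's share is removed */
        printf("%lu\n", cpu_util_wake_model(500, 0, 300, 1024));
        /* c) runnable tasks present: util_est wins */
        printf("%lu\n", cpu_util_wake_model(400, 550, 300, 1024));
        return 0;
}
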
6301 6576
6302/* 6577/*
@@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6328 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6603 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6329 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6604 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6330 * 6605 *
6331 * Balances load by selecting the idlest cpu in the idlest group, or under 6606 * Balances load by selecting the idlest CPU in the idlest group, or under
6332 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6607 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6333 * 6608 *
6334 * Returns the target cpu number. 6609 * Returns the target CPU number.
6335 * 6610 *
6336 * preempt must be disabled. 6611 * preempt must be disabled.
6337 */ 6612 */
@@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6342 int cpu = smp_processor_id(); 6617 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu; 6618 int new_cpu = prev_cpu;
6344 int want_affine = 0; 6619 int want_affine = 0;
6345 int sync = wake_flags & WF_SYNC; 6620 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6346 6621
6347 if (sd_flag & SD_BALANCE_WAKE) { 6622 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p); 6623 record_wakee(p);
@@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6356 break; 6631 break;
6357 6632
6358 /* 6633 /*
6359 * If both cpu and prev_cpu are part of this domain, 6634 * If both 'cpu' and 'prev_cpu' are part of this domain,
6360 * cpu is a valid SD_WAKE_AFFINE target. 6635 * cpu is a valid SD_WAKE_AFFINE target.
6361 */ 6636 */
6362 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6637 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6376 if (cpu == prev_cpu) 6651 if (cpu == prev_cpu)
6377 goto pick_cpu; 6652 goto pick_cpu;
6378 6653
6379 new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); 6654 new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
6380 } 6655 }
6381 6656
6382 if (sd && !(sd_flag & SD_BALANCE_FORK)) { 6657 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6407,9 +6682,9 @@ pick_cpu:
6407static void detach_entity_cfs_rq(struct sched_entity *se); 6682static void detach_entity_cfs_rq(struct sched_entity *se);
6408 6683
6409/* 6684/*
6410 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6685 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6411 * cfs_rq_of(p) references at time of call are still valid and identify the 6686 * cfs_rq_of(p) references at time of call are still valid and identify the
6412 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6687 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6413 */ 6688 */
6414static void migrate_task_rq_fair(struct task_struct *p) 6689static void migrate_task_rq_fair(struct task_struct *p)
6415{ 6690{
@@ -6738,7 +7013,7 @@ simple:
6738 7013
6739 p = task_of(se); 7014 p = task_of(se);
6740 7015
6741done: __maybe_unused 7016done: __maybe_unused;
6742#ifdef CONFIG_SMP 7017#ifdef CONFIG_SMP
6743 /* 7018 /*
6744 * Move the next running task to the front of 7019 * Move the next running task to the front of
@@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6843 * BASICS 7118 * BASICS
6844 * 7119 *
6845 * The purpose of load-balancing is to achieve the same basic fairness the 7120 * The purpose of load-balancing is to achieve the same basic fairness the
6846 * per-cpu scheduler provides, namely provide a proportional amount of compute 7121 * per-CPU scheduler provides, namely provide a proportional amount of compute
6847 * time to each task. This is expressed in the following equation: 7122 * time to each task. This is expressed in the following equation:
6848 * 7123 *
6849 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 7124 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6850 * 7125 *
6851 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 7126 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
6852 * W_i,0 is defined as: 7127 * W_i,0 is defined as:
6853 * 7128 *
6854 * W_i,0 = \Sum_j w_i,j (2) 7129 * W_i,0 = \Sum_j w_i,j (2)
6855 * 7130 *
6856 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 7131 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
6857 * is derived from the nice value as per sched_prio_to_weight[]. 7132 * is derived from the nice value as per sched_prio_to_weight[].
6858 * 7133 *
6859 * The weight average is an exponential decay average of the instantaneous 7134 * The weight average is an exponential decay average of the instantaneous
@@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6861 * 7136 *
6862 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 7137 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6863 * 7138 *
6864 * C_i is the compute capacity of cpu i, typically it is the 7139 * C_i is the compute capacity of CPU i, typically it is the
6865 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 7140 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6866 * can also include other factors [XXX]. 7141 * can also include other factors [XXX].
6867 * 7142 *
@@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6882 * SCHED DOMAINS 7157 * SCHED DOMAINS
6883 * 7158 *
6884 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 7159 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6885 * for all i,j solution, we create a tree of cpus that follows the hardware 7160 * for all i,j solution, we create a tree of CPUs that follows the hardware
6886 * topology where each level pairs two lower groups (or better). This results 7161 * topology where each level pairs two lower groups (or better). This results
6887 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 7162 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
6888 * tree to only the first of the previous level and we decrease the frequency 7163 * tree to only the first of the previous level and we decrease the frequency
6889 * of load-balance at each level inv. proportional to the number of cpus in 7164 * of load-balance at each level inv. proportional to the number of CPUs in
6890 * the groups. 7165 * the groups.
6891 * 7166 *
6892 * This yields: 7167 * This yields:
@@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6895 * \Sum { --- * --- * 2^i } = O(n) (5) 7170 * \Sum { --- * --- * 2^i } = O(n) (5)
6896 * i = 0 2^i 2^i 7171 * i = 0 2^i 2^i
6897 * `- size of each group 7172 * `- size of each group
6898 * | | `- number of cpus doing load-balance 7173 * | | `- number of CPUs doing load-balance
6899 * | `- freq 7174 * | `- freq
6900 * `- sum over all levels 7175 * `- sum over all levels
6901 * 7176 *
@@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6903 * this makes (5) the runtime complexity of the balancer. 7178 * this makes (5) the runtime complexity of the balancer.
6904 * 7179 *
6905 * An important property here is that each CPU is still (indirectly) connected 7180 * An important property here is that each CPU is still (indirectly) connected
6906 * to every other cpu in at most O(log n) steps: 7181 * to every other CPU in at most O(log n) steps:
6907 * 7182 *
6908 * The adjacency matrix of the resulting graph is given by: 7183 * The adjacency matrix of the resulting graph is given by:
6909 * 7184 *
@@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6915 * 7190 *
6916 * A^(log_2 n)_i,j != 0 for all i,j (7) 7191 * A^(log_2 n)_i,j != 0 for all i,j (7)
6917 * 7192 *
6918 * Showing there's indeed a path between every cpu in at most O(log n) steps. 7193 * Showing there's indeed a path between every CPU in at most O(log n) steps.
6919 * The task movement gives a factor of O(m), giving a convergence complexity 7194 * The task movement gives a factor of O(m), giving a convergence complexity
6920 * of: 7195 * of:
6921 * 7196 *
@@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6925 * WORK CONSERVING 7200 * WORK CONSERVING
6926 * 7201 *
6927 * In order to avoid CPUs going idle while there's still work to do, new idle 7202 * In order to avoid CPUs going idle while there's still work to do, new idle
6928 * balancing is more aggressive and has the newly idle cpu iterate up the domain 7203 * balancing is more aggressive and has the newly idle CPU iterate up the domain
6929 * tree itself instead of relying on other CPUs to bring it work. 7204 * tree itself instead of relying on other CPUs to bring it work.
6930 * 7205 *
6931 * This adds some complexity to both (5) and (8) but it reduces the total idle 7206 * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6946 * 7221 *
6947 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 7222 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6948 * 7223 *
6949 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 7224 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
6950 * 7225 *
6951 * The big problem is S_k, its a global sum needed to compute a local (W_i) 7226 * The big problem is S_k, its a global sum needed to compute a local (W_i)
6952 * property. 7227 * property.
@@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all };
6963#define LBF_NEED_BREAK 0x02 7238#define LBF_NEED_BREAK 0x02
6964#define LBF_DST_PINNED 0x04 7239#define LBF_DST_PINNED 0x04
6965#define LBF_SOME_PINNED 0x08 7240#define LBF_SOME_PINNED 0x08
7241#define LBF_NOHZ_STATS 0x10
7242#define LBF_NOHZ_AGAIN 0x20
6966 7243
6967struct lb_env { 7244struct lb_env {
6968 struct sched_domain *sd; 7245 struct sched_domain *sd;
@@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7110 env->flags |= LBF_SOME_PINNED; 7387 env->flags |= LBF_SOME_PINNED;
7111 7388
7112 /* 7389 /*
7113 * Remember if this task can be migrated to any other cpu in 7390 * Remember if this task can be migrated to any other CPU in
7114 * our sched_group. We may want to revisit it if we couldn't 7391 * our sched_group. We may want to revisit it if we couldn't
7115 * meet load balance goals by pulling other tasks on src_cpu. 7392 * meet load balance goals by pulling other tasks on src_cpu.
7116 * 7393 *
@@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7120 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7397 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7121 return 0; 7398 return 0;
7122 7399
7123 /* Prevent to re-select dst_cpu via env's cpus */ 7400 /* Prevent to re-select dst_cpu via env's CPUs: */
7124 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7401 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7125 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7402 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7126 env->flags |= LBF_DST_PINNED; 7403 env->flags |= LBF_DST_PINNED;
@@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env)
7347 rq_unlock(env->dst_rq, &rf); 7624 rq_unlock(env->dst_rq, &rf);
7348} 7625}
7349 7626
7627static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7628{
7629 if (cfs_rq->avg.load_avg)
7630 return true;
7631
7632 if (cfs_rq->avg.util_avg)
7633 return true;
7634
7635 return false;
7636}
7637
7350#ifdef CONFIG_FAIR_GROUP_SCHED 7638#ifdef CONFIG_FAIR_GROUP_SCHED
7351 7639
7352static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7640static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu)
7371 struct rq *rq = cpu_rq(cpu); 7659 struct rq *rq = cpu_rq(cpu);
7372 struct cfs_rq *cfs_rq, *pos; 7660 struct cfs_rq *cfs_rq, *pos;
7373 struct rq_flags rf; 7661 struct rq_flags rf;
7662 bool done = true;
7374 7663
7375 rq_lock_irqsave(rq, &rf); 7664 rq_lock_irqsave(rq, &rf);
7376 update_rq_clock(rq); 7665 update_rq_clock(rq);
@@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu)
7400 */ 7689 */
7401 if (cfs_rq_is_decayed(cfs_rq)) 7690 if (cfs_rq_is_decayed(cfs_rq))
7402 list_del_leaf_cfs_rq(cfs_rq); 7691 list_del_leaf_cfs_rq(cfs_rq);
7692
7693 /* Don't need periodic decay once load/util_avg are zero */
7694 if (cfs_rq_has_blocked(cfs_rq))
7695 done = false;
7403 } 7696 }
7697
7698#ifdef CONFIG_NO_HZ_COMMON
7699 rq->last_blocked_load_update_tick = jiffies;
7700 if (done)
7701 rq->has_blocked_load = 0;
7702#endif
7404 rq_unlock_irqrestore(rq, &rf); 7703 rq_unlock_irqrestore(rq, &rf);
7405} 7704}
7406 7705
@@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu)
7460 rq_lock_irqsave(rq, &rf); 7759 rq_lock_irqsave(rq, &rf);
7461 update_rq_clock(rq); 7760 update_rq_clock(rq);
7462 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7761 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7762#ifdef CONFIG_NO_HZ_COMMON
7763 rq->last_blocked_load_update_tick = jiffies;
7764 if (!cfs_rq_has_blocked(cfs_rq))
7765 rq->has_blocked_load = 0;
7766#endif
7463 rq_unlock_irqrestore(rq, &rf); 7767 rq_unlock_irqrestore(rq, &rf);
7464} 7768}
7465 7769
@@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7694 * Group imbalance indicates (and tries to solve) the problem where balancing 7998 * Group imbalance indicates (and tries to solve) the problem where balancing
7695 * groups is inadequate due to ->cpus_allowed constraints. 7999 * groups is inadequate due to ->cpus_allowed constraints.
7696 * 8000 *
7697 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 8001 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7698 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 8002 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7699 * Something like: 8003 * Something like:
7700 * 8004 *
7701 * { 0 1 2 3 } { 4 5 6 7 } 8005 * { 0 1 2 3 } { 4 5 6 7 }
@@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7703 * 8007 *
7704 * If we were to balance group-wise we'd place two tasks in the first group and 8008 * If we were to balance group-wise we'd place two tasks in the first group and
7705 * two tasks in the second group. Clearly this is undesired as it will overload 8009 * two tasks in the second group. Clearly this is undesired as it will overload
7706 * cpu 3 and leave one of the cpus in the second group unused. 8010 * cpu 3 and leave one of the CPUs in the second group unused.
7707 * 8011 *
7708 * The current solution to this issue is detecting the skew in the first group 8012 * The current solution to this issue is detecting the skew in the first group
7709 * by noticing the lower domain failed to reach balance and had difficulty 8013 * by noticing the lower domain failed to reach balance and had difficulty
@@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group,
7794 return group_other; 8098 return group_other;
7795} 8099}
7796 8100
8101static bool update_nohz_stats(struct rq *rq, bool force)
8102{
8103#ifdef CONFIG_NO_HZ_COMMON
8104 unsigned int cpu = rq->cpu;
8105
8106 if (!rq->has_blocked_load)
8107 return false;
8108
8109 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8110 return false;
8111
8112 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8113 return true;
8114
8115 update_blocked_averages(cpu);
8116
8117 return rq->has_blocked_load;
8118#else
8119 return false;
8120#endif
8121}
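
The early return above is a jiffies-based rate limit built on the kernel's wrap-safe time_after(). A minimal user-space rendition of the pattern is sketched below; struct fake_rq, refresh_blocked() and the tick values are stand-ins, not kernel code.

#include <stdio.h>
#include <stdbool.h>

/* wrap-safe "a is after b", in the style of the kernel's time_after() */
static bool time_after_ul(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;
}

struct fake_rq {
        bool has_blocked_load;
        unsigned long last_update_tick;
};

static bool refresh_blocked(struct fake_rq *rq, unsigned long now, bool force)
{
        if (!rq->has_blocked_load)
                return false;

        /* already refreshed during this tick: report "still blocked" */
        if (!force && !time_after_ul(now, rq->last_update_tick))
                return true;

        /* pretend the update fully decayed the blocked load */
        rq->last_update_tick = now;
        rq->has_blocked_load = false;
        return rq->has_blocked_load;
}

int main(void)
{
        struct fake_rq rq = { .has_blocked_load = true, .last_update_tick = 100 };

        printf("%d\n", refresh_blocked(&rq, 100, false)); /* rate-limited */
        printf("%d\n", refresh_blocked(&rq, 101, false)); /* refreshed    */
        return 0;
}
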
8122
7797/** 8123/**
7798 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 8124 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
7799 * @env: The load balancing environment. 8125 * @env: The load balancing environment.
@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7816 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8142 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7817 struct rq *rq = cpu_rq(i); 8143 struct rq *rq = cpu_rq(i);
7818 8144
7819 /* Bias balancing toward cpus of our domain */ 8145 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8146 env->flags |= LBF_NOHZ_AGAIN;
8147
8148 /* Bias balancing toward CPUs of our domain: */
7820 if (local_group) 8149 if (local_group)
7821 load = target_load(i, load_idx); 8150 load = target_load(i, load_idx);
7822 else 8151 else
@@ -7902,7 +8231,7 @@ asym_packing:
7902 if (!(env->sd->flags & SD_ASYM_PACKING)) 8231 if (!(env->sd->flags & SD_ASYM_PACKING))
7903 return true; 8232 return true;
7904 8233
7905 /* No ASYM_PACKING if target cpu is already busy */ 8234 /* No ASYM_PACKING if target CPU is already busy */
7906 if (env->idle == CPU_NOT_IDLE) 8235 if (env->idle == CPU_NOT_IDLE)
7907 return true; 8236 return true;
7908 /* 8237 /*
@@ -7915,7 +8244,7 @@ asym_packing:
7915 if (!sds->busiest) 8244 if (!sds->busiest)
7916 return true; 8245 return true;
7917 8246
7918 /* Prefer to move from lowest priority cpu's work */ 8247 /* Prefer to move from lowest priority CPU's work */
7919 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 8248 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7920 sg->asym_prefer_cpu)) 8249 sg->asym_prefer_cpu))
7921 return true; 8250 return true;
@@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
7971 if (child && child->flags & SD_PREFER_SIBLING) 8300 if (child && child->flags & SD_PREFER_SIBLING)
7972 prefer_sibling = 1; 8301 prefer_sibling = 1;
7973 8302
8303#ifdef CONFIG_NO_HZ_COMMON
8304 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8305 env->flags |= LBF_NOHZ_STATS;
8306#endif
8307
7974 load_idx = get_sd_load_idx(env->sd, env->idle); 8308 load_idx = get_sd_load_idx(env->sd, env->idle);
7975 8309
7976 do { 8310 do {
@@ -8024,6 +8358,15 @@ next_group:
8024 sg = sg->next; 8358 sg = sg->next;
8025 } while (sg != env->sd->groups); 8359 } while (sg != env->sd->groups);
8026 8360
8361#ifdef CONFIG_NO_HZ_COMMON
8362 if ((env->flags & LBF_NOHZ_AGAIN) &&
8363 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8364
8365 WRITE_ONCE(nohz.next_blocked,
8366 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8367 }
8368#endif
8369
8027 if (env->sd->flags & SD_NUMA) 8370 if (env->sd->flags & SD_NUMA)
8028 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 8371 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8029 8372
@@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8168 if (busiest->group_type == group_imbalanced) { 8511 if (busiest->group_type == group_imbalanced) {
8169 /* 8512 /*
8170 * In the group_imb case we cannot rely on group-wide averages 8513 * In the group_imb case we cannot rely on group-wide averages
8171 * to ensure cpu-load equilibrium, look at wider averages. XXX 8514 * to ensure CPU-load equilibrium, look at wider averages. XXX
8172 */ 8515 */
8173 busiest->load_per_task = 8516 busiest->load_per_task =
8174 min(busiest->load_per_task, sds->avg_load); 8517 min(busiest->load_per_task, sds->avg_load);
@@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8187 } 8530 }
8188 8531
8189 /* 8532 /*
8190 * If there aren't any idle cpus, avoid creating some. 8533 * If there aren't any idle CPUs, avoid creating some.
8191 */ 8534 */
8192 if (busiest->group_type == group_overloaded && 8535 if (busiest->group_type == group_overloaded &&
8193 local->group_type == group_overloaded) { 8536 local->group_type == group_overloaded) {
@@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8201 } 8544 }
8202 8545
8203 /* 8546 /*
8204 * We're trying to get all the cpus to the average_load, so we don't 8547 * We're trying to get all the CPUs to the average_load, so we don't
8205 * want to push ourselves above the average load, nor do we wish to 8548 * want to push ourselves above the average load, nor do we wish to
8206 * reduce the max loaded cpu below the average load. At the same time, 8549 * reduce the max loaded CPU below the average load. At the same time,
8207 * we also don't want to reduce the group load below the group 8550 * we also don't want to reduce the group load below the group
8208 * capacity. Thus we look for the minimum possible imbalance. 8551 * capacity. Thus we look for the minimum possible imbalance.
8209 */ 8552 */
@@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8297 8640
8298 if (env->idle == CPU_IDLE) { 8641 if (env->idle == CPU_IDLE) {
8299 /* 8642 /*
8300 * This cpu is idle. If the busiest group is not overloaded 8643 * This CPU is idle. If the busiest group is not overloaded
8301 * and there is no imbalance between this and busiest group 8644 * and there is no imbalance between this and busiest group
8302 * wrt idle cpus, it is balanced. The imbalance becomes 8645 * wrt idle CPUs, it is balanced. The imbalance becomes
8303 * significant if the diff is greater than 1 otherwise we 8646 * significant if the diff is greater than 1 otherwise we
8304 * might end up to just move the imbalance on another group 8647 * might end up to just move the imbalance on another group
8305 */ 8648 */
@@ -8327,7 +8670,7 @@ out_balanced:
8327} 8670}
8328 8671
8329/* 8672/*
8330 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8673 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8331 */ 8674 */
8332static struct rq *find_busiest_queue(struct lb_env *env, 8675static struct rq *find_busiest_queue(struct lb_env *env,
8333 struct sched_group *group) 8676 struct sched_group *group)
@@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8371 8714
8372 /* 8715 /*
8373 * When comparing with imbalance, use weighted_cpuload() 8716 * When comparing with imbalance, use weighted_cpuload()
8374 * which is not scaled with the cpu capacity. 8717 * which is not scaled with the CPU capacity.
8375 */ 8718 */
8376 8719
8377 if (rq->nr_running == 1 && wl > env->imbalance && 8720 if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8379 continue; 8722 continue;
8380 8723
8381 /* 8724 /*
8382 * For the load comparisons with the other cpu's, consider 8725 * For the load comparisons with the other CPU's, consider
8383 * the weighted_cpuload() scaled with the cpu capacity, so 8726 * the weighted_cpuload() scaled with the CPU capacity, so
8384 * that the load can be moved away from the cpu that is 8727 * that the load can be moved away from the CPU that is
8385 * potentially running at a lower capacity. 8728 * potentially running at a lower capacity.
8386 * 8729 *
8387 * Thus we're looking for max(wl_i / capacity_i), crosswise 8730 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env)
8452 return 0; 8795 return 0;
8453 8796
8454 /* 8797 /*
8455 * In the newly idle case, we will allow all the cpu's 8798 * In the newly idle case, we will allow all the CPUs
8456 * to do the newly idle load balance. 8799 * to do the newly idle load balance.
8457 */ 8800 */
8458 if (env->idle == CPU_NEWLY_IDLE) 8801 if (env->idle == CPU_NEWLY_IDLE)
8459 return 1; 8802 return 1;
8460 8803
8461 /* Try to find first idle cpu */ 8804 /* Try to find first idle CPU */
8462 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8805 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8463 if (!idle_cpu(cpu)) 8806 if (!idle_cpu(cpu))
8464 continue; 8807 continue;
@@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env)
8471 balance_cpu = group_balance_cpu(sg); 8814 balance_cpu = group_balance_cpu(sg);
8472 8815
8473 /* 8816 /*
8474 * First idle cpu or the first cpu(busiest) in this sched group 8817 * First idle CPU or the first CPU(busiest) in this sched group
8475 * is eligible for doing load balancing at this and above domains. 8818 * is eligible for doing load balancing at this and above domains.
8476 */ 8819 */
8477 return balance_cpu == env->dst_cpu; 8820 return balance_cpu == env->dst_cpu;
@@ -8580,7 +8923,7 @@ more_balance:
8580 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8923 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8581 * us and move them to an alternate dst_cpu in our sched_group 8924 * us and move them to an alternate dst_cpu in our sched_group
8582 * where they can run. The upper limit on how many times we 8925 * where they can run. The upper limit on how many times we
8583 * iterate on same src_cpu is dependent on number of cpus in our 8926 * iterate on same src_cpu is dependent on number of CPUs in our
8584 * sched_group. 8927 * sched_group.
8585 * 8928 *
8586 * This changes load balance semantics a bit on who can move 8929 * This changes load balance semantics a bit on who can move
@@ -8597,7 +8940,7 @@ more_balance:
8597 */ 8940 */
8598 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8941 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8599 8942
8600 /* Prevent to re-select dst_cpu via env's cpus */ 8943 /* Prevent to re-select dst_cpu via env's CPUs */
8601 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8944 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8602 8945
8603 env.dst_rq = cpu_rq(env.new_dst_cpu); 8946 env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8659,9 +9002,10 @@ more_balance:
8659 9002
8660 raw_spin_lock_irqsave(&busiest->lock, flags); 9003 raw_spin_lock_irqsave(&busiest->lock, flags);
8661 9004
8662 /* don't kick the active_load_balance_cpu_stop, 9005 /*
8663 * if the curr task on busiest cpu can't be 9006 * Don't kick the active_load_balance_cpu_stop,
8664 * moved to this_cpu 9007 * if the curr task on busiest CPU can't be
9008 * moved to this_cpu:
8665 */ 9009 */
8666 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 9010 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8667 raw_spin_unlock_irqrestore(&busiest->lock, 9011 raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
8773} 9117}
8774 9118
8775/* 9119/*
8776 * idle_balance is called by schedule() if this_cpu is about to become 9120 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
8777 * idle. Attempts to pull tasks from other CPUs.
8778 */
8779static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
8780{
8781 unsigned long next_balance = jiffies + HZ;
8782 int this_cpu = this_rq->cpu;
8783 struct sched_domain *sd;
8784 int pulled_task = 0;
8785 u64 curr_cost = 0;
8786
8787 /*
8788 * We must set idle_stamp _before_ calling idle_balance(), such that we
8789 * measure the duration of idle_balance() as idle time.
8790 */
8791 this_rq->idle_stamp = rq_clock(this_rq);
8792
8793 /*
8794 * Do not pull tasks towards !active CPUs...
8795 */
8796 if (!cpu_active(this_cpu))
8797 return 0;
8798
8799 /*
8800 * This is OK, because current is on_cpu, which avoids it being picked
8801 * for load-balance and preemption/IRQs are still disabled avoiding
8802 * further scheduler activity on it and we're being very careful to
8803 * re-start the picking loop.
8804 */
8805 rq_unpin_lock(this_rq, rf);
8806
8807 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
8808 !this_rq->rd->overload) {
8809 rcu_read_lock();
8810 sd = rcu_dereference_check_sched_domain(this_rq->sd);
8811 if (sd)
8812 update_next_balance(sd, &next_balance);
8813 rcu_read_unlock();
8814
8815 goto out;
8816 }
8817
8818 raw_spin_unlock(&this_rq->lock);
8819
8820 update_blocked_averages(this_cpu);
8821 rcu_read_lock();
8822 for_each_domain(this_cpu, sd) {
8823 int continue_balancing = 1;
8824 u64 t0, domain_cost;
8825
8826 if (!(sd->flags & SD_LOAD_BALANCE))
8827 continue;
8828
8829 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
8830 update_next_balance(sd, &next_balance);
8831 break;
8832 }
8833
8834 if (sd->flags & SD_BALANCE_NEWIDLE) {
8835 t0 = sched_clock_cpu(this_cpu);
8836
8837 pulled_task = load_balance(this_cpu, this_rq,
8838 sd, CPU_NEWLY_IDLE,
8839 &continue_balancing);
8840
8841 domain_cost = sched_clock_cpu(this_cpu) - t0;
8842 if (domain_cost > sd->max_newidle_lb_cost)
8843 sd->max_newidle_lb_cost = domain_cost;
8844
8845 curr_cost += domain_cost;
8846 }
8847
8848 update_next_balance(sd, &next_balance);
8849
8850 /*
8851 * Stop searching for tasks to pull if there are
8852 * now runnable tasks on this rq.
8853 */
8854 if (pulled_task || this_rq->nr_running > 0)
8855 break;
8856 }
8857 rcu_read_unlock();
8858
8859 raw_spin_lock(&this_rq->lock);
8860
8861 if (curr_cost > this_rq->max_idle_balance_cost)
8862 this_rq->max_idle_balance_cost = curr_cost;
8863
8864 /*
8865 * While browsing the domains, we released the rq lock, a task could
8866 * have been enqueued in the meantime. Since we're not going idle,
8867 * pretend we pulled a task.
8868 */
8869 if (this_rq->cfs.h_nr_running && !pulled_task)
8870 pulled_task = 1;
8871
8872out:
8873 /* Move the next balance forward */
8874 if (time_after(this_rq->next_balance, next_balance))
8875 this_rq->next_balance = next_balance;
8876
8877 /* Is there a task of a high priority class? */
8878 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
8879 pulled_task = -1;
8880
8881 if (pulled_task)
8882 this_rq->idle_stamp = 0;
8883
8884 rq_repin_lock(this_rq, rf);
8885
8886 return pulled_task;
8887}
8888
8889/*
8890 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
8891 * running tasks off the busiest CPU onto idle CPUs. It requires at 9121 * running tasks off the busiest CPU onto idle CPUs. It requires at
8892 * least 1 task to be running on each physical CPU where possible, and 9122 * least 1 task to be running on each physical CPU where possible, and
8893 * avoids physical / logical imbalances. 9123 * avoids physical / logical imbalances.
@@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data)
8911 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 9141 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8912 goto out_unlock; 9142 goto out_unlock;
8913 9143
8914 /* make sure the requested cpu hasn't gone down in the meantime */ 9144 /* Make sure the requested CPU hasn't gone down in the meantime: */
8915 if (unlikely(busiest_cpu != smp_processor_id() || 9145 if (unlikely(busiest_cpu != smp_processor_id() ||
8916 !busiest_rq->active_balance)) 9146 !busiest_rq->active_balance))
8917 goto out_unlock; 9147 goto out_unlock;
@@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data)
8923 /* 9153 /*
8924 * This condition is "impossible", if it occurs 9154 * This condition is "impossible", if it occurs
8925 * we need to fix it. Originally reported by 9155 * we need to fix it. Originally reported by
8926 * Bjorn Helgaas on a 128-cpu setup. 9156 * Bjorn Helgaas on a 128-CPU setup.
8927 */ 9157 */
8928 BUG_ON(busiest_rq == target_rq); 9158 BUG_ON(busiest_rq == target_rq);
8929 9159
@@ -8977,141 +9207,6 @@ out_unlock:
8977 return 0; 9207 return 0;
8978} 9208}
8979 9209
8980static inline int on_null_domain(struct rq *rq)
8981{
8982 return unlikely(!rcu_dereference_sched(rq->sd));
8983}
8984
8985#ifdef CONFIG_NO_HZ_COMMON
8986/*
8987 * idle load balancing details
8988 * - When one of the busy CPUs notice that there may be an idle rebalancing
8989 * needed, they will kick the idle load balancer, which then does idle
8990 * load balancing for all the idle CPUs.
8991 */
8992static struct {
8993 cpumask_var_t idle_cpus_mask;
8994 atomic_t nr_cpus;
8995 unsigned long next_balance; /* in jiffy units */
8996} nohz ____cacheline_aligned;
8997
8998static inline int find_new_ilb(void)
8999{
9000 int ilb = cpumask_first(nohz.idle_cpus_mask);
9001
9002 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9003 return ilb;
9004
9005 return nr_cpu_ids;
9006}
9007
9008/*
9009 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9010 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9011 * CPU (if there is one).
9012 */
9013static void nohz_balancer_kick(void)
9014{
9015 int ilb_cpu;
9016
9017 nohz.next_balance++;
9018
9019 ilb_cpu = find_new_ilb();
9020
9021 if (ilb_cpu >= nr_cpu_ids)
9022 return;
9023
9024 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
9025 return;
9026 /*
9027 * Use smp_send_reschedule() instead of resched_cpu().
9028 * This way we generate a sched IPI on the target cpu which
9029 * is idle. And the softirq performing nohz idle load balance
9030 * will be run before returning from the IPI.
9031 */
9032 smp_send_reschedule(ilb_cpu);
9033 return;
9034}
9035
9036void nohz_balance_exit_idle(unsigned int cpu)
9037{
9038 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
9039 /*
9040 * Completely isolated CPUs don't ever set, so we must test.
9041 */
9042 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9043 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9044 atomic_dec(&nohz.nr_cpus);
9045 }
9046 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9047 }
9048}
9049
9050static inline void set_cpu_sd_state_busy(void)
9051{
9052 struct sched_domain *sd;
9053 int cpu = smp_processor_id();
9054
9055 rcu_read_lock();
9056 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9057
9058 if (!sd || !sd->nohz_idle)
9059 goto unlock;
9060 sd->nohz_idle = 0;
9061
9062 atomic_inc(&sd->shared->nr_busy_cpus);
9063unlock:
9064 rcu_read_unlock();
9065}
9066
9067void set_cpu_sd_state_idle(void)
9068{
9069 struct sched_domain *sd;
9070 int cpu = smp_processor_id();
9071
9072 rcu_read_lock();
9073 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9074
9075 if (!sd || sd->nohz_idle)
9076 goto unlock;
9077 sd->nohz_idle = 1;
9078
9079 atomic_dec(&sd->shared->nr_busy_cpus);
9080unlock:
9081 rcu_read_unlock();
9082}
9083
9084/*
9085 * This routine will record that the cpu is going idle with tick stopped.
9086 * This info will be used in performing idle load balancing in the future.
9087 */
9088void nohz_balance_enter_idle(int cpu)
9089{
9090 /*
9091 * If this cpu is going down, then nothing needs to be done.
9092 */
9093 if (!cpu_active(cpu))
9094 return;
9095
9096 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9097 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9098 return;
9099
9100 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9101 return;
9102
9103 /*
9104 * If we're a completely isolated CPU, we don't play.
9105 */
9106 if (on_null_domain(cpu_rq(cpu)))
9107 return;
9108
9109 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9110 atomic_inc(&nohz.nr_cpus);
9111 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9112}
9113#endif
9114
9115static DEFINE_SPINLOCK(balancing); 9210static DEFINE_SPINLOCK(balancing);
9116 9211
9117/* 9212/*
@@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9141 int need_serialize, need_decay = 0; 9236 int need_serialize, need_decay = 0;
9142 u64 max_cost = 0; 9237 u64 max_cost = 0;
9143 9238
9144 update_blocked_averages(cpu);
9145
9146 rcu_read_lock(); 9239 rcu_read_lock();
9147 for_each_domain(cpu, sd) { 9240 for_each_domain(cpu, sd) {
9148 /* 9241 /*
@@ -9232,68 +9325,56 @@ out:
9232 } 9325 }
9233} 9326}
9234 9327
9328static inline int on_null_domain(struct rq *rq)
9329{
9330 return unlikely(!rcu_dereference_sched(rq->sd));
9331}
9332
9235#ifdef CONFIG_NO_HZ_COMMON 9333#ifdef CONFIG_NO_HZ_COMMON
9236/* 9334/*
9237 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9335 * idle load balancing details
9238 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9336 * - When one of the busy CPUs notices that idle rebalancing may be
 9337 * needed, it will kick the idle load balancer, which then does idle
 9338 * load balancing for all the idle CPUs.
9239 */ 9339 */
9240static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9241{
9242 int this_cpu = this_rq->cpu;
9243 struct rq *rq;
9244 int balance_cpu;
9245 /* Earliest time when we have to do rebalance again */
9246 unsigned long next_balance = jiffies + 60*HZ;
9247 int update_next_balance = 0;
9248 9340
9249 if (idle != CPU_IDLE || 9341static inline int find_new_ilb(void)
9250 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 9342{
9251 goto end; 9343 int ilb = cpumask_first(nohz.idle_cpus_mask);
9252 9344
9253 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 9345 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9254 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) 9346 return ilb;
9255 continue;
9256 9347
9257 /* 9348 return nr_cpu_ids;
9258 * If this cpu gets work to do, stop the load balancing 9349}
9259 * work being done for other cpus. Next load
9260 * balancing owner will pick it up.
9261 */
9262 if (need_resched())
9263 break;
9264 9350
9265 rq = cpu_rq(balance_cpu); 9351/*
9352 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 9353 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
9354 * CPU (if there is one).
9355 */
9356static void kick_ilb(unsigned int flags)
9357{
9358 int ilb_cpu;
9266 9359
9267 /* 9360 nohz.next_balance++;
9268 * If time for next balance is due,
9269 * do the balance.
9270 */
9271 if (time_after_eq(jiffies, rq->next_balance)) {
9272 struct rq_flags rf;
9273 9361
9274 rq_lock_irq(rq, &rf); 9362 ilb_cpu = find_new_ilb();
9275 update_rq_clock(rq);
9276 cpu_load_update_idle(rq);
9277 rq_unlock_irq(rq, &rf);
9278 9363
9279 rebalance_domains(rq, CPU_IDLE); 9364 if (ilb_cpu >= nr_cpu_ids)
9280 } 9365 return;
9281 9366
9282 if (time_after(next_balance, rq->next_balance)) { 9367 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9283 next_balance = rq->next_balance; 9368 if (flags & NOHZ_KICK_MASK)
9284 update_next_balance = 1; 9369 return;
9285 }
9286 }
9287 9370
9288 /* 9371 /*
9289 * next_balance will be updated only when there is a need. 9372 * Use smp_send_reschedule() instead of resched_cpu().
9290 * When the CPU is attached to null domain for ex, it will not be 9373 * This way we generate a sched IPI on the target CPU which
9291 * updated. 9374 * is idle. And the softirq performing nohz idle load balance
9375 * will be run before returning from the IPI.
9292 */ 9376 */
9293 if (likely(update_next_balance)) 9377 smp_send_reschedule(ilb_cpu);
9294 nohz.next_balance = next_balance;
9295end:
9296 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
9297} 9378}
9298 9379
9299/* 9380/*
@@ -9307,36 +9388,41 @@ end:
9307 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9388 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9308 * domain span are idle. 9389 * domain span are idle.
9309 */ 9390 */
9310static inline bool nohz_kick_needed(struct rq *rq) 9391static void nohz_balancer_kick(struct rq *rq)
9311{ 9392{
9312 unsigned long now = jiffies; 9393 unsigned long now = jiffies;
9313 struct sched_domain_shared *sds; 9394 struct sched_domain_shared *sds;
9314 struct sched_domain *sd; 9395 struct sched_domain *sd;
9315 int nr_busy, i, cpu = rq->cpu; 9396 int nr_busy, i, cpu = rq->cpu;
9316 bool kick = false; 9397 unsigned int flags = 0;
9317 9398
9318 if (unlikely(rq->idle_balance)) 9399 if (unlikely(rq->idle_balance))
9319 return false; 9400 return;
9320 9401
9321 /* 9402 /*
9322 * We may be recently in ticked or tickless idle mode. At the first 9403 * We may be recently in ticked or tickless idle mode. At the first
9323 * busy tick after returning from idle, we will update the busy stats. 9404 * busy tick after returning from idle, we will update the busy stats.
9324 */ 9405 */
9325 set_cpu_sd_state_busy(); 9406 nohz_balance_exit_idle(rq);
9326 nohz_balance_exit_idle(cpu);
9327 9407
9328 /* 9408 /*
9329 * None are in tickless mode and hence no need for NOHZ idle load 9409 * None are in tickless mode and hence no need for NOHZ idle load
9330 * balancing. 9410 * balancing.
9331 */ 9411 */
9332 if (likely(!atomic_read(&nohz.nr_cpus))) 9412 if (likely(!atomic_read(&nohz.nr_cpus)))
9333 return false; 9413 return;
9414
9415 if (READ_ONCE(nohz.has_blocked) &&
9416 time_after(now, READ_ONCE(nohz.next_blocked)))
9417 flags = NOHZ_STATS_KICK;
9334 9418
9335 if (time_before(now, nohz.next_balance)) 9419 if (time_before(now, nohz.next_balance))
9336 return false; 9420 goto out;
9337 9421
9338 if (rq->nr_running >= 2) 9422 if (rq->nr_running >= 2) {
9339 return true; 9423 flags = NOHZ_KICK_MASK;
9424 goto out;
9425 }
9340 9426
9341 rcu_read_lock(); 9427 rcu_read_lock();
9342 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 9428 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
@@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
9347 */ 9433 */
9348 nr_busy = atomic_read(&sds->nr_busy_cpus); 9434 nr_busy = atomic_read(&sds->nr_busy_cpus);
9349 if (nr_busy > 1) { 9435 if (nr_busy > 1) {
9350 kick = true; 9436 flags = NOHZ_KICK_MASK;
9351 goto unlock; 9437 goto unlock;
9352 } 9438 }
9353 9439
@@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
9357 if (sd) { 9443 if (sd) {
9358 if ((rq->cfs.h_nr_running >= 1) && 9444 if ((rq->cfs.h_nr_running >= 1) &&
9359 check_cpu_capacity(rq, sd)) { 9445 check_cpu_capacity(rq, sd)) {
9360 kick = true; 9446 flags = NOHZ_KICK_MASK;
9361 goto unlock; 9447 goto unlock;
9362 } 9448 }
9363 } 9449 }
@@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq)
9370 continue; 9456 continue;
9371 9457
9372 if (sched_asym_prefer(i, cpu)) { 9458 if (sched_asym_prefer(i, cpu)) {
9373 kick = true; 9459 flags = NOHZ_KICK_MASK;
9374 goto unlock; 9460 goto unlock;
9375 } 9461 }
9376 } 9462 }
9377 } 9463 }
9378unlock: 9464unlock:
9379 rcu_read_unlock(); 9465 rcu_read_unlock();
9380 return kick; 9466out:
9467 if (flags)
9468 kick_ilb(flags);
9469}
9470
9471static void set_cpu_sd_state_busy(int cpu)
9472{
9473 struct sched_domain *sd;
9474
9475 rcu_read_lock();
9476 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9477
9478 if (!sd || !sd->nohz_idle)
9479 goto unlock;
9480 sd->nohz_idle = 0;
9481
9482 atomic_inc(&sd->shared->nr_busy_cpus);
9483unlock:
9484 rcu_read_unlock();
9485}
9486
9487void nohz_balance_exit_idle(struct rq *rq)
9488{
9489 SCHED_WARN_ON(rq != this_rq());
9490
9491 if (likely(!rq->nohz_tick_stopped))
9492 return;
9493
9494 rq->nohz_tick_stopped = 0;
9495 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9496 atomic_dec(&nohz.nr_cpus);
9497
9498 set_cpu_sd_state_busy(rq->cpu);
9499}
9500
9501static void set_cpu_sd_state_idle(int cpu)
9502{
9503 struct sched_domain *sd;
9504
9505 rcu_read_lock();
9506 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9507
9508 if (!sd || sd->nohz_idle)
9509 goto unlock;
9510 sd->nohz_idle = 1;
9511
9512 atomic_dec(&sd->shared->nr_busy_cpus);
9513unlock:
9514 rcu_read_unlock();
9515}
9516
9517/*
9518 * This routine will record that the CPU is going idle with tick stopped.
9519 * This info will be used in performing idle load balancing in the future.
9520 */
9521void nohz_balance_enter_idle(int cpu)
9522{
9523 struct rq *rq = cpu_rq(cpu);
9524
9525 SCHED_WARN_ON(cpu != smp_processor_id());
9526
9527 /* If this CPU is going down, then nothing needs to be done: */
9528 if (!cpu_active(cpu))
9529 return;
9530
9531 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
9532 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9533 return;
9534
9535 /*
 9536 * Can be set safely without rq->lock held.
 9537 * If a clear happens, it will have evaluated the last additions, because
 9538 * rq->lock is held during both the check and the clear.
9539 */
9540 rq->has_blocked_load = 1;
9541
9542 /*
9543 * The tick is still stopped but load could have been added in the
 9544 * meantime. We set the nohz.has_blocked flag to trigger a check of the
 9545 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
 9546 * of nohz.has_blocked can only happen after checking the new load.
9547 */
9548 if (rq->nohz_tick_stopped)
9549 goto out;
9550
9551 /* If we're a completely isolated CPU, we don't play: */
9552 if (on_null_domain(rq))
9553 return;
9554
9555 rq->nohz_tick_stopped = 1;
9556
9557 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9558 atomic_inc(&nohz.nr_cpus);
9559
9560 /*
9561 * Ensures that if nohz_idle_balance() fails to observe our
9562 * @idle_cpus_mask store, it must observe the @has_blocked
9563 * store.
9564 */
9565 smp_mb__after_atomic();
9566
9567 set_cpu_sd_state_idle(cpu);
9568
9569out:
9570 /*
 9571 * Each time a CPU enters idle, we assume that it has blocked load and
 9572 * enable the periodic update of the load of idle CPUs.
9573 */
9574 WRITE_ONCE(nohz.has_blocked, 1);
9575}
9576
9577/*
 9578 * Internal function that runs load balance for all idle CPUs. The load balance
 9579 * can be a simple update of blocked load or a complete load balance with
 9580 * task movement, depending on the flags.
9581 * The function returns false if the loop has stopped before running
9582 * through all idle CPUs.
9583 */
9584static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9585 enum cpu_idle_type idle)
9586{
9587 /* Earliest time when we have to do rebalance again */
9588 unsigned long now = jiffies;
9589 unsigned long next_balance = now + 60*HZ;
9590 bool has_blocked_load = false;
9591 int update_next_balance = 0;
9592 int this_cpu = this_rq->cpu;
9593 int balance_cpu;
9594 int ret = false;
9595 struct rq *rq;
9596
9597 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9598
9599 /*
9600 * We assume there will be no idle load after this update and clear
 9601 * the has_blocked flag. If a CPU enters idle in the meantime, it will
 9602 * set the has_blocked flag and trigger another update of idle load.
 9603 * Because a CPU that becomes idle is added to idle_cpus_mask before
 9604 * setting the flag, we are sure not to clear the state and not to
 9605 * check the load of an idle CPU.
9606 */
9607 WRITE_ONCE(nohz.has_blocked, 0);
9608
9609 /*
9610 * Ensures that if we miss the CPU, we must see the has_blocked
9611 * store from nohz_balance_enter_idle().
9612 */
9613 smp_mb();
9614
9615 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9616 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9617 continue;
9618
9619 /*
9620 * If this CPU gets work to do, stop the load balancing
9621 * work being done for other CPUs. Next load
9622 * balancing owner will pick it up.
9623 */
9624 if (need_resched()) {
9625 has_blocked_load = true;
9626 goto abort;
9627 }
9628
9629 rq = cpu_rq(balance_cpu);
9630
9631 has_blocked_load |= update_nohz_stats(rq, true);
9632
9633 /*
9634 * If time for next balance is due,
9635 * do the balance.
9636 */
9637 if (time_after_eq(jiffies, rq->next_balance)) {
9638 struct rq_flags rf;
9639
9640 rq_lock_irqsave(rq, &rf);
9641 update_rq_clock(rq);
9642 cpu_load_update_idle(rq);
9643 rq_unlock_irqrestore(rq, &rf);
9644
9645 if (flags & NOHZ_BALANCE_KICK)
9646 rebalance_domains(rq, CPU_IDLE);
9647 }
9648
9649 if (time_after(next_balance, rq->next_balance)) {
9650 next_balance = rq->next_balance;
9651 update_next_balance = 1;
9652 }
9653 }
9654
9655 /* Newly idle CPU doesn't need an update */
9656 if (idle != CPU_NEWLY_IDLE) {
9657 update_blocked_averages(this_cpu);
9658 has_blocked_load |= this_rq->has_blocked_load;
9659 }
9660
9661 if (flags & NOHZ_BALANCE_KICK)
9662 rebalance_domains(this_rq, CPU_IDLE);
9663
9664 WRITE_ONCE(nohz.next_blocked,
9665 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9666
9667 /* The full idle balance loop has been done */
9668 ret = true;
9669
9670abort:
9671 /* There is still blocked load, enable periodic update */
9672 if (has_blocked_load)
9673 WRITE_ONCE(nohz.has_blocked, 1);
9674
9675 /*
9676 * next_balance will be updated only when there is a need.
 9677 * When the CPU is attached to a null domain, for example, it will not be
9678 * updated.
9679 */
9680 if (likely(update_next_balance))
9681 nohz.next_balance = next_balance;
9682
9683 return ret;
9684}
9685
9686/*
 9687 * In the CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
 9688 * rebalancing for all the CPUs for which scheduler ticks are stopped.
9689 */
9690static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9691{
9692 int this_cpu = this_rq->cpu;
9693 unsigned int flags;
9694
9695 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9696 return false;
9697
9698 if (idle != CPU_IDLE) {
9699 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9700 return false;
9701 }
9702
9703 /*
9704 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
9705 */
9706 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9707 if (!(flags & NOHZ_KICK_MASK))
9708 return false;
9709
9710 _nohz_idle_balance(this_rq, flags, idle);
9711
9712 return true;
9713}
9714
9715static void nohz_newidle_balance(struct rq *this_rq)
9716{
9717 int this_cpu = this_rq->cpu;
9718
9719 /*
9720 * This CPU doesn't want to be disturbed by scheduler
9721 * housekeeping
9722 */
9723 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9724 return;
9725
 9726 /* Will wake up very soon. No time for doing anything else */
9727 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9728 return;
9729
 9730 /* Don't need to update blocked load of idle CPUs */
9731 if (!READ_ONCE(nohz.has_blocked) ||
9732 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9733 return;
9734
9735 raw_spin_unlock(&this_rq->lock);
9736 /*
 9737 * This CPU is going to be idle and the blocked load of idle CPUs
 9738 * needs to be updated. Run the ilb locally as it is a good
 9739 * candidate for ilb instead of waking up another idle CPU.
 9740 * Kick a normal ilb if we failed to do the update.
9741 */
9742 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9743 kick_ilb(NOHZ_STATS_KICK);
9744 raw_spin_lock(&this_rq->lock);
9745}
9746
9747#else /* !CONFIG_NO_HZ_COMMON */
9748static inline void nohz_balancer_kick(struct rq *rq) { }
9749
9750static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9751{
9752 return false;
9753}
9754
9755static inline void nohz_newidle_balance(struct rq *this_rq) { }
9756#endif /* CONFIG_NO_HZ_COMMON */
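
The has_blocked handshake above is subtle: nohz_balance_enter_idle() publishes the CPU in nohz.idle_cpus_mask before setting nohz.has_blocked, while _nohz_idle_balance() clears has_blocked before scanning the mask, with full barriers on both sides. The effect is that a newly idle CPU is either seen during the scan or leaves has_blocked set for a later pass. A minimal user-space model of that ordering is sketched below; it is an illustration only, with seq_cst atomics standing in for the kernel's barriers and a single flag standing in for the cpumask.

/*
 * Toy model of the nohz.has_blocked handshake -- not kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int idle_mask;    /* models nohz.idle_cpus_mask (one bit) */
static atomic_int has_blocked;  /* models nohz.has_blocked              */

/* Models nohz_balance_enter_idle(): publish the CPU, then the flag. */
static void enter_idle(void)
{
        atomic_store(&idle_mask, 1);    /* cpumask_set_cpu() + smp_mb__after_atomic() */
        atomic_store(&has_blocked, 1);  /* WRITE_ONCE(nohz.has_blocked, 1)            */
}

/* Models _nohz_idle_balance(): clear the flag, then scan the mask. */
static void idle_balance_pass(void)
{
        atomic_store(&has_blocked, 0);  /* WRITE_ONCE(nohz.has_blocked, 0), then smp_mb() */

        if (atomic_load(&idle_mask)) {
                /* CPU observed: its blocked load gets decayed in this pass. */
                printf("CPU seen in mask, stats updated\n");
        } else {
                /* CPU missed: its has_blocked store must be observed instead,
                 * so a later pass will pick it up. */
                printf("CPU missed, has_blocked=%d\n", atomic_load(&has_blocked));
        }
}

int main(void)
{
        enter_idle();
        idle_balance_pass();
        return 0;
}

In the kernel, the same guarantee is what lets _nohz_idle_balance() clear nohz.has_blocked up front without losing newly idle CPUs.
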
9757
9758/*
9759 * idle_balance is called by schedule() if this_cpu is about to become
9760 * idle. Attempts to pull tasks from other CPUs.
9761 */
9762static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9763{
9764 unsigned long next_balance = jiffies + HZ;
9765 int this_cpu = this_rq->cpu;
9766 struct sched_domain *sd;
9767 int pulled_task = 0;
9768 u64 curr_cost = 0;
9769
9770 /*
9771 * We must set idle_stamp _before_ calling idle_balance(), such that we
9772 * measure the duration of idle_balance() as idle time.
9773 */
9774 this_rq->idle_stamp = rq_clock(this_rq);
9775
9776 /*
9777 * Do not pull tasks towards !active CPUs...
9778 */
9779 if (!cpu_active(this_cpu))
9780 return 0;
9781
9782 /*
9783 * This is OK, because current is on_cpu, which avoids it being picked
9784 * for load-balance and preemption/IRQs are still disabled avoiding
9785 * further scheduler activity on it and we're being very careful to
9786 * re-start the picking loop.
9787 */
9788 rq_unpin_lock(this_rq, rf);
9789
9790 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9791 !this_rq->rd->overload) {
9792
9793 rcu_read_lock();
9794 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9795 if (sd)
9796 update_next_balance(sd, &next_balance);
9797 rcu_read_unlock();
9798
9799 nohz_newidle_balance(this_rq);
9800
9801 goto out;
9802 }
9803
9804 raw_spin_unlock(&this_rq->lock);
9805
9806 update_blocked_averages(this_cpu);
9807 rcu_read_lock();
9808 for_each_domain(this_cpu, sd) {
9809 int continue_balancing = 1;
9810 u64 t0, domain_cost;
9811
9812 if (!(sd->flags & SD_LOAD_BALANCE))
9813 continue;
9814
9815 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9816 update_next_balance(sd, &next_balance);
9817 break;
9818 }
9819
9820 if (sd->flags & SD_BALANCE_NEWIDLE) {
9821 t0 = sched_clock_cpu(this_cpu);
9822
9823 pulled_task = load_balance(this_cpu, this_rq,
9824 sd, CPU_NEWLY_IDLE,
9825 &continue_balancing);
9826
9827 domain_cost = sched_clock_cpu(this_cpu) - t0;
9828 if (domain_cost > sd->max_newidle_lb_cost)
9829 sd->max_newidle_lb_cost = domain_cost;
9830
9831 curr_cost += domain_cost;
9832 }
9833
9834 update_next_balance(sd, &next_balance);
9835
9836 /*
9837 * Stop searching for tasks to pull if there are
9838 * now runnable tasks on this rq.
9839 */
9840 if (pulled_task || this_rq->nr_running > 0)
9841 break;
9842 }
9843 rcu_read_unlock();
9844
9845 raw_spin_lock(&this_rq->lock);
9846
9847 if (curr_cost > this_rq->max_idle_balance_cost)
9848 this_rq->max_idle_balance_cost = curr_cost;
9849
9850 /*
 9851 * While browsing the domains, we released the rq lock; a task could
9852 * have been enqueued in the meantime. Since we're not going idle,
9853 * pretend we pulled a task.
9854 */
9855 if (this_rq->cfs.h_nr_running && !pulled_task)
9856 pulled_task = 1;
9857
9858out:
9859 /* Move the next balance forward */
9860 if (time_after(this_rq->next_balance, next_balance))
9861 this_rq->next_balance = next_balance;
9862
9863 /* Is there a task of a high priority class? */
9864 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9865 pulled_task = -1;
9866
9867 if (pulled_task)
9868 this_rq->idle_stamp = 0;
9869
9870 rq_repin_lock(this_rq, rf);
9871
9872 return pulled_task;
9381} 9873}
9382#else
9383static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9384#endif
9385 9874
9386/* 9875/*
9387 * run_rebalance_domains is triggered when needed from the scheduler tick. 9876 * run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9394 CPU_IDLE : CPU_NOT_IDLE; 9883 CPU_IDLE : CPU_NOT_IDLE;
9395 9884
9396 /* 9885 /*
9397 * If this cpu has a pending nohz_balance_kick, then do the 9886 * If this CPU has a pending nohz_balance_kick, then do the
9398 * balancing on behalf of the other idle cpus whose ticks are 9887 * balancing on behalf of the other idle CPUs whose ticks are
9399 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9888 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9400 * give the idle cpus a chance to load balance. Else we may 9889 * give the idle CPUs a chance to load balance. Else we may
9401 * load balance only within the local sched_domain hierarchy 9890 * load balance only within the local sched_domain hierarchy
9402 * and abort nohz_idle_balance altogether if we pull some load. 9891 * and abort nohz_idle_balance altogether if we pull some load.
9403 */ 9892 */
9404 nohz_idle_balance(this_rq, idle); 9893 if (nohz_idle_balance(this_rq, idle))
9894 return;
9895
9896 /* normal load balance */
9897 update_blocked_averages(this_rq->cpu);
9405 rebalance_domains(this_rq, idle); 9898 rebalance_domains(this_rq, idle);
9406} 9899}
9407 9900
@@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq)
9416 9909
9417 if (time_after_eq(jiffies, rq->next_balance)) 9910 if (time_after_eq(jiffies, rq->next_balance))
9418 raise_softirq(SCHED_SOFTIRQ); 9911 raise_softirq(SCHED_SOFTIRQ);
9419#ifdef CONFIG_NO_HZ_COMMON 9912
9420 if (nohz_kick_needed(rq)) 9913 nohz_balancer_kick(rq);
9421 nohz_balancer_kick();
9422#endif
9423} 9914}
9424 9915
9425static void rq_online_fair(struct rq *rq) 9916static void rq_online_fair(struct rq *rq)
@@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq)
9440#endif /* CONFIG_SMP */ 9931#endif /* CONFIG_SMP */
9441 9932
9442/* 9933/*
9443 * scheduler tick hitting a task of our scheduling class: 9934 * scheduler tick hitting a task of our scheduling class.
9935 *
9936 * NOTE: This function can be called remotely by the tick offload that
9937 * goes along full dynticks. Therefore no local assumption can be made
9938 * and everything must be accessed through the @rq and @curr passed in
9939 * parameters.
9444 */ 9940 */
9445static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 9941static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9446{ 9942{
@@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
9591 10087
9592 /* Synchronize entity with its cfs_rq */ 10088 /* Synchronize entity with its cfs_rq */
9593 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); 10089 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
9594 attach_entity_load_avg(cfs_rq, se); 10090 attach_entity_load_avg(cfs_rq, se, 0);
9595 update_tg_load_avg(cfs_rq, false); 10091 update_tg_load_avg(cfs_rq, false);
9596 propagate_entity_cfs_rq(se); 10092 propagate_entity_cfs_rq(se);
9597} 10093}
@@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void)
9993 10489
9994#ifdef CONFIG_NO_HZ_COMMON 10490#ifdef CONFIG_NO_HZ_COMMON
9995 nohz.next_balance = jiffies; 10491 nohz.next_balance = jiffies;
10492 nohz.next_blocked = jiffies;
9996 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 10493 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
9997#endif 10494#endif
9998#endif /* SMP */ 10495#endif /* SMP */
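
Taken together, trigger_load_balance() -> nohz_balancer_kick() -> kick_ilb() on the busy side and nohz_idle_balance() on the idle side form a small flag protocol: the kicker ORs its request into the target's nohz flags and only sends an IPI if nothing was pending, and the target fetches-and-clears the flags before acting on them. The sketch below models just that exchange in user space; the flag values and helpers are illustrative stand-ins, not the kernel's definitions.

#include <stdatomic.h>
#include <stdio.h>

#define NOHZ_STATS_KICK    0x1   /* update blocked load only */
#define NOHZ_BALANCE_KICK  0x2   /* full idle load balance   */
#define NOHZ_KICK_MASK     (NOHZ_STATS_KICK | NOHZ_BALANCE_KICK)

static atomic_uint nohz_flags;   /* per-CPU in the kernel; a single one here */

static void kick_ilb(unsigned int flags)
{
        unsigned int prev = atomic_fetch_or(&nohz_flags, flags);

        if (prev & NOHZ_KICK_MASK)
                return;          /* a kick is already pending: no new IPI */

        printf("send IPI to the idle load balancer\n");
}

static void nohz_idle_balance(void)
{
        unsigned int flags = atomic_fetch_and(&nohz_flags, ~NOHZ_KICK_MASK);

        if (!(flags & NOHZ_KICK_MASK))
                return;

        if (flags & NOHZ_BALANCE_KICK)
                printf("full rebalance for idle CPUs\n");
        else
                printf("blocked-load stats update only\n");
}

int main(void)
{
        kick_ilb(NOHZ_STATS_KICK);
        kick_ilb(NOHZ_BALANCE_KICK);   /* merged into the already pending kick */
        nohz_idle_balance();
        return 0;
}

Running it sends one "IPI" for the first kick and merges the second, which is why a NOHZ_STATS_KICK request can piggy-back on an already pending balance kick.
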
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..85ae8488039c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
85SCHED_FEAT(WA_IDLE, true) 85SCHED_FEAT(WA_IDLE, true)
86SCHED_FEAT(WA_WEIGHT, true) 86SCHED_FEAT(WA_WEIGHT, true)
87SCHED_FEAT(WA_BIAS, true) 87SCHED_FEAT(WA_BIAS, true)
88
89/*
90 * UtilEstimation. Use estimated CPU utilization.
91 */
92SCHED_FEAT(UTIL_EST, true)
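
UTIL_EST gates the util_est changes in the fair.c hunks above: when the feature is enabled, the utilization used for wake-up placement and OPP selection is, roughly, the maximum of the PELT util_avg and the estimated utilization of enqueued tasks, so a short sleep does not immediately drop the apparent demand. A hedged sketch of that idea follows; the helper and field names are invented for the example and are not the kernel's exact code.

#include <stdio.h>

struct cfs_util {
        unsigned long util_avg;       /* PELT utilization               */
        unsigned long util_est;       /* estimated (enqueued) util      */
};

static int util_est_enabled = 1;      /* stands in for sched_feat(UTIL_EST) */

static unsigned long cpu_util_sketch(const struct cfs_util *cfs)
{
        unsigned long util = cfs->util_avg;

        /* Don't let a brief sleep collapse the utilization estimate: */
        if (util_est_enabled && cfs->util_est > util)
                util = cfs->util_est;

        return util;
}

int main(void)
{
        struct cfs_util cfs = { .util_avg = 120, .util_est = 300 };

        printf("util used for placement/OPP: %lu\n", cpu_util_sketch(&cfs));
        return 0;
}
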
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..2975f195e1c4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,14 @@
1/* 1/*
2 * Generic entry point for the idle threads 2 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class.
4 *
5 * (NOTE: these are not related to SCHED_IDLE batch scheduled
 6 * tasks which are handled in sched/fair.c)
3 */ 7 */
4#include <linux/sched.h> 8#include "sched.h"
5#include <linux/sched/idle.h>
6#include <linux/cpu.h>
7#include <linux/cpuidle.h>
8#include <linux/cpuhotplug.h>
9#include <linux/tick.h>
10#include <linux/mm.h>
11#include <linux/stackprotector.h>
12#include <linux/suspend.h>
13#include <linux/livepatch.h>
14
15#include <asm/tlb.h>
16 9
17#include <trace/events/power.h> 10#include <trace/events/power.h>
18 11
19#include "sched.h"
20
21/* Linker adds these: start and end of __cpuidle functions */ 12/* Linker adds these: start and end of __cpuidle functions */
22extern char __cpuidle_text_start[], __cpuidle_text_end[]; 13extern char __cpuidle_text_start[], __cpuidle_text_end[];
23 14
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
46static int __init cpu_idle_poll_setup(char *__unused) 37static int __init cpu_idle_poll_setup(char *__unused)
47{ 38{
48 cpu_idle_force_poll = 1; 39 cpu_idle_force_poll = 1;
40
49 return 1; 41 return 1;
50} 42}
51__setup("nohlt", cpu_idle_poll_setup); 43__setup("nohlt", cpu_idle_poll_setup);
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
53static int __init cpu_idle_nopoll_setup(char *__unused) 45static int __init cpu_idle_nopoll_setup(char *__unused)
54{ 46{
55 cpu_idle_force_poll = 0; 47 cpu_idle_force_poll = 0;
48
56 return 1; 49 return 1;
57} 50}
58__setup("hlt", cpu_idle_nopoll_setup); 51__setup("hlt", cpu_idle_nopoll_setup);
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
64 trace_cpu_idle_rcuidle(0, smp_processor_id()); 57 trace_cpu_idle_rcuidle(0, smp_processor_id());
65 local_irq_enable(); 58 local_irq_enable();
66 stop_critical_timings(); 59 stop_critical_timings();
60
67 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
68 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
69 cpu_relax(); 63 cpu_relax();
70 start_critical_timings(); 64 start_critical_timings();
71 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
72 rcu_idle_exit(); 66 rcu_idle_exit();
67
73 return 1; 68 return 1;
74} 69}
75 70
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
332{ 327{
333 /* 328 /*
334 * This #ifdef needs to die, but it's too late in the cycle to 329 * This #ifdef needs to die, but it's too late in the cycle to
335 * make this generic (arm and sh have never invoked the canary 330 * make this generic (ARM and SH have never invoked the canary
336 * init for the non boot cpus!). Will be fixed in 3.11 331 * init for the non boot CPUs!). Will be fixed in 3.11
337 */ 332 */
338#ifdef CONFIG_X86 333#ifdef CONFIG_X86
339 /* 334 /*
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
350 while (1) 345 while (1)
351 do_idle(); 346 do_idle();
352} 347}
348
349/*
350 * idle-task scheduling class.
351 */
352
353#ifdef CONFIG_SMP
354static int
355select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
356{
 357 return task_cpu(p); /* IDLE tasks are never migrated */
358}
359#endif
360
361/*
362 * Idle tasks are unconditionally rescheduled:
363 */
364static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
365{
366 resched_curr(rq);
367}
368
369static struct task_struct *
370pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
371{
372 put_prev_task(rq, prev);
373 update_idle_core(rq);
374 schedstat_inc(rq->sched_goidle);
375
376 return rq->idle;
377}
378
379/*
380 * It is not legal to sleep in the idle task - print a warning
381 * message if some code attempts to do it:
382 */
383static void
384dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
385{
386 raw_spin_unlock_irq(&rq->lock);
387 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
388 dump_stack();
389 raw_spin_lock_irq(&rq->lock);
390}
391
392static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
393{
394}
395
396/*
397 * scheduler tick hitting a task of our scheduling class.
398 *
399 * NOTE: This function can be called remotely by the tick offload that
400 * goes along full dynticks. Therefore no local assumption can be made
401 * and everything must be accessed through the @rq and @curr passed in
402 * parameters.
403 */
404static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
405{
406}
407
408static void set_curr_task_idle(struct rq *rq)
409{
410}
411
412static void switched_to_idle(struct rq *rq, struct task_struct *p)
413{
414 BUG();
415}
416
417static void
418prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
419{
420 BUG();
421}
422
423static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
424{
425 return 0;
426}
427
428static void update_curr_idle(struct rq *rq)
429{
430}
431
432/*
433 * Simple, special scheduling class for the per-CPU idle tasks:
434 */
435const struct sched_class idle_sched_class = {
436 /* .next is NULL */
437 /* no enqueue/yield_task for idle tasks */
438
439 /* dequeue is not valid, we print a debug message there: */
440 .dequeue_task = dequeue_task_idle,
441
442 .check_preempt_curr = check_preempt_curr_idle,
443
444 .pick_next_task = pick_next_task_idle,
445 .put_prev_task = put_prev_task_idle,
446
447#ifdef CONFIG_SMP
448 .select_task_rq = select_task_rq_idle,
449 .set_cpus_allowed = set_cpus_allowed_common,
450#endif
451
452 .set_curr_task = set_curr_task_idle,
453 .task_tick = task_tick_idle,
454
455 .get_rr_interval = get_rr_interval_idle,
456
457 .prio_changed = prio_changed_idle,
458 .switched_to = switched_to_idle,
459 .update_curr = update_curr_idle,
460};
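
With idle_task.c folded into idle.c, the idle policy is just another table of callbacks that the core dispatches through, like every other scheduling class. The stripped-down sketch below shows that dispatch pattern in isolation; the struct layout and names are invented for the illustration, and the real struct sched_class carries many more hooks.

#include <stdio.h>

struct task;                              /* opaque for the sketch */

struct sched_class_sketch {
        struct task *(*pick_next_task)(void);
        void (*task_tick)(struct task *curr);
};

static struct task *pick_next_idle(void)
{
        printf("pick the per-CPU idle task\n");
        return NULL;                      /* the rq->idle task in the kernel */
}

static void tick_idle(struct task *curr)
{
        (void)curr;                       /* nothing to account for the idle task */
}

static const struct sched_class_sketch idle_class_sketch = {
        .pick_next_task = pick_next_idle,
        .task_tick      = tick_idle,
};

int main(void)
{
        /* The core only ever calls through the table: */
        idle_class_sketch.pick_next_task();
        idle_class_sketch.task_tick(NULL);
        return 0;
}
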
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index d518664cce4f..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,110 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/*
5 * idle-task scheduling class.
6 *
7 * (NOTE: these are not related to SCHED_IDLE tasks which are
8 * handled in sched/fair.c)
9 */
10
11#ifdef CONFIG_SMP
12static int
13select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
14{
15 return task_cpu(p); /* IDLE tasks as never migrated */
16}
17#endif /* CONFIG_SMP */
18
19/*
20 * Idle tasks are unconditionally rescheduled:
21 */
22static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
23{
24 resched_curr(rq);
25}
26
27static struct task_struct *
28pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
29{
30 put_prev_task(rq, prev);
31 update_idle_core(rq);
32 schedstat_inc(rq->sched_goidle);
33 return rq->idle;
34}
35
36/*
37 * It is not legal to sleep in the idle task - print a warning
38 * message if some code attempts to do it:
39 */
40static void
41dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
42{
43 raw_spin_unlock_irq(&rq->lock);
44 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
45 dump_stack();
46 raw_spin_lock_irq(&rq->lock);
47}
48
49static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
50{
51 rq_last_tick_reset(rq);
52}
53
54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_idle(struct rq *rq)
59{
60}
61
62static void switched_to_idle(struct rq *rq, struct task_struct *p)
63{
64 BUG();
65}
66
67static void
68prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
69{
70 BUG();
71}
72
73static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class = {
86 /* .next is NULL */
87 /* no enqueue/yield_task for idle tasks */
88
89 /* dequeue is not valid, we print a debug message there: */
90 .dequeue_task = dequeue_task_idle,
91
92 .check_preempt_curr = check_preempt_curr_idle,
93
94 .pick_next_task = pick_next_task_idle,
95 .put_prev_task = put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
100#endif
101
102 .set_curr_task = set_curr_task_idle,
103 .task_tick = task_tick_idle,
104
105 .get_rr_interval = get_rr_interval_idle,
106
107 .prio_changed = prio_changed_idle,
108 .switched_to = switched_to_idle,
109 .update_curr = update_curr_idle,
110};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,15 +3,10 @@
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
4 * 4 *
5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker 5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
6 * 7 *
7 */ 8 */
8 9#include "sched.h"
9#include <linux/sched/isolation.h>
10#include <linux/tick.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/static_key.h>
14#include <linux/ctype.h>
15 10
16DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 11DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
17EXPORT_SYMBOL_GPL(housekeeping_overriden); 12EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
60 55
61 static_branch_enable(&housekeeping_overriden); 56 static_branch_enable(&housekeeping_overriden);
62 57
58 if (housekeeping_flags & HK_FLAG_TICK)
59 sched_tick_offload_init();
60
63 /* We need at least one CPU to handle housekeeping work */ 61 /* We need at least one CPU to handle housekeeping work */
64 WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 62 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
65} 63}
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
119{ 117{
120 unsigned int flags; 118 unsigned int flags;
121 119
122 flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; 120 flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
123 121
124 return housekeeping_setup(str, flags); 122 return housekeeping_setup(str, flags);
125} 123}
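
Housekeeping is a per-CPU bitmask of duty classes: nohz_full CPUs now shed HK_FLAG_WQ along with the tick, and subsystems are expected to ask housekeeping_cpu(cpu, flag) before disturbing a CPU, as the fair.c hunks above do with HK_FLAG_SCHED. The toy model below shows that check with simplified flag values and a fixed per-CPU table; it is not the kernel's implementation.

#include <stdbool.h>
#include <stdio.h>

#define HK_FLAG_TICK   0x1
#define HK_FLAG_WQ     0x2
#define HK_FLAG_SCHED  0x4

#define NR_CPUS 4

/* CPUs 2 and 3 play the role of nohz_full CPUs: no housekeeping duties. */
static const unsigned int hk_flags[NR_CPUS] = {
        HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_SCHED,
        HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_SCHED,
        0,
        0,
};

static bool housekeeping_cpu_sketch(int cpu, unsigned int flag)
{
        return hk_flags[cpu] & flag;
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!housekeeping_cpu_sketch(cpu, HK_FLAG_SCHED)) {
                        printf("cpu%d: spare it idle load balancing\n", cpu);
                        continue;
                }
                printf("cpu%d: eligible for idle load balancing\n", cpu);
        }
        return 0;
}
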
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
6 * figure. Its a silly number but people think its important. We go through 6 * figure. Its a silly number but people think its important. We go through
7 * great pains to make it work on big machines and tickless kernels. 7 * great pains to make it work on big machines and tickless kernels.
8 */ 8 */
9
10#include <linux/export.h>
11#include <linux/sched/loadavg.h>
12
13#include "sched.h" 9#include "sched.h"
14 10
15/* 11/*
@@ -32,29 +28,29 @@
32 * Due to a number of reasons the above turns in the mess below: 28 * Due to a number of reasons the above turns in the mess below:
33 * 29 *
34 * - for_each_possible_cpu() is prohibitively expensive on machines with 30 * - for_each_possible_cpu() is prohibitively expensive on machines with
35 * serious number of cpus, therefore we need to take a distributed approach 31 * serious number of CPUs, therefore we need to take a distributed approach
36 * to calculating nr_active. 32 * to calculating nr_active.
37 * 33 *
38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 34 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 35 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
40 * 36 *
41 * So assuming nr_active := 0 when we start out -- true per definition, we 37 * So assuming nr_active := 0 when we start out -- true per definition, we
42 * can simply take per-cpu deltas and fold those into a global accumulate 38 * can simply take per-CPU deltas and fold those into a global accumulate
43 * to obtain the same result. See calc_load_fold_active(). 39 * to obtain the same result. See calc_load_fold_active().
44 * 40 *
45 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 41 * Furthermore, in order to avoid synchronizing all per-CPU delta folding
46 * across the machine, we assume 10 ticks is sufficient time for every 42 * across the machine, we assume 10 ticks is sufficient time for every
47 * cpu to have completed this task. 43 * CPU to have completed this task.
48 * 44 *
49 * This places an upper-bound on the IRQ-off latency of the machine. Then 45 * This places an upper-bound on the IRQ-off latency of the machine. Then
50 * again, being late doesn't lose the delta, just wrecks the sample. 46 * again, being late doesn't lose the delta, just wrecks the sample.
51 * 47 *
52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 48 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
53 * this would add another cross-cpu cacheline miss and atomic operation 49 * this would add another cross-CPU cacheline miss and atomic operation
54 * to the wakeup path. Instead we increment on whatever cpu the task ran 50 * to the wakeup path. Instead we increment on whatever CPU the task ran
55 * when it went into uninterruptible state and decrement on whatever cpu 51 * when it went into uninterruptible state and decrement on whatever CPU
56 * did the wakeup. This means that only the sum of nr_uninterruptible over 52 * did the wakeup. This means that only the sum of nr_uninterruptible over
57 * all cpus yields the correct result. 53 * all CPUs yields the correct result.
58 * 54 *
59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 55 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
60 */ 56 */
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
115 * Handle NO_HZ for the global load-average. 111 * Handle NO_HZ for the global load-average.
116 * 112 *
117 * Since the above described distributed algorithm to compute the global 113 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by 114 * load-average relies on per-CPU sampling from the tick, it is affected by
119 * NO_HZ. 115 * NO_HZ.
120 * 116 *
121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon 117 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 118 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
123 * when we read the global state. 119 * when we read the global state.
124 * 120 *
125 * Obviously reality has to ruin such a delightfully simple scheme: 121 * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
146 * busy state. 142 * busy state.
147 * 143 *
148 * This is solved by pushing the window forward, and thus skipping the 144 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which 145 * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
150 * was in effect at the time the window opened). This also solves the issue 146 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ 147 * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
152 * intervals. 148 * intervals.
153 * 149 *
154 * When making the ILB scale, we should try to pull this in as well. 150 * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
299} 295}
300 296
301/* 297/*
302 * NO_HZ can leave us missing all per-cpu ticks calling 298 * NO_HZ can leave us missing all per-CPU ticks calling
303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 299 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold 300 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 301 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
363 return; 359 return;
364 360
365 /* 361 /*
366 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. 362 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
367 */ 363 */
368 delta = calc_load_nohz_fold(); 364 delta = calc_load_nohz_fold();
369 if (delta) 365 if (delta)
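
The comment blocks above all revolve around the same fixed-point decay: every sampling interval the global load average is updated as load = load*exp + active*(1 - exp) in 11-bit fixed point. The worked example below runs that update in user space with the commonly documented EXP_* constants (treat the exact values as assumptions for the example).

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)   /* 1.0 in fixed point           */
#define EXP_1    1884              /* ~ FIXED_1 * exp(-5s/60s)     */
#define EXP_5    2014              /* ~ FIXED_1 * exp(-5s/300s)    */
#define EXP_15   2037              /* ~ FIXED_1 * exp(-5s/900s)    */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
                newload += FIXED_1 - 1;   /* round up while ramping */

        return newload / FIXED_1;
}

int main(void)
{
        unsigned long avg1 = 0, active = 2 * FIXED_1;  /* 2 runnable tasks */

        /* Twelve 5-second samples, i.e. one minute of constant load 2.0: */
        for (int i = 0; i < 12; i++)
                avg1 = calc_load(avg1, EXP_1, active);

        printf("1-min load after 60s: %lu.%02lu\n",
               avg1 >> FSHIFT, ((avg1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}

After a minute of constant load 2.0 the 1-minute average has only climbed to roughly 1.26, which is the lag the exponential weighting is meant to provide.
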
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,32 +13,25 @@
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 */ 15 */
16 16#include "sched.h"
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21#include <linux/atomic.h>
22
23#include "sched.h" /* for cpu_rq(). */
24 17
25/* 18/*
26 * Bitmask made from a "or" of all commands within enum membarrier_cmd, 19 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
27 * except MEMBARRIER_CMD_QUERY. 20 * except MEMBARRIER_CMD_QUERY.
28 */ 21 */
29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE 22#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ 23#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ 24 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) 25 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
33#else 26#else
34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 27#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
35#endif 28#endif
36 29
37#define MEMBARRIER_CMD_BITMASK \ 30#define MEMBARRIER_CMD_BITMASK \
38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ 31 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ 32 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 33 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ 34 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) 35 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
43 36
44static void ipi_mb(void *info) 37static void ipi_mb(void *info)
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
85 */ 78 */
86 if (cpu == raw_smp_processor_id()) 79 if (cpu == raw_smp_processor_id())
87 continue; 80 continue;
81
88 rcu_read_lock(); 82 rcu_read_lock();
89 p = task_rcu_dereference(&cpu_rq(cpu)->curr); 83 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
90 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & 84 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
188 * rq->curr modification in scheduler. 182 * rq->curr modification in scheduler.
189 */ 183 */
190 smp_mb(); /* exit from system call is not a mb */ 184 smp_mb(); /* exit from system call is not a mb */
185
191 return 0; 186 return 0;
192} 187}
193 188
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
219 } 214 }
220 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, 215 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
221 &mm->membarrier_state); 216 &mm->membarrier_state);
217
222 return 0; 218 return 0;
223} 219}
224 220
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
253 synchronize_sched(); 249 synchronize_sched();
254 } 250 }
255 atomic_or(state, &mm->membarrier_state); 251 atomic_or(state, &mm->membarrier_state);
252
256 return 0; 253 return 0;
257} 254}
258 255
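
The membarrier changes here are only cosmetic, but MEMBARRIER_CMD_BITMASK above is the set of commands sys_membarrier accepts. From user space, the private expedited flavour is driven roughly as in the hedged sketch below (per the membarrier(2) interface; error handling kept minimal).

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        /* A process must register before using the private expedited command: */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
                perror("register");
                return 1;
        }

        /* Orders memory on every CPU currently running this process's threads: */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
                perror("membarrier");
                return 1;
        }

        puts("private expedited barrier completed");
        return 0;
}
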
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..86b77987435e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies) 4 * policies)
5 */ 5 */
6
7#include "sched.h" 6#include "sched.h"
8 7
9#include <linux/slab.h>
10#include <linux/irq_work.h>
11
12int sched_rr_timeslice = RR_TIMESLICE; 8int sched_rr_timeslice = RR_TIMESLICE;
13int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
14 10
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359static void push_rt_tasks(struct rq *); 355static void push_rt_tasks(struct rq *);
360static void pull_rt_task(struct rq *); 356static void pull_rt_task(struct rq *);
361 357
362static inline void queue_push_tasks(struct rq *rq) 358static inline void rt_queue_push_tasks(struct rq *rq)
363{ 359{
364 if (!has_pushable_tasks(rq)) 360 if (!has_pushable_tasks(rq))
365 return; 361 return;
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
367 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 363 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
368} 364}
369 365
370static inline void queue_pull_task(struct rq *rq) 366static inline void rt_queue_pull_task(struct rq *rq)
371{ 367{
372 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 368 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
373} 369}
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
425{ 421{
426} 422}
427 423
428static inline void queue_push_tasks(struct rq *rq) 424static inline void rt_queue_push_tasks(struct rq *rq)
429{ 425{
430} 426}
431#endif /* CONFIG_SMP */ 427#endif /* CONFIG_SMP */
@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq)
961 if (unlikely((s64)delta_exec <= 0)) 957 if (unlikely((s64)delta_exec <= 0))
962 return; 958 return;
963 959
964 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
965 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
966
967 schedstat_set(curr->se.statistics.exec_max, 960 schedstat_set(curr->se.statistics.exec_max,
968 max(curr->se.statistics.exec_max, delta_exec)); 961 max(curr->se.statistics.exec_max, delta_exec));
969 962
@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
1005 998
1006 sub_nr_running(rq, rt_rq->rt_nr_running); 999 sub_nr_running(rq, rt_rq->rt_nr_running);
1007 rt_rq->rt_queued = 0; 1000 rt_rq->rt_queued = 0;
1001
1002 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1003 cpufreq_update_util(rq, 0);
1008} 1004}
1009 1005
1010static void 1006static void
@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
1021 1017
1022 add_nr_running(rq, rt_rq->rt_nr_running); 1018 add_nr_running(rq, rt_rq->rt_nr_running);
1023 rt_rq->rt_queued = 1; 1019 rt_rq->rt_queued = 1;
1020
1021 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1022 cpufreq_update_util(rq, 0);
1024} 1023}
1025 1024
1026#if defined CONFIG_SMP 1025#if defined CONFIG_SMP
@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1453 return; 1452 return;
1454 1453
1455 /* 1454 /*
1456 * There appears to be other cpus that can accept 1455 * There appear to be other CPUs that can accept
1457 * current and none to run 'p', so lets reschedule 1456 * the current task but none can run 'p', so let's reschedule
1458 * to try and push current away: 1457 * to try and push the current task away:
1459 */ 1458 */
1460 requeue_task_rt(rq, p, 1); 1459 requeue_task_rt(rq, p, 1);
1461 resched_curr(rq); 1460 resched_curr(rq);
@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1569 /* The running task is never eligible for pushing */ 1568 /* The running task is never eligible for pushing */
1570 dequeue_pushable_task(rq, p); 1569 dequeue_pushable_task(rq, p);
1571 1570
1572 queue_push_tasks(rq); 1571 rt_queue_push_tasks(rq);
1573 1572
1574 return p; 1573 return p;
1575} 1574}
@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1596 if (!task_running(rq, p) && 1595 if (!task_running(rq, p) &&
1597 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1596 cpumask_test_cpu(cpu, &p->cpus_allowed))
1598 return 1; 1597 return 1;
1598
1599 return 0; 1599 return 0;
1600} 1600}
1601 1601
1602/* 1602/*
1603 * Return the highest pushable rq's task, which is suitable to be executed 1603 * Return the highest pushable rq's task, which is suitable to be executed
1604 * on the cpu, NULL otherwise 1604 * on the CPU, NULL otherwise
1605 */ 1605 */
1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1607{ 1607{
@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task)
1639 return -1; /* No targets found */ 1639 return -1; /* No targets found */
1640 1640
1641 /* 1641 /*
1642 * At this point we have built a mask of cpus representing the 1642 * At this point we have built a mask of CPUs representing the
1643 * lowest priority tasks in the system. Now we want to elect 1643 * lowest priority tasks in the system. Now we want to elect
1644 * the best one based on our affinity and topology. 1644 * the best one based on our affinity and topology.
1645 * 1645 *
1646 * We prioritize the last cpu that the task executed on since 1646 * We prioritize the last CPU that the task executed on since
1647 * it is most likely cache-hot in that location. 1647 * it is most likely cache-hot in that location.
1648 */ 1648 */
1649 if (cpumask_test_cpu(cpu, lowest_mask)) 1649 if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task)
1651 1651
1652 /* 1652 /*
1653 * Otherwise, we consult the sched_domains span maps to figure 1653 * Otherwise, we consult the sched_domains span maps to figure
1654 * out which cpu is logically closest to our hot cache data. 1654 * out which CPU is logically closest to our hot cache data.
1655 */ 1655 */
1656 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1656 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task)
1692 cpu = cpumask_any(lowest_mask); 1692 cpu = cpumask_any(lowest_mask);
1693 if (cpu < nr_cpu_ids) 1693 if (cpu < nr_cpu_ids)
1694 return cpu; 1694 return cpu;
1695
1695 return -1; 1696 return -1;
1696} 1697}
1697 1698
@@ -1827,7 +1828,7 @@ retry:
1827 * The task hasn't migrated, and is still the next 1828 * The task hasn't migrated, and is still the next
1828 * eligible task, but we failed to find a run-queue 1829 * eligible task, but we failed to find a run-queue
1829 * to push it to. Do not retry in this case, since 1830 * to push it to. Do not retry in this case, since
1830 * other cpus will pull from us when ready. 1831 * other CPUs will pull from us when ready.
1831 */ 1832 */
1832 goto out; 1833 goto out;
1833 } 1834 }
@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd)
1919 * rt_next_cpu() will simply return the first CPU found in 1920 * rt_next_cpu() will simply return the first CPU found in
1920 * the rto_mask. 1921 * the rto_mask.
1921 * 1922 *
1922 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it 1923 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1923 * will return the next CPU found in the rto_mask. 1924 * will return the next CPU found in the rto_mask.
1924 * 1925 *
1925 * If there are no more CPUs left in the rto_mask, then a check is made 1926 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq)
1980 raw_spin_lock(&rq->rd->rto_lock); 1981 raw_spin_lock(&rq->rd->rto_lock);
1981 1982
1982 /* 1983 /*
1983 * The rto_cpu is updated under the lock, if it has a valid cpu 1984 * The rto_cpu is updated under the lock, if it has a valid CPU
1984 * then the IPI is still running and will continue due to the 1985 * then the IPI is still running and will continue due to the
1985 * update to loop_next, and nothing needs to be done here. 1986 * update to loop_next, and nothing needs to be done here.
1986 * Otherwise it is finishing up and an ipi needs to be sent. 1987 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq)
2105 2106
2106 /* 2107 /*
2107 * There's a chance that p is higher in priority 2108 * There's a chance that p is higher in priority
2108 * than what's currently running on its cpu. 2109 * than what's currently running on its CPU.
2109 * This is just that p is waking up and hasn't 2110 * This is just that p is waking up and hasn't
2110 * had a chance to schedule. We only pull 2111 * had a chance to schedule. We only pull
2111 * p if it is lower in priority than the 2112 * p if it is lower in priority than the
@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
2187 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2188 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2188 return; 2189 return;
2189 2190
2190 queue_pull_task(rq); 2191 rt_queue_pull_task(rq);
2191} 2192}
2192 2193
2193void __init init_sched_rt_class(void) 2194void __init init_sched_rt_class(void)
@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
2218 if (task_on_rq_queued(p) && rq->curr != p) { 2219 if (task_on_rq_queued(p) && rq->curr != p) {
2219#ifdef CONFIG_SMP 2220#ifdef CONFIG_SMP
2220 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2221 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2221 queue_push_tasks(rq); 2222 rt_queue_push_tasks(rq);
2222#endif /* CONFIG_SMP */ 2223#endif /* CONFIG_SMP */
2223 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2224 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2224 resched_curr(rq); 2225 resched_curr(rq);
@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2242 * may need to pull tasks to this runqueue. 2243 * may need to pull tasks to this runqueue.
2243 */ 2244 */
2244 if (oldprio < p->prio) 2245 if (oldprio < p->prio)
2245 queue_pull_task(rq); 2246 rt_queue_pull_task(rq);
2246 2247
2247 /* 2248 /*
2248 * If there's a higher priority task waiting to run 2249 * If there's a higher priority task waiting to run
@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
2292static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2293static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2293#endif 2294#endif
2294 2295
2296/*
2297 * scheduler tick hitting a task of our scheduling class.
2298 *
2299 * NOTE: This function can be called remotely by the tick offload that
2300 * goes along full dynticks. Therefore no local assumption can be made
2301 * and everything must be accessed through the @rq and @curr passed in
2302 * parameters.
2303 */
2295static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2304static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2296{ 2305{
2297 struct sched_rt_entity *rt_se = &p->rt; 2306 struct sched_rt_entity *rt_se = &p->rt;
@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
2685 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2694 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2686 } 2695 }
2687 mutex_unlock(&mutex); 2696 mutex_unlock(&mutex);
2697
2688 return ret; 2698 return ret;
2689} 2699}
2690 2700
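
The sched_rr_handler hunk keeps the sysctl in milliseconds while sched_rr_timeslice itself stays in jiffies, so the handler is essentially a ms-to-jiffies round-trip. A small sketch of that conversion follows; the HZ and RR_TIMESLICE values are assumptions for the example (RR_TIMESLICE defaults to 100ms worth of jiffies).

#include <stdio.h>

#define HZ            250
#define MSEC_PER_SEC  1000
#define RR_TIMESLICE  (100 * HZ / MSEC_PER_SEC)   /* jiffies */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
        return ms * HZ / MSEC_PER_SEC;            /* simplified; no rounding */
}

int main(void)
{
        int sysctl_ms = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;      /* what the sysctl shows   */
        unsigned long timeslice = msecs_to_jiffies(sysctl_ms);   /* what the scheduler uses */

        printf("RR timeslice: %d ms = %lu jiffies (HZ=%d)\n",
               sysctl_ms, timeslice, HZ);
        return 0;
}
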
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..c3deaee7a7a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,39 +1,73 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2 2/*
3 * Scheduler internal types and methods:
4 */
3#include <linux/sched.h> 5#include <linux/sched.h>
6
4#include <linux/sched/autogroup.h> 7#include <linux/sched/autogroup.h>
5#include <linux/sched/sysctl.h>
6#include <linux/sched/topology.h>
7#include <linux/sched/rt.h>
8#include <linux/sched/deadline.h>
9#include <linux/sched/clock.h> 8#include <linux/sched/clock.h>
10#include <linux/sched/wake_q.h> 9#include <linux/sched/coredump.h>
11#include <linux/sched/signal.h>
12#include <linux/sched/numa_balancing.h>
13#include <linux/sched/mm.h>
14#include <linux/sched/cpufreq.h> 10#include <linux/sched/cpufreq.h>
15#include <linux/sched/stat.h> 11#include <linux/sched/cputime.h>
16#include <linux/sched/nohz.h> 12#include <linux/sched/deadline.h>
17#include <linux/sched/debug.h> 13#include <linux/sched/debug.h>
18#include <linux/sched/hotplug.h> 14#include <linux/sched/hotplug.h>
15#include <linux/sched/idle.h>
16#include <linux/sched/init.h>
17#include <linux/sched/isolation.h>
18#include <linux/sched/jobctl.h>
19#include <linux/sched/loadavg.h>
20#include <linux/sched/mm.h>
21#include <linux/sched/nohz.h>
22#include <linux/sched/numa_balancing.h>
23#include <linux/sched/prio.h>
24#include <linux/sched/rt.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/stat.h>
27#include <linux/sched/sysctl.h>
19#include <linux/sched/task.h> 28#include <linux/sched/task.h>
20#include <linux/sched/task_stack.h> 29#include <linux/sched/task_stack.h>
21#include <linux/sched/cputime.h> 30#include <linux/sched/topology.h>
22#include <linux/sched/init.h> 31#include <linux/sched/user.h>
32#include <linux/sched/wake_q.h>
33#include <linux/sched/xacct.h>
34
35#include <uapi/linux/sched/types.h>
23 36
24#include <linux/u64_stats_sync.h>
25#include <linux/kernel_stat.h>
26#include <linux/binfmts.h> 37#include <linux/binfmts.h>
27#include <linux/mutex.h> 38#include <linux/blkdev.h>
28#include <linux/spinlock.h> 39#include <linux/compat.h>
40#include <linux/context_tracking.h>
41#include <linux/cpufreq.h>
42#include <linux/cpuidle.h>
43#include <linux/cpuset.h>
44#include <linux/ctype.h>
45#include <linux/debugfs.h>
46#include <linux/delayacct.h>
47#include <linux/init_task.h>
48#include <linux/kprobes.h>
49#include <linux/kthread.h>
50#include <linux/membarrier.h>
51#include <linux/migrate.h>
52#include <linux/mmu_context.h>
53#include <linux/nmi.h>
54#include <linux/proc_fs.h>
55#include <linux/prefetch.h>
56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h>
58#include <linux/security.h>
59#include <linux/stackprotector.h>
29#include <linux/stop_machine.h> 60#include <linux/stop_machine.h>
30#include <linux/irq_work.h> 61#include <linux/suspend.h>
31#include <linux/tick.h> 62#include <linux/swait.h>
32#include <linux/slab.h> 63#include <linux/syscalls.h>
33#include <linux/cgroup.h> 64#include <linux/task_work.h>
65#include <linux/tsacct_kern.h>
66
67#include <asm/tlb.h>
34 68
35#ifdef CONFIG_PARAVIRT 69#ifdef CONFIG_PARAVIRT
36#include <asm/paravirt.h> 70# include <asm/paravirt.h>
37#endif 71#endif
38 72
39#include "cpupri.h" 73#include "cpupri.h"
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
79 * and does not change the user-interface for setting shares/weights. 113 * and does not change the user-interface for setting shares/weights.
80 * 114 *
81 * We increase resolution only if we have enough bits to allow this increased 115 * We increase resolution only if we have enough bits to allow this increased
82 * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are 116 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
83 * pretty high and the returns do not justify the increased costs. 117 * are pretty high and the returns do not justify the increased costs.
84 * 118 *
85 * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to 119 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
86 * increase coverage and consistency always enable it on 64bit platforms. 120 * increase coverage and consistency always enable it on 64-bit platforms.
87 */ 121 */
88#ifdef CONFIG_64BIT 122#ifdef CONFIG_64BIT
89# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 123# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
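As a concrete illustration of the resolution trade-off discussed in the comment above, here is a self-contained sketch (not kernel code) mirroring the scale_load()/scale_load_down() helpers on the 64-bit side: the user-visible nice-0 weight stays 1024, only the internal fixed-point representation gains 10 extra fractional bits.

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10
/* 64-bit case: one extra shift of internal resolution */
#define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define scale_load(w)		((long)(w) << SCHED_FIXEDPOINT_SHIFT)
#define scale_load_down(w)	((long)(w) >> SCHED_FIXEDPOINT_SHIFT)
#define NICE_0_LOAD		(1L << NICE_0_LOAD_SHIFT)

int main(void)
{
	/* wider internal weights reduce rounding error when group shares
	 * are divided across several cgroup levels */
	printf("internal NICE_0_LOAD = %ld, user view = %ld\n",
	       NICE_0_LOAD, scale_load_down(NICE_0_LOAD));
	return 0;
}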
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
111 * 10 -> just above 1us 145 * 10 -> just above 1us
112 * 9 -> just above 0.5us 146 * 9 -> just above 0.5us
113 */ 147 */
114#define DL_SCALE (10) 148#define DL_SCALE 10
115 149
116/* 150/*
117 * These are the 'tuning knobs' of the scheduler: 151 * Single value that denotes runtime == period, ie unlimited time.
118 */ 152 */
119 153#define RUNTIME_INF ((u64)~0ULL)
120/*
121 * single value that denotes runtime == period, ie unlimited time.
122 */
123#define RUNTIME_INF ((u64)~0ULL)
124 154
125static inline int idle_policy(int policy) 155static inline int idle_policy(int policy)
126{ 156{
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
235 * control. 265 * control.
236 */ 266 */
237struct dl_bandwidth { 267struct dl_bandwidth {
238 raw_spinlock_t dl_runtime_lock; 268 raw_spinlock_t dl_runtime_lock;
239 u64 dl_runtime; 269 u64 dl_runtime;
240 u64 dl_period; 270 u64 dl_period;
241}; 271};
242 272
243static inline int dl_bandwidth_enabled(void) 273static inline int dl_bandwidth_enabled(void)
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
246} 276}
247 277
248struct dl_bw { 278struct dl_bw {
249 raw_spinlock_t lock; 279 raw_spinlock_t lock;
250 u64 bw, total_bw; 280 u64 bw;
281 u64 total_bw;
251}; 282};
252 283
253static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 284static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
273 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 304 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
274} 305}
275 306
276void dl_change_utilization(struct task_struct *p, u64 new_bw); 307extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
277extern void init_dl_bw(struct dl_bw *dl_b); 308extern void init_dl_bw(struct dl_bw *dl_b);
278extern int sched_dl_global_validate(void); 309extern int sched_dl_global_validate(void);
279extern void sched_dl_do_global(void); 310extern void sched_dl_do_global(void);
280extern int sched_dl_overflow(struct task_struct *p, int policy, 311extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
281 const struct sched_attr *attr);
282extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 312extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
283extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 313extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
284extern bool __checkparam_dl(const struct sched_attr *attr); 314extern bool __checkparam_dl(const struct sched_attr *attr);
285extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 315extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
286extern int dl_task_can_attach(struct task_struct *p, 316extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
287 const struct cpumask *cs_cpus_allowed); 317extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
288extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
289 const struct cpumask *trial);
290extern bool dl_cpu_busy(unsigned int cpu); 318extern bool dl_cpu_busy(unsigned int cpu);
291 319
292#ifdef CONFIG_CGROUP_SCHED 320#ifdef CONFIG_CGROUP_SCHED
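The expression in the __dl_overflow() hunk above is the SCHED_DEADLINE admission test: the per-CPU bandwidth cap times the number of CPUs must cover the already-admitted total, minus any bandwidth being handed back, plus the new request. A hedged, self-contained sketch with made-up numbers, using the same Q20 fixed point as the BW_SHIFT definition later in this header:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT	20
#define BW_UNIT		(1 << BW_SHIFT)		/* 1.0 == one full CPU */

struct dl_bw_example {
	int64_t  bw;		/* per-CPU limit, -1 means unlimited */
	uint64_t total_bw;	/* bandwidth already admitted */
};

static bool dl_overflow_example(const struct dl_bw_example *b, int cpus,
				uint64_t old_bw, uint64_t new_bw)
{
	return b->bw != -1 &&
	       (uint64_t)b->bw * cpus < b->total_bw - old_bw + new_bw;
}

int main(void)
{
	/* 95% per CPU on a 2-CPU root domain, 1.5 CPUs already admitted */
	struct dl_bw_example b = {
		.bw	  = (int64_t)(0.95 * BW_UNIT),
		.total_bw = (uint64_t)(1.5 * BW_UNIT),
	};

	/* 1.5 + 0.5 = 2.0 needed > 1.9 available: rejected */
	printf("0.50 CPU: %s\n", dl_overflow_example(&b, 2, 0, BW_UNIT / 2) ? "reject" : "admit");
	/* 1.5 + 0.25 = 1.75 needed <= 1.9 available: admitted */
	printf("0.25 CPU: %s\n", dl_overflow_example(&b, 2, 0, BW_UNIT / 4) ? "reject" : "admit");
	return 0;
}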
@@ -300,32 +328,36 @@ extern struct list_head task_groups;
300 328
301struct cfs_bandwidth { 329struct cfs_bandwidth {
302#ifdef CONFIG_CFS_BANDWIDTH 330#ifdef CONFIG_CFS_BANDWIDTH
303 raw_spinlock_t lock; 331 raw_spinlock_t lock;
304 ktime_t period; 332 ktime_t period;
305 u64 quota, runtime; 333 u64 quota;
306 s64 hierarchical_quota; 334 u64 runtime;
307 u64 runtime_expires; 335 s64 hierarchical_quota;
308 336 u64 runtime_expires;
309 int idle, period_active; 337
310 struct hrtimer period_timer, slack_timer; 338 int idle;
311 struct list_head throttled_cfs_rq; 339 int period_active;
312 340 struct hrtimer period_timer;
313 /* statistics */ 341 struct hrtimer slack_timer;
314 int nr_periods, nr_throttled; 342 struct list_head throttled_cfs_rq;
315 u64 throttled_time; 343
344 /* Statistics: */
345 int nr_periods;
346 int nr_throttled;
347 u64 throttled_time;
316#endif 348#endif
317}; 349};
318 350
319/* task group related information */ 351/* Task group related information */
320struct task_group { 352struct task_group {
321 struct cgroup_subsys_state css; 353 struct cgroup_subsys_state css;
322 354
323#ifdef CONFIG_FAIR_GROUP_SCHED 355#ifdef CONFIG_FAIR_GROUP_SCHED
324 /* schedulable entities of this group on each cpu */ 356 /* schedulable entities of this group on each CPU */
325 struct sched_entity **se; 357 struct sched_entity **se;
326 /* runqueue "owned" by this group on each cpu */ 358 /* runqueue "owned" by this group on each CPU */
327 struct cfs_rq **cfs_rq; 359 struct cfs_rq **cfs_rq;
328 unsigned long shares; 360 unsigned long shares;
329 361
330#ifdef CONFIG_SMP 362#ifdef CONFIG_SMP
331 /* 363 /*
@@ -333,29 +365,29 @@ struct task_group {
333 * it in its own cacheline separated from the fields above which 365 * it in its own cacheline separated from the fields above which
334 * will also be accessed at each tick. 366 * will also be accessed at each tick.
335 */ 367 */
336 atomic_long_t load_avg ____cacheline_aligned; 368 atomic_long_t load_avg ____cacheline_aligned;
337#endif 369#endif
338#endif 370#endif
339 371
340#ifdef CONFIG_RT_GROUP_SCHED 372#ifdef CONFIG_RT_GROUP_SCHED
341 struct sched_rt_entity **rt_se; 373 struct sched_rt_entity **rt_se;
342 struct rt_rq **rt_rq; 374 struct rt_rq **rt_rq;
343 375
344 struct rt_bandwidth rt_bandwidth; 376 struct rt_bandwidth rt_bandwidth;
345#endif 377#endif
346 378
347 struct rcu_head rcu; 379 struct rcu_head rcu;
348 struct list_head list; 380 struct list_head list;
349 381
350 struct task_group *parent; 382 struct task_group *parent;
351 struct list_head siblings; 383 struct list_head siblings;
352 struct list_head children; 384 struct list_head children;
353 385
354#ifdef CONFIG_SCHED_AUTOGROUP 386#ifdef CONFIG_SCHED_AUTOGROUP
355 struct autogroup *autogroup; 387 struct autogroup *autogroup;
356#endif 388#endif
357 389
358 struct cfs_bandwidth cfs_bandwidth; 390 struct cfs_bandwidth cfs_bandwidth;
359}; 391};
360 392
361#ifdef CONFIG_FAIR_GROUP_SCHED 393#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -369,8 +401,8 @@ struct task_group {
369 * (The default weight is 1024 - so there's no practical 401 * (The default weight is 1024 - so there's no practical
370 * limitation from this.) 402 * limitation from this.)
371 */ 403 */
372#define MIN_SHARES (1UL << 1) 404#define MIN_SHARES (1UL << 1)
373#define MAX_SHARES (1UL << 18) 405#define MAX_SHARES (1UL << 18)
374#endif 406#endif
375 407
376typedef int (*tg_visitor)(struct task_group *, void *); 408typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
443 475
444/* CFS-related fields in a runqueue */ 476/* CFS-related fields in a runqueue */
445struct cfs_rq { 477struct cfs_rq {
446 struct load_weight load; 478 struct load_weight load;
447 unsigned long runnable_weight; 479 unsigned long runnable_weight;
448 unsigned int nr_running, h_nr_running; 480 unsigned int nr_running;
481 unsigned int h_nr_running;
449 482
450 u64 exec_clock; 483 u64 exec_clock;
451 u64 min_vruntime; 484 u64 min_vruntime;
452#ifndef CONFIG_64BIT 485#ifndef CONFIG_64BIT
453 u64 min_vruntime_copy; 486 u64 min_vruntime_copy;
454#endif 487#endif
455 488
456 struct rb_root_cached tasks_timeline; 489 struct rb_root_cached tasks_timeline;
457 490
458 /* 491 /*
459 * 'curr' points to currently running entity on this cfs_rq. 492 * 'curr' points to currently running entity on this cfs_rq.
460 * It is set to NULL otherwise (i.e when none are currently running). 493 * It is set to NULL otherwise (i.e when none are currently running).
461 */ 494 */
462 struct sched_entity *curr, *next, *last, *skip; 495 struct sched_entity *curr;
496 struct sched_entity *next;
497 struct sched_entity *last;
498 struct sched_entity *skip;
463 499
464#ifdef CONFIG_SCHED_DEBUG 500#ifdef CONFIG_SCHED_DEBUG
465 unsigned int nr_spread_over; 501 unsigned int nr_spread_over;
466#endif 502#endif
467 503
468#ifdef CONFIG_SMP 504#ifdef CONFIG_SMP
469 /* 505 /*
470 * CFS load tracking 506 * CFS load tracking
471 */ 507 */
472 struct sched_avg avg; 508 struct sched_avg avg;
473#ifndef CONFIG_64BIT 509#ifndef CONFIG_64BIT
474 u64 load_last_update_time_copy; 510 u64 load_last_update_time_copy;
475#endif 511#endif
476 struct { 512 struct {
477 raw_spinlock_t lock ____cacheline_aligned; 513 raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +518,9 @@ struct cfs_rq {
482 } removed; 518 } removed;
483 519
484#ifdef CONFIG_FAIR_GROUP_SCHED 520#ifdef CONFIG_FAIR_GROUP_SCHED
485 unsigned long tg_load_avg_contrib; 521 unsigned long tg_load_avg_contrib;
486 long propagate; 522 long propagate;
487 long prop_runnable_sum; 523 long prop_runnable_sum;
488 524
489 /* 525 /*
490 * h_load = weight * f(tg) 526 * h_load = weight * f(tg)
@@ -492,36 +528,38 @@ struct cfs_rq {
492 * Where f(tg) is the recursive weight fraction assigned to 528 * Where f(tg) is the recursive weight fraction assigned to
493 * this group. 529 * this group.
494 */ 530 */
495 unsigned long h_load; 531 unsigned long h_load;
496 u64 last_h_load_update; 532 u64 last_h_load_update;
497 struct sched_entity *h_load_next; 533 struct sched_entity *h_load_next;
498#endif /* CONFIG_FAIR_GROUP_SCHED */ 534#endif /* CONFIG_FAIR_GROUP_SCHED */
499#endif /* CONFIG_SMP */ 535#endif /* CONFIG_SMP */
500 536
501#ifdef CONFIG_FAIR_GROUP_SCHED 537#ifdef CONFIG_FAIR_GROUP_SCHED
502 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 538 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
503 539
504 /* 540 /*
505 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 541 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
506 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 542 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
507 * (like users, containers etc.) 543 * (like users, containers etc.)
508 * 544 *
509 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 545 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
510 * list is used during load balance. 546 * This list is used during load balance.
511 */ 547 */
512 int on_list; 548 int on_list;
513 struct list_head leaf_cfs_rq_list; 549 struct list_head leaf_cfs_rq_list;
514 struct task_group *tg; /* group that "owns" this runqueue */ 550 struct task_group *tg; /* group that "owns" this runqueue */
515 551
516#ifdef CONFIG_CFS_BANDWIDTH 552#ifdef CONFIG_CFS_BANDWIDTH
517 int runtime_enabled; 553 int runtime_enabled;
518 u64 runtime_expires; 554 u64 runtime_expires;
519 s64 runtime_remaining; 555 s64 runtime_remaining;
520 556
521 u64 throttled_clock, throttled_clock_task; 557 u64 throttled_clock;
522 u64 throttled_clock_task_time; 558 u64 throttled_clock_task;
523 int throttled, throttle_count; 559 u64 throttled_clock_task_time;
524 struct list_head throttled_list; 560 int throttled;
561 int throttle_count;
562 struct list_head throttled_list;
525#endif /* CONFIG_CFS_BANDWIDTH */ 563#endif /* CONFIG_CFS_BANDWIDTH */
526#endif /* CONFIG_FAIR_GROUP_SCHED */ 564#endif /* CONFIG_FAIR_GROUP_SCHED */
527}; 565};
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
538 576
539/* Real-Time classes' related field in a runqueue: */ 577/* Real-Time classes' related field in a runqueue: */
540struct rt_rq { 578struct rt_rq {
541 struct rt_prio_array active; 579 struct rt_prio_array active;
542 unsigned int rt_nr_running; 580 unsigned int rt_nr_running;
543 unsigned int rr_nr_running; 581 unsigned int rr_nr_running;
544#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 582#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
545 struct { 583 struct {
546 int curr; /* highest queued rt task prio */ 584 int curr; /* highest queued rt task prio */
547#ifdef CONFIG_SMP 585#ifdef CONFIG_SMP
548 int next; /* next highest */ 586 int next; /* next highest */
549#endif 587#endif
550 } highest_prio; 588 } highest_prio;
551#endif 589#endif
552#ifdef CONFIG_SMP 590#ifdef CONFIG_SMP
553 unsigned long rt_nr_migratory; 591 unsigned long rt_nr_migratory;
554 unsigned long rt_nr_total; 592 unsigned long rt_nr_total;
555 int overloaded; 593 int overloaded;
556 struct plist_head pushable_tasks; 594 struct plist_head pushable_tasks;
557#endif /* CONFIG_SMP */ 595#endif /* CONFIG_SMP */
558 int rt_queued; 596 int rt_queued;
559 597
560 int rt_throttled; 598 int rt_throttled;
561 u64 rt_time; 599 u64 rt_time;
562 u64 rt_runtime; 600 u64 rt_runtime;
563 /* Nests inside the rq lock: */ 601 /* Nests inside the rq lock: */
564 raw_spinlock_t rt_runtime_lock; 602 raw_spinlock_t rt_runtime_lock;
565 603
566#ifdef CONFIG_RT_GROUP_SCHED 604#ifdef CONFIG_RT_GROUP_SCHED
567 unsigned long rt_nr_boosted; 605 unsigned long rt_nr_boosted;
568 606
569 struct rq *rq; 607 struct rq *rq;
570 struct task_group *tg; 608 struct task_group *tg;
571#endif 609#endif
572}; 610};
573 611
574/* Deadline class' related fields in a runqueue */ 612/* Deadline class' related fields in a runqueue */
575struct dl_rq { 613struct dl_rq {
576 /* runqueue is an rbtree, ordered by deadline */ 614 /* runqueue is an rbtree, ordered by deadline */
577 struct rb_root_cached root; 615 struct rb_root_cached root;
578 616
579 unsigned long dl_nr_running; 617 unsigned long dl_nr_running;
580 618
581#ifdef CONFIG_SMP 619#ifdef CONFIG_SMP
582 /* 620 /*
@@ -586,28 +624,28 @@ struct dl_rq {
586 * should migrate somewhere else. 624 * should migrate somewhere else.
587 */ 625 */
588 struct { 626 struct {
589 u64 curr; 627 u64 curr;
590 u64 next; 628 u64 next;
591 } earliest_dl; 629 } earliest_dl;
592 630
593 unsigned long dl_nr_migratory; 631 unsigned long dl_nr_migratory;
594 int overloaded; 632 int overloaded;
595 633
596 /* 634 /*
597 * Tasks on this rq that can be pushed away. They are kept in 635 * Tasks on this rq that can be pushed away. They are kept in
598 * an rb-tree, ordered by tasks' deadlines, with caching 636 * an rb-tree, ordered by tasks' deadlines, with caching
599 * of the leftmost (earliest deadline) element. 637 * of the leftmost (earliest deadline) element.
600 */ 638 */
601 struct rb_root_cached pushable_dl_tasks_root; 639 struct rb_root_cached pushable_dl_tasks_root;
602#else 640#else
603 struct dl_bw dl_bw; 641 struct dl_bw dl_bw;
604#endif 642#endif
605 /* 643 /*
606 * "Active utilization" for this runqueue: increased when a 644 * "Active utilization" for this runqueue: increased when a
607 * task wakes up (becomes TASK_RUNNING) and decreased when a 645 * task wakes up (becomes TASK_RUNNING) and decreased when a
608 * task blocks 646 * task blocks
609 */ 647 */
610 u64 running_bw; 648 u64 running_bw;
611 649
612 /* 650 /*
613 * Utilization of the tasks "assigned" to this runqueue (including 651 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +656,14 @@ struct dl_rq {
618 * This is needed to compute the "inactive utilization" for the 656 * This is needed to compute the "inactive utilization" for the
619 * runqueue (inactive utilization = this_bw - running_bw). 657 * runqueue (inactive utilization = this_bw - running_bw).
620 */ 658 */
621 u64 this_bw; 659 u64 this_bw;
622 u64 extra_bw; 660 u64 extra_bw;
623 661
624 /* 662 /*
625 * Inverse of the fraction of CPU utilization that can be reclaimed 663 * Inverse of the fraction of CPU utilization that can be reclaimed
626 * by the GRUB algorithm. 664 * by the GRUB algorithm.
627 */ 665 */
628 u64 bw_ratio; 666 u64 bw_ratio;
629}; 667};
630 668
631#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
638/* 676/*
639 * We add the notion of a root-domain which will be used to define per-domain 677 * We add the notion of a root-domain which will be used to define per-domain
640 * variables. Each exclusive cpuset essentially defines an island domain by 678 * variables. Each exclusive cpuset essentially defines an island domain by
641 * fully partitioning the member cpus from any other cpuset. Whenever a new 679 * fully partitioning the member CPUs from any other cpuset. Whenever a new
642 * exclusive cpuset is created, we also create and attach a new root-domain 680 * exclusive cpuset is created, we also create and attach a new root-domain
643 * object. 681 * object.
644 * 682 *
645 */ 683 */
646struct root_domain { 684struct root_domain {
647 atomic_t refcount; 685 atomic_t refcount;
648 atomic_t rto_count; 686 atomic_t rto_count;
649 struct rcu_head rcu; 687 struct rcu_head rcu;
650 cpumask_var_t span; 688 cpumask_var_t span;
651 cpumask_var_t online; 689 cpumask_var_t online;
652 690
653 /* Indicate more than one runnable task for any CPU */ 691 /* Indicate more than one runnable task for any CPU */
654 bool overload; 692 bool overload;
655 693
656 /* 694 /*
657 * The bit corresponding to a CPU gets set here if such CPU has more 695 * The bit corresponding to a CPU gets set here if such CPU has more
658 * than one runnable -deadline task (as it is below for RT tasks). 696 * than one runnable -deadline task (as it is below for RT tasks).
659 */ 697 */
660 cpumask_var_t dlo_mask; 698 cpumask_var_t dlo_mask;
661 atomic_t dlo_count; 699 atomic_t dlo_count;
662 struct dl_bw dl_bw; 700 struct dl_bw dl_bw;
663 struct cpudl cpudl; 701 struct cpudl cpudl;
664 702
665#ifdef HAVE_RT_PUSH_IPI 703#ifdef HAVE_RT_PUSH_IPI
666 /* 704 /*
667 * For IPI pull requests, loop across the rto_mask. 705 * For IPI pull requests, loop across the rto_mask.
668 */ 706 */
669 struct irq_work rto_push_work; 707 struct irq_work rto_push_work;
670 raw_spinlock_t rto_lock; 708 raw_spinlock_t rto_lock;
671 /* These are only updated and read within rto_lock */ 709 /* These are only updated and read within rto_lock */
672 int rto_loop; 710 int rto_loop;
673 int rto_cpu; 711 int rto_cpu;
674 /* These atomics are updated outside of a lock */ 712 /* These atomics are updated outside of a lock */
675 atomic_t rto_loop_next; 713 atomic_t rto_loop_next;
676 atomic_t rto_loop_start; 714 atomic_t rto_loop_start;
677#endif 715#endif
678 /* 716 /*
679 * The "RT overload" flag: it gets set if a CPU has more than 717 * The "RT overload" flag: it gets set if a CPU has more than
680 * one runnable RT task. 718 * one runnable RT task.
681 */ 719 */
682 cpumask_var_t rto_mask; 720 cpumask_var_t rto_mask;
683 struct cpupri cpupri; 721 struct cpupri cpupri;
684 722
685 unsigned long max_cpu_capacity; 723 unsigned long max_cpu_capacity;
686}; 724};
687 725
688extern struct root_domain def_root_domain; 726extern struct root_domain def_root_domain;
@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work);
708 */ 746 */
709struct rq { 747struct rq {
710 /* runqueue lock: */ 748 /* runqueue lock: */
711 raw_spinlock_t lock; 749 raw_spinlock_t lock;
712 750
713 /* 751 /*
714 * nr_running and cpu_load should be in the same cacheline because 752 * nr_running and cpu_load should be in the same cacheline because
715 * remote CPUs use both these fields when doing load calculation. 753 * remote CPUs use both these fields when doing load calculation.
716 */ 754 */
717 unsigned int nr_running; 755 unsigned int nr_running;
718#ifdef CONFIG_NUMA_BALANCING 756#ifdef CONFIG_NUMA_BALANCING
719 unsigned int nr_numa_running; 757 unsigned int nr_numa_running;
720 unsigned int nr_preferred_running; 758 unsigned int nr_preferred_running;
721#endif 759#endif
722 #define CPU_LOAD_IDX_MAX 5 760 #define CPU_LOAD_IDX_MAX 5
723 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 761 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
724#ifdef CONFIG_NO_HZ_COMMON 762#ifdef CONFIG_NO_HZ_COMMON
725#ifdef CONFIG_SMP 763#ifdef CONFIG_SMP
726 unsigned long last_load_update_tick; 764 unsigned long last_load_update_tick;
765 unsigned long last_blocked_load_update_tick;
766 unsigned int has_blocked_load;
727#endif /* CONFIG_SMP */ 767#endif /* CONFIG_SMP */
728 unsigned long nohz_flags; 768 unsigned int nohz_tick_stopped;
769 atomic_t nohz_flags;
729#endif /* CONFIG_NO_HZ_COMMON */ 770#endif /* CONFIG_NO_HZ_COMMON */
730#ifdef CONFIG_NO_HZ_FULL
731 unsigned long last_sched_tick;
732#endif
733 /* capture load from *all* tasks on this cpu: */
734 struct load_weight load;
735 unsigned long nr_load_updates;
736 u64 nr_switches;
737 771
738 struct cfs_rq cfs; 772 /* capture load from *all* tasks on this CPU: */
739 struct rt_rq rt; 773 struct load_weight load;
740 struct dl_rq dl; 774 unsigned long nr_load_updates;
775 u64 nr_switches;
776
777 struct cfs_rq cfs;
778 struct rt_rq rt;
779 struct dl_rq dl;
741 780
742#ifdef CONFIG_FAIR_GROUP_SCHED 781#ifdef CONFIG_FAIR_GROUP_SCHED
743 /* list of leaf cfs_rq on this cpu: */ 782 /* list of leaf cfs_rq on this CPU: */
744 struct list_head leaf_cfs_rq_list; 783 struct list_head leaf_cfs_rq_list;
745 struct list_head *tmp_alone_branch; 784 struct list_head *tmp_alone_branch;
746#endif /* CONFIG_FAIR_GROUP_SCHED */ 785#endif /* CONFIG_FAIR_GROUP_SCHED */
747 786
748 /* 787 /*
@@ -751,94 +790,98 @@ struct rq {
751 * one CPU and if it got migrated afterwards it may decrease 790 * one CPU and if it got migrated afterwards it may decrease
752 * it on another CPU. Always updated under the runqueue lock: 791 * it on another CPU. Always updated under the runqueue lock:
753 */ 792 */
754 unsigned long nr_uninterruptible; 793 unsigned long nr_uninterruptible;
755 794
756 struct task_struct *curr, *idle, *stop; 795 struct task_struct *curr;
757 unsigned long next_balance; 796 struct task_struct *idle;
758 struct mm_struct *prev_mm; 797 struct task_struct *stop;
798 unsigned long next_balance;
799 struct mm_struct *prev_mm;
759 800
760 unsigned int clock_update_flags; 801 unsigned int clock_update_flags;
761 u64 clock; 802 u64 clock;
762 u64 clock_task; 803 u64 clock_task;
763 804
764 atomic_t nr_iowait; 805 atomic_t nr_iowait;
765 806
766#ifdef CONFIG_SMP 807#ifdef CONFIG_SMP
767 struct root_domain *rd; 808 struct root_domain *rd;
768 struct sched_domain *sd; 809 struct sched_domain *sd;
769 810
770 unsigned long cpu_capacity; 811 unsigned long cpu_capacity;
771 unsigned long cpu_capacity_orig; 812 unsigned long cpu_capacity_orig;
772 813
773 struct callback_head *balance_callback; 814 struct callback_head *balance_callback;
815
816 unsigned char idle_balance;
774 817
775 unsigned char idle_balance;
776 /* For active balancing */ 818 /* For active balancing */
777 int active_balance; 819 int active_balance;
778 int push_cpu; 820 int push_cpu;
779 struct cpu_stop_work active_balance_work; 821 struct cpu_stop_work active_balance_work;
780 /* cpu of this runqueue: */ 822
781 int cpu; 823 /* CPU of this runqueue: */
782 int online; 824 int cpu;
825 int online;
783 826
784 struct list_head cfs_tasks; 827 struct list_head cfs_tasks;
785 828
786 u64 rt_avg; 829 u64 rt_avg;
787 u64 age_stamp; 830 u64 age_stamp;
788 u64 idle_stamp; 831 u64 idle_stamp;
789 u64 avg_idle; 832 u64 avg_idle;
790 833
791 /* This is used to determine avg_idle's max value */ 834 /* This is used to determine avg_idle's max value */
792 u64 max_idle_balance_cost; 835 u64 max_idle_balance_cost;
793#endif 836#endif
794 837
795#ifdef CONFIG_IRQ_TIME_ACCOUNTING 838#ifdef CONFIG_IRQ_TIME_ACCOUNTING
796 u64 prev_irq_time; 839 u64 prev_irq_time;
797#endif 840#endif
798#ifdef CONFIG_PARAVIRT 841#ifdef CONFIG_PARAVIRT
799 u64 prev_steal_time; 842 u64 prev_steal_time;
800#endif 843#endif
801#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 844#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
802 u64 prev_steal_time_rq; 845 u64 prev_steal_time_rq;
803#endif 846#endif
804 847
805 /* calc_load related fields */ 848 /* calc_load related fields */
806 unsigned long calc_load_update; 849 unsigned long calc_load_update;
807 long calc_load_active; 850 long calc_load_active;
808 851
809#ifdef CONFIG_SCHED_HRTICK 852#ifdef CONFIG_SCHED_HRTICK
810#ifdef CONFIG_SMP 853#ifdef CONFIG_SMP
811 int hrtick_csd_pending; 854 int hrtick_csd_pending;
812 call_single_data_t hrtick_csd; 855 call_single_data_t hrtick_csd;
813#endif 856#endif
814 struct hrtimer hrtick_timer; 857 struct hrtimer hrtick_timer;
815#endif 858#endif
816 859
817#ifdef CONFIG_SCHEDSTATS 860#ifdef CONFIG_SCHEDSTATS
818 /* latency stats */ 861 /* latency stats */
819 struct sched_info rq_sched_info; 862 struct sched_info rq_sched_info;
820 unsigned long long rq_cpu_time; 863 unsigned long long rq_cpu_time;
821 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 864 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
822 865
823 /* sys_sched_yield() stats */ 866 /* sys_sched_yield() stats */
824 unsigned int yld_count; 867 unsigned int yld_count;
825 868
826 /* schedule() stats */ 869 /* schedule() stats */
827 unsigned int sched_count; 870 unsigned int sched_count;
828 unsigned int sched_goidle; 871 unsigned int sched_goidle;
829 872
830 /* try_to_wake_up() stats */ 873 /* try_to_wake_up() stats */
831 unsigned int ttwu_count; 874 unsigned int ttwu_count;
832 unsigned int ttwu_local; 875 unsigned int ttwu_local;
833#endif 876#endif
834 877
835#ifdef CONFIG_SMP 878#ifdef CONFIG_SMP
836 struct llist_head wake_list; 879 struct llist_head wake_list;
837#endif 880#endif
838 881
839#ifdef CONFIG_CPU_IDLE 882#ifdef CONFIG_CPU_IDLE
840 /* Must be inspected within a rcu lock section */ 883 /* Must be inspected within a rcu lock section */
841 struct cpuidle_state *idle_state; 884 struct cpuidle_state *idle_state;
842#endif 885#endif
843}; 886};
844 887
@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
904 * one position though, because the next rq_unpin_lock() will shift it 947 * one position though, because the next rq_unpin_lock() will shift it
905 * back. 948 * back.
906 */ 949 */
907#define RQCF_REQ_SKIP 0x01 950#define RQCF_REQ_SKIP 0x01
908#define RQCF_ACT_SKIP 0x02 951#define RQCF_ACT_SKIP 0x02
909#define RQCF_UPDATED 0x04 952#define RQCF_UPDATED 0x04
910 953
911static inline void assert_clock_updated(struct rq *rq) 954static inline void assert_clock_updated(struct rq *rq)
912{ 955{
@@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void);
1059 1102
1060/** 1103/**
1061 * highest_flag_domain - Return highest sched_domain containing flag. 1104 * highest_flag_domain - Return highest sched_domain containing flag.
1062 * @cpu: The cpu whose highest level of sched domain is to 1105 * @cpu: The CPU whose highest level of sched domain is to
1063 * be returned. 1106 * be returned.
1064 * @flag: The flag to check for the highest sched_domain 1107 * @flag: The flag to check for the highest sched_domain
1065 * for the given cpu. 1108 * for the given CPU.
1066 * 1109 *
1067 * Returns the highest sched_domain of a cpu which contains the given flag. 1110 * Returns the highest sched_domain of a CPU which contains the given flag.
1068 */ 1111 */
1069static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1112static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1070{ 1113{
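The kernel-doc above documents highest_flag_domain(); its body (outside this hunk) walks up the sched_domain hierarchy and keeps the widest level on which the flag still holds, stopping at the first level without it. A simplified sketch of that walk over a plain parent-linked list, ignoring the RCU and per-CPU details of the real code:

#include <stdio.h>

struct sd_example {
	int flags;
	struct sd_example *parent;	/* next wider level, NULL at the top */
};

static struct sd_example *highest_flag_domain_example(struct sd_example *sd,
						      int flag)
{
	struct sd_example *hd = NULL;

	for (; sd; sd = sd->parent) {
		if (!(sd->flags & flag))
			break;		/* flag must hold on every level below */
		hd = sd;
	}
	return hd;
}

int main(void)
{
	/* SMT and MC levels carry the flag, the NUMA level does not */
	struct sd_example numa = { .flags = 0x0, .parent = NULL  };
	struct sd_example mc   = { .flags = 0x2, .parent = &numa };
	struct sd_example smt  = { .flags = 0x2, .parent = &mc   };

	printf("widest level with flag: %s\n",
	       highest_flag_domain_example(&smt, 0x2) == &mc ? "MC" : "?");
	return 0;
}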
@@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
1099DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1142DECLARE_PER_CPU(struct sched_domain *, sd_asym);
1100 1143
1101struct sched_group_capacity { 1144struct sched_group_capacity {
1102 atomic_t ref; 1145 atomic_t ref;
1103 /* 1146 /*
1104 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1147 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1105 * for a single CPU. 1148 * for a single CPU.
1106 */ 1149 */
1107 unsigned long capacity; 1150 unsigned long capacity;
1108 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1151 unsigned long min_capacity; /* Min per-CPU capacity in group */
1109 unsigned long next_update; 1152 unsigned long next_update;
1110 int imbalance; /* XXX unrelated to capacity but shared group state */ 1153 int imbalance; /* XXX unrelated to capacity but shared group state */
1111 1154
1112#ifdef CONFIG_SCHED_DEBUG 1155#ifdef CONFIG_SCHED_DEBUG
1113 int id; 1156 int id;
1114#endif 1157#endif
1115 1158
1116 unsigned long cpumask[0]; /* balance mask */ 1159 unsigned long cpumask[0]; /* Balance mask */
1117}; 1160};
1118 1161
1119struct sched_group { 1162struct sched_group {
1120 struct sched_group *next; /* Must be a circular list */ 1163 struct sched_group *next; /* Must be a circular list */
1121 atomic_t ref; 1164 atomic_t ref;
1122 1165
1123 unsigned int group_weight; 1166 unsigned int group_weight;
1124 struct sched_group_capacity *sgc; 1167 struct sched_group_capacity *sgc;
1125 int asym_prefer_cpu; /* cpu of highest priority in group */ 1168 int asym_prefer_cpu; /* CPU of highest priority in group */
1126 1169
1127 /* 1170 /*
1128 * The CPUs this group covers. 1171 * The CPUs this group covers.
@@ -1131,7 +1174,7 @@ struct sched_group {
1131 * by attaching extra space to the end of the structure, 1174 * by attaching extra space to the end of the structure,
1132 * depending on how many CPUs the kernel has booted up with) 1175 * depending on how many CPUs the kernel has booted up with)
1133 */ 1176 */
1134 unsigned long cpumask[0]; 1177 unsigned long cpumask[0];
1135}; 1178};
1136 1179
1137static inline struct cpumask *sched_group_span(struct sched_group *sg) 1180static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1148} 1191}
1149 1192
1150/** 1193/**
1151 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 1194 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1152 * @group: The group whose first cpu is to be returned. 1195 * @group: The group whose first CPU is to be returned.
1153 */ 1196 */
1154static inline unsigned int group_first_cpu(struct sched_group *group) 1197static inline unsigned int group_first_cpu(struct sched_group *group)
1155{ 1198{
@@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1349 return p->on_rq == TASK_ON_RQ_MIGRATING; 1392 return p->on_rq == TASK_ON_RQ_MIGRATING;
1350} 1393}
1351 1394
1352#ifndef prepare_arch_switch
1353# define prepare_arch_switch(next) do { } while (0)
1354#endif
1355#ifndef finish_arch_post_lock_switch
1356# define finish_arch_post_lock_switch() do { } while (0)
1357#endif
1358
1359/* 1395/*
1360 * wake flags 1396 * wake flags
1361 */ 1397 */
1362#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1398#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1363#define WF_FORK 0x02 /* child wakeup after fork */ 1399#define WF_FORK 0x02 /* Child wakeup after fork */
1364#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 1400#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
1365 1401
1366/* 1402/*
1367 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1403 * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1372 * slice expiry etc. 1408 * slice expiry etc.
1373 */ 1409 */
1374 1410
1375#define WEIGHT_IDLEPRIO 3 1411#define WEIGHT_IDLEPRIO 3
1376#define WMULT_IDLEPRIO 1431655765 1412#define WMULT_IDLEPRIO 1431655765
1377 1413
1378extern const int sched_prio_to_weight[40]; 1414extern const int sched_prio_to_weight[40];
1379extern const u32 sched_prio_to_wmult[40]; 1415extern const u32 sched_prio_to_wmult[40];
1380 1416
1381/* 1417/*
1382 * {de,en}queue flags: 1418 * {de,en}queue flags:
@@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40];
1398 */ 1434 */
1399 1435
1400#define DEQUEUE_SLEEP 0x01 1436#define DEQUEUE_SLEEP 0x01
1401#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1437#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1402#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1438#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1403#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ 1439#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
1404 1440
1405#define ENQUEUE_WAKEUP 0x01 1441#define ENQUEUE_WAKEUP 0x01
1406#define ENQUEUE_RESTORE 0x02 1442#define ENQUEUE_RESTORE 0x02
@@ -1422,10 +1458,10 @@ struct sched_class {
1422 1458
1423 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1459 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1424 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1460 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1425 void (*yield_task) (struct rq *rq); 1461 void (*yield_task) (struct rq *rq);
1426 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1462 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1427 1463
1428 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1464 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1429 1465
1430 /* 1466 /*
1431 * It is the responsibility of the pick_next_task() method that will 1467 * It is the responsibility of the pick_next_task() method that will
@@ -1435,16 +1471,16 @@ struct sched_class {
1435 * May return RETRY_TASK when it finds a higher prio class has runnable 1471 * May return RETRY_TASK when it finds a higher prio class has runnable
1436 * tasks. 1472 * tasks.
1437 */ 1473 */
1438 struct task_struct * (*pick_next_task) (struct rq *rq, 1474 struct task_struct * (*pick_next_task)(struct rq *rq,
1439 struct task_struct *prev, 1475 struct task_struct *prev,
1440 struct rq_flags *rf); 1476 struct rq_flags *rf);
1441 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1477 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1442 1478
1443#ifdef CONFIG_SMP 1479#ifdef CONFIG_SMP
1444 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1480 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1445 void (*migrate_task_rq)(struct task_struct *p); 1481 void (*migrate_task_rq)(struct task_struct *p);
1446 1482
1447 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1483 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1448 1484
1449 void (*set_cpus_allowed)(struct task_struct *p, 1485 void (*set_cpus_allowed)(struct task_struct *p,
1450 const struct cpumask *newmask); 1486 const struct cpumask *newmask);
@@ -1453,31 +1489,31 @@ struct sched_class {
1453 void (*rq_offline)(struct rq *rq); 1489 void (*rq_offline)(struct rq *rq);
1454#endif 1490#endif
1455 1491
1456 void (*set_curr_task) (struct rq *rq); 1492 void (*set_curr_task)(struct rq *rq);
1457 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1493 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1458 void (*task_fork) (struct task_struct *p); 1494 void (*task_fork)(struct task_struct *p);
1459 void (*task_dead) (struct task_struct *p); 1495 void (*task_dead)(struct task_struct *p);
1460 1496
1461 /* 1497 /*
1462 * The switched_from() call is allowed to drop rq->lock, therefore we 1498 * The switched_from() call is allowed to drop rq->lock, therefore we
1463 * cannot assume the switched_from/switched_to pair is serialized by 1499 * cannot assume the switched_from/switched_to pair is serialized by
1464 * rq->lock. They are however serialized by p->pi_lock. 1500 * rq->lock. They are however serialized by p->pi_lock.
1465 */ 1501 */
1466 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1502 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1467 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1503 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1468 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1504 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1469 int oldprio); 1505 int oldprio);
1470 1506
1471 unsigned int (*get_rr_interval) (struct rq *rq, 1507 unsigned int (*get_rr_interval)(struct rq *rq,
1472 struct task_struct *task); 1508 struct task_struct *task);
1473 1509
1474 void (*update_curr) (struct rq *rq); 1510 void (*update_curr)(struct rq *rq);
1475 1511
1476#define TASK_SET_GROUP 0 1512#define TASK_SET_GROUP 0
1477#define TASK_MOVE_GROUP 1 1513#define TASK_MOVE_GROUP 1
1478 1514
1479#ifdef CONFIG_FAIR_GROUP_SCHED 1515#ifdef CONFIG_FAIR_GROUP_SCHED
1480 void (*task_change_group) (struct task_struct *p, int type); 1516 void (*task_change_group)(struct task_struct *p, int type);
1481#endif 1517#endif
1482}; 1518};
1483 1519
@@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq,
1526static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1562static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1527{ 1563{
1528 SCHED_WARN_ON(!rcu_read_lock_held()); 1564 SCHED_WARN_ON(!rcu_read_lock_held());
1565
1529 return rq->idle_state; 1566 return rq->idle_state;
1530} 1567}
1531#else 1568#else
@@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1564extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1601extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1565extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1602extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1566 1603
1567#define BW_SHIFT 20 1604#define BW_SHIFT 20
1568#define BW_UNIT (1 << BW_SHIFT) 1605#define BW_UNIT (1 << BW_SHIFT)
1569#define RATIO_SHIFT 8 1606#define RATIO_SHIFT 8
1570unsigned long to_ratio(u64 period, u64 runtime); 1607unsigned long to_ratio(u64 period, u64 runtime);
1571 1608
1572extern void init_entity_runnable_average(struct sched_entity *se); 1609extern void init_entity_runnable_average(struct sched_entity *se);
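BW_SHIFT/BW_UNIT define the Q20 fixed-point format used for deadline bandwidth, and to_ratio() converts a (period, runtime) pair into it. A plain C stand-in for the conversion (the kernel version additionally maps RUNTIME_INF to BW_UNIT and divides via div64_u64()):

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT	20
#define BW_UNIT		(1 << BW_SHIFT)

static uint64_t to_ratio_example(uint64_t period, uint64_t runtime)
{
	if (period == 0)
		return 0;
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* 30ms of runtime every 100ms is 0.3 of a CPU in Q20 fixed point */
	uint64_t bw = to_ratio_example(100000000ULL, 30000000ULL);

	printf("bw = %llu (%.3f CPUs)\n",
	       (unsigned long long)bw, (double)bw / BW_UNIT);
	return 0;
}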
@@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
1574 1611
1575#ifdef CONFIG_NO_HZ_FULL 1612#ifdef CONFIG_NO_HZ_FULL
1576extern bool sched_can_stop_tick(struct rq *rq); 1613extern bool sched_can_stop_tick(struct rq *rq);
1614extern int __init sched_tick_offload_init(void);
1577 1615
1578/* 1616/*
1579 * Tick may be needed by tasks in the runqueue depending on their policy and 1617 * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
1598 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 1636 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1599} 1637}
1600#else 1638#else
1639static inline int sched_tick_offload_init(void) { return 0; }
1601static inline void sched_update_tick_dependency(struct rq *rq) { } 1640static inline void sched_update_tick_dependency(struct rq *rq) { }
1602#endif 1641#endif
1603 1642
@@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
1624 sched_update_tick_dependency(rq); 1663 sched_update_tick_dependency(rq);
1625} 1664}
1626 1665
1627static inline void rq_last_tick_reset(struct rq *rq)
1628{
1629#ifdef CONFIG_NO_HZ_FULL
1630 rq->last_sched_tick = jiffies;
1631#endif
1632}
1633
1634extern void update_rq_clock(struct rq *rq); 1666extern void update_rq_clock(struct rq *rq);
1635 1667
1636extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1668extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1821/* 1853/*
1822 * Unfair double_lock_balance: Optimizes throughput at the expense of 1854 * Unfair double_lock_balance: Optimizes throughput at the expense of
1823 * latency by eliminating extra atomic operations when the locks are 1855 * latency by eliminating extra atomic operations when the locks are
1824 * already in proper order on entry. This favors lower cpu-ids and will 1856 * already in proper order on entry. This favors lower CPU-ids and will
1825 * grant the double lock to lower cpus over higher ids under contention, 1857 * grant the double lock to lower CPUs over higher ids under contention,
1826 * regardless of entry order into the function. 1858 * regardless of entry order into the function.
1827 */ 1859 */
1828static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1860static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
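The comment above describes the 'unfair' double lock: try the busiest runqueue's lock opportunistically and, only if that fails, fall back to a fixed lower-first ordering, which is what rules out an AB-BA deadlock between two CPUs balancing against each other. A sketch of the same pattern with pthread mutexes standing in for raw runqueue spinlocks (illustrative, not the kernel implementation):

#include <pthread.h>

struct rq_example {
	int cpu;
	pthread_mutex_t lock;
};

/* Caller holds this_rq->lock. Returns 1 if it had to drop and re-take it,
 * in which case the caller must revalidate whatever it computed before. */
int double_lock_balance_example(struct rq_example *this_rq,
				struct rq_example *busiest)
{
	if (pthread_mutex_trylock(&busiest->lock) == 0)
		return 0;			/* uncontended fast path */

	if (busiest < this_rq) {
		/* We hold the "higher" lock: drop it and re-acquire both in
		 * lower-first order, favouring the lower runqueue. */
		pthread_mutex_unlock(&this_rq->lock);
		pthread_mutex_lock(&busiest->lock);
		pthread_mutex_lock(&this_rq->lock);
		return 1;
	}

	/* We already hold the lower lock, so blocking on the higher one
	 * keeps the global order and cannot deadlock. */
	pthread_mutex_lock(&busiest->lock);
	return 0;
}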
@@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1854static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1886static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1855{ 1887{
1856 if (unlikely(!irqs_disabled())) { 1888 if (unlikely(!irqs_disabled())) {
1857 /* printk() doesn't work good under rq->lock */ 1889 /* printk() doesn't work well under rq->lock */
1858 raw_spin_unlock(&this_rq->lock); 1890 raw_spin_unlock(&this_rq->lock);
1859 BUG_ON(1); 1891 BUG_ON(1);
1860 } 1892 }
@@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void);
2005extern void cfs_bandwidth_usage_dec(void); 2037extern void cfs_bandwidth_usage_dec(void);
2006 2038
2007#ifdef CONFIG_NO_HZ_COMMON 2039#ifdef CONFIG_NO_HZ_COMMON
2008enum rq_nohz_flag_bits { 2040#define NOHZ_BALANCE_KICK_BIT 0
2009 NOHZ_TICK_STOPPED, 2041#define NOHZ_STATS_KICK_BIT 1
2010 NOHZ_BALANCE_KICK, 2042
2011}; 2043#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT)
2044#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT)
2045
2046#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
2012 2047
2013#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 2048#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
2014 2049
2015extern void nohz_balance_exit_idle(unsigned int cpu); 2050extern void nohz_balance_exit_idle(struct rq *rq);
2016#else 2051#else
2017static inline void nohz_balance_exit_idle(unsigned int cpu) { } 2052static inline void nohz_balance_exit_idle(struct rq *rq) { }
2018#endif 2053#endif
2019 2054
2020 2055
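The nohz flags move from enum bit numbers on an unsigned long to explicit BIT() masks in an atomic_t, so a kicking CPU can OR in one or more reasons (load balance and/or blocked-load stats) and the kicked CPU can fetch and clear them in a single step. A userspace sketch of that handshake using C11 atomics (illustrative only, names are stand-ins):

#include <stdatomic.h>
#include <stdio.h>

#define NOHZ_BALANCE_KICK	(1u << 0)
#define NOHZ_STATS_KICK		(1u << 1)
#define NOHZ_KICK_MASK		(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

static atomic_uint nohz_flags_example;

static void kick_example(unsigned int reasons)
{
	atomic_fetch_or(&nohz_flags_example, reasons);
	/* the real code follows this with an IPI to the idle CPU */
}

static unsigned int idle_entry_example(void)
{
	/* collect every pending reason and clear them atomically */
	return atomic_fetch_and(&nohz_flags_example, ~NOHZ_KICK_MASK) &
	       NOHZ_KICK_MASK;
}

int main(void)
{
	kick_example(NOHZ_STATS_KICK);
	kick_example(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK);
	printf("pending kicks: %#x\n", idle_entry_example());
	return 0;
}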
@@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2113#endif /* CONFIG_CPU_FREQ */ 2148#endif /* CONFIG_CPU_FREQ */
2114 2149
2115#ifdef arch_scale_freq_capacity 2150#ifdef arch_scale_freq_capacity
2116#ifndef arch_scale_freq_invariant 2151# ifndef arch_scale_freq_invariant
2117#define arch_scale_freq_invariant() (true) 2152# define arch_scale_freq_invariant() true
2118#endif 2153# endif
2119#else /* arch_scale_freq_capacity */ 2154#else
2120#define arch_scale_freq_invariant() (false) 2155# define arch_scale_freq_invariant() false
2121#endif 2156#endif
2122 2157
2123#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2158#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2124
2125static inline unsigned long cpu_util_dl(struct rq *rq) 2159static inline unsigned long cpu_util_dl(struct rq *rq)
2126{ 2160{
2127 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2161 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
2129 2163
2130static inline unsigned long cpu_util_cfs(struct rq *rq) 2164static inline unsigned long cpu_util_cfs(struct rq *rq)
2131{ 2165{
2132 return rq->cfs.avg.util_avg; 2166 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
2133} 2167
2168 if (sched_feat(UTIL_EST)) {
2169 util = max_t(unsigned long, util,
2170 READ_ONCE(rq->cfs.avg.util_est.enqueued));
2171 }
2134 2172
2173 return util;
2174}
2135#endif 2175#endif
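cpu_util_cfs() now clamps the decayed PELT average with the estimated utilization of the enqueued tasks, which is what lets schedutil pick a sensible OPP right after a periodic task wakes up instead of trusting a freshly decayed average. A small standalone sketch of the selection rule:

#include <stdio.h>

static unsigned long cpu_util_cfs_example(unsigned long util_avg,
					  unsigned long util_est_enqueued,
					  int util_est_enabled)
{
	unsigned long util = util_avg;

	if (util_est_enabled && util_est_enqueued > util)
		util = util_est_enqueued;
	return util;
}

int main(void)
{
	/* a periodic task just woke: PELT decayed to 120 while idle, but
	 * its estimated utilization is still 400, so 400 wins */
	printf("util = %lu\n", cpu_util_cfs_example(120, 400, 1));
	return 0;
}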
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..ab112cbfd7c8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2 2/*
3#include <linux/slab.h> 3 * /proc/schedstat implementation
4#include <linux/fs.h> 4 */
5#include <linux/seq_file.h>
6#include <linux/proc_fs.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10/* 7/*
11 * bump this up when changing the output format or the meaning of an existing 8 * Current schedstat API version.
9 *
10 * Bump this up when changing the output format or the meaning of an existing
12 * format, so that tools can adapt (or abort) 11 * format, so that tools can adapt (or abort)
13 */ 12 */
14#define SCHEDSTAT_VERSION 15 13#define SCHEDSTAT_VERSION 15
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
78 * This iterator needs some explanation. 77 * This iterator needs some explanation.
79 * It returns 1 for the header position. 78 * It returns 1 for the header position.
80 * This means 2 is cpu 0. 79 * This means 2 is cpu 0.
81 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 80 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
82 * to use cpumask_* to iterate over the cpus. 81 * to use cpumask_* to iterate over the CPUs.
83 */ 82 */
84static void *schedstat_start(struct seq_file *file, loff_t *offset) 83static void *schedstat_start(struct seq_file *file, loff_t *offset)
85{ 84{
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
99 98
100 if (n < nr_cpu_ids) 99 if (n < nr_cpu_ids)
101 return (void *)(unsigned long)(n + 2); 100 return (void *)(unsigned long)(n + 2);
101
102 return NULL; 102 return NULL;
103} 103}
104 104
105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) 105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
106{ 106{
107 (*offset)++; 107 (*offset)++;
108
108 return schedstat_start(file, offset); 109 return schedstat_start(file, offset);
109} 110}
110 111
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
134static int __init proc_schedstat_init(void) 135static int __init proc_schedstat_init(void)
135{ 136{
136 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 137 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
138
137 return 0; 139 return 0;
138} 140}
139subsys_initcall(proc_schedstat_init); 141subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
30 if (rq) 30 if (rq)
31 rq->rq_sched_info.run_delay += delta; 31 rq->rq_sched_info.run_delay += delta;
32} 32}
33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
34#define __schedstat_inc(var) do { var++; } while (0) 34#define __schedstat_inc(var) do { var++; } while (0)
35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
36#define __schedstat_add(var, amt) do { var += (amt); } while (0) 36#define __schedstat_add(var, amt) do { var += (amt); } while (0)
37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) 37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
38#define __schedstat_set(var, val) do { var = (val); } while (0) 38#define __schedstat_set(var, val) do { var = (val); } while (0)
39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
40#define schedstat_val(var) (var) 40#define schedstat_val(var) (var)
41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) 41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
42 42
43#else /* !CONFIG_SCHEDSTATS */ 43#else /* !CONFIG_SCHEDSTATS: */
44static inline void 44static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
45rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 45static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
46{} 46static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
47static inline void 47# define schedstat_enabled() 0
48rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) 48# define __schedstat_inc(var) do { } while (0)
49{} 49# define schedstat_inc(var) do { } while (0)
50static inline void 50# define __schedstat_add(var, amt) do { } while (0)
51rq_sched_info_depart(struct rq *rq, unsigned long long delta) 51# define schedstat_add(var, amt) do { } while (0)
52{} 52# define __schedstat_set(var, val) do { } while (0)
53#define schedstat_enabled() 0 53# define schedstat_set(var, val) do { } while (0)
54#define __schedstat_inc(var) do { } while (0) 54# define schedstat_val(var) 0
55#define schedstat_inc(var) do { } while (0) 55# define schedstat_val_or_zero(var) 0
56#define __schedstat_add(var, amt) do { } while (0)
57#define schedstat_add(var, amt) do { } while (0)
58#define __schedstat_set(var, val) do { } while (0)
59#define schedstat_set(var, val) do { } while (0)
60#define schedstat_val(var) 0
61#define schedstat_val_or_zero(var) 0
62#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
63 57
64#ifdef CONFIG_SCHED_INFO 58#ifdef CONFIG_SCHED_INFO
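The schedstat macros in this hunk all follow one pattern: the checked variants test a global switch (a static key in the kernel) before touching a counter, while the __ variants assume the caller already hoisted that check. A userspace sketch of the idea, with an _ex suffix marking every name as an illustrative stand-in:

#include <stdbool.h>
#include <stdio.h>

static bool sched_schedstats_ex;	/* stand-in for the static key */

#define schedstat_enabled_ex()		(sched_schedstats_ex)
#define __schedstat_inc_ex(var)		do { (var)++; } while (0)
#define schedstat_inc_ex(var)		\
	do { if (schedstat_enabled_ex()) (var)++; } while (0)

int main(void)
{
	unsigned int yld_count = 0, sched_count = 0;

	schedstat_inc_ex(yld_count);		/* ignored: stats disabled */

	sched_schedstats_ex = true;
	schedstat_inc_ex(yld_count);		/* counted */
	if (schedstat_enabled_ex()) {
		/* check hoisted once, cheap __ variant is enough inside */
		__schedstat_inc_ex(sched_count);
	}
	printf("yld=%u sched=%u\n", yld_count, sched_count);
	return 0;
}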
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
69 63
70/* 64/*
71 * We are interested in knowing how long it was from the *first* time a 65 * We are interested in knowing how long it was from the *first* time a
72 * task was queued to the time that it finally hit a cpu, we call this routine 66 * task was queued to the time that it finally hit a CPU, we call this routine
73 * from dequeue_task() to account for possible rq->clock skew across cpus. The 67 * from dequeue_task() to account for possible rq->clock skew across CPUs. The
74 * delta taken on each cpu would annul the skew. 68 * delta taken on each CPU would annul the skew.
75 */ 69 */
76static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) 70static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
77{ 71{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
87} 81}
88 82
89/* 83/*
90 * Called when a task finally hits the cpu. We can now calculate how 84 * Called when a task finally hits the CPU. We can now calculate how
91 * long it was waiting to run. We also note when it began so that we 85 * long it was waiting to run. We also note when it began so that we
92 * can keep stats on how long its timeslice is. 86 * can keep stats on how long its timeslice is.
93 */ 87 */
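The helpers in this file track queue-to-run latency: sched_info_queued() stamps last_queued the first time a task is enqueued, and sched_info_arrive() converts the delta into run_delay once the task actually gets a CPU, with per-runqueue clocks so cross-CPU skew cancels out. A compact sketch of that bookkeeping, using a single plain nanosecond counter in place of rq_clock():

#include <stdint.h>
#include <stdio.h>

struct sched_info_example {
	uint64_t last_queued;	/* 0 means not currently queued */
	uint64_t run_delay;	/* total time spent runnable but waiting */
	uint64_t pcount;	/* number of times scheduled in */
};

static void info_queued_example(struct sched_info_example *si, uint64_t now)
{
	if (!si->last_queued)		/* keep the *first* queue stamp */
		si->last_queued = now;
}

static void info_arrive_example(struct sched_info_example *si, uint64_t now)
{
	if (si->last_queued) {
		si->run_delay += now - si->last_queued;
		si->last_queued = 0;
	}
	si->pcount++;
}

int main(void)
{
	struct sched_info_example si = { 0 };

	info_queued_example(&si, 1000);
	info_queued_example(&si, 1500);	/* re-queue keeps the older stamp */
	info_arrive_example(&si, 4000);
	printf("waited %llu ns over %llu arrival(s)\n",
	       (unsigned long long)si.run_delay,
	       (unsigned long long)si.pcount);
	return 0;
}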
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
112 */ 106 */
113static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 107static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
114{ 108{
115 if (unlikely(sched_info_on())) 109 if (unlikely(sched_info_on())) {
116 if (!t->sched_info.last_queued) 110 if (!t->sched_info.last_queued)
117 t->sched_info.last_queued = rq_clock(rq); 111 t->sched_info.last_queued = rq_clock(rq);
112 }
118} 113}
119 114
120/* 115/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
127 */ 122 */
128static inline void sched_info_depart(struct rq *rq, struct task_struct *t) 123static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
129{ 124{
130 unsigned long long delta = rq_clock(rq) - 125 unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
131 t->sched_info.last_arrival;
132 126
133 rq_sched_info_depart(rq, delta); 127 rq_sched_info_depart(rq, delta);
134 128
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
142 * the idle task.) We are only called when prev != next. 136 * the idle task.) We are only called when prev != next.
143 */ 137 */
144static inline void 138static inline void
145__sched_info_switch(struct rq *rq, 139__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
146 struct task_struct *prev, struct task_struct *next)
147{ 140{
148 /* 141 /*
149 * prev now departs the cpu. It's not interesting to record 142 * prev now departs the CPU. It's not interesting to record
150 * stats about how efficient we were at scheduling the idle 143 * stats about how efficient we were at scheduling the idle
151 * process, however. 144 * process, however.
152 */ 145 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
156 if (next != rq->idle) 149 if (next != rq->idle)
157 sched_info_arrive(rq, next); 150 sched_info_arrive(rq, next);
158} 151}
152
159static inline void 153static inline void
160sched_info_switch(struct rq *rq, 154sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
161 struct task_struct *prev, struct task_struct *next)
162{ 155{
163 if (unlikely(sched_info_on())) 156 if (unlikely(sched_info_on()))
164 __sched_info_switch(rq, prev, next); 157 __sched_info_switch(rq, prev, next);
165} 158}
166#else 159
167#define sched_info_queued(rq, t) do { } while (0) 160#else /* !CONFIG_SCHED_INFO: */
168#define sched_info_reset_dequeued(t) do { } while (0) 161# define sched_info_queued(rq, t) do { } while (0)
169#define sched_info_dequeued(rq, t) do { } while (0) 162# define sched_info_reset_dequeued(t) do { } while (0)
170#define sched_info_depart(rq, t) do { } while (0) 163# define sched_info_dequeued(rq, t) do { } while (0)
171#define sched_info_arrive(rq, next) do { } while (0) 164# define sched_info_depart(rq, t) do { } while (0)
172#define sched_info_switch(rq, t, next) do { } while (0) 165# define sched_info_arrive(rq, next) do { } while (0)
166# define sched_info_switch(rq, t, next) do { } while (0)
173#endif /* CONFIG_SCHED_INFO */ 167#endif /* CONFIG_SCHED_INFO */
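
The stats.h helpers above implement run-delay accounting: a task is stamped with the rq clock when it is queued, and the delta to the moment it finally gets the CPU is accumulated when it arrives. A minimal userspace sketch of that bookkeeping, with purely illustrative names and clock_gettime() standing in for rq_clock(), might look like this:

/*
 * Simplified, userspace-only analogue of the sched_info delay accounting
 * shown above: stamp a task when it is queued, account the wait on arrival.
 * All names here are illustrative; this is not the kernel implementation.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

struct fake_sched_info {
	uint64_t last_queued;	/* ns timestamp when the "task" was queued */
	uint64_t run_delay;	/* total ns spent waiting on the "runqueue" */
	uint64_t pcount;	/* number of times it got onto the "CPU" */
};

static uint64_t fake_rq_clock(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void fake_info_queued(struct fake_sched_info *si)
{
	if (!si->last_queued)		/* only stamp the first enqueue */
		si->last_queued = fake_rq_clock();
}

static void fake_info_arrive(struct fake_sched_info *si)
{
	uint64_t now = fake_rq_clock();

	if (si->last_queued) {
		si->run_delay += now - si->last_queued;
		si->last_queued = 0;
	}
	si->pcount++;
}

int main(void)
{
	struct fake_sched_info si = { 0 };

	fake_info_queued(&si);
	usleep(2000);			/* pretend the task waited ~2ms */
	fake_info_arrive(&si);
	printf("waited %llu ns over %llu arrivals\n",
	       (unsigned long long)si.run_delay,
	       (unsigned long long)si.pcount);
	return 0;
}

The guard on last_queued mirrors the kernel's check in sched_info_queued(): only the first enqueue since the last arrival is stamped, so a requeue does not reset the wait measurement.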
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/* 2/*
5 * stop-task scheduling class. 3 * stop-task scheduling class.
6 * 4 *
@@ -9,6 +7,7 @@
9 * 7 *
10 * See kernel/stop_machine.c 8 * See kernel/stop_machine.c
11 */ 9 */
10#include "sched.h"
12 11
13#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
14static int 13static int
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
75 cgroup_account_cputime(curr, delta_exec); 74 cgroup_account_cputime(curr, delta_exec);
76} 75}
77 76
77/*
78 * scheduler tick hitting a task of our scheduling class.
79 *
80 * NOTE: This function can be called remotely by the tick offload that
81 * goes along full dynticks. Therefore no local assumption can be made
82 * and everything must be accessed through the @rq and @curr passed in
83 * parameters.
84 */
78static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 85static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
79{ 86{
80} 87}
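
The comment added to task_tick_stop() spells out the contract for remotely offloaded ticks: the handler may run on a housekeeping CPU on behalf of another CPU, so it must derive all state from the @rq and @curr arguments rather than from 'current' or this-CPU accessors. A standalone sketch of that shape, using hypothetical types that are not the kernel's, is:

/*
 * Standalone illustration (hypothetical types, not kernel code) of the
 * remote-tick rule stated above: a tick handler that may be run on behalf
 * of another CPU must derive everything from the rq/curr arguments.
 */
#include <stdio.h>

struct task { const char *comm; unsigned long ticks; };
struct runqueue { int cpu; struct task *curr; };

/* Good: only touches what it was handed, so it works when called remotely. */
static void tick_handler(struct runqueue *rq, struct task *curr)
{
	curr->ticks++;
	printf("tick for %s on CPU%d\n", curr->comm, rq->cpu);
}

int main(void)
{
	struct task t = { .comm = "worker", .ticks = 0 };
	struct runqueue remote_rq = { .cpu = 3, .curr = &t };

	/* Housekeeping CPU 0 runs the tick on behalf of CPU 3. */
	tick_handler(&remote_rq, remote_rq.curr);
	return 0;
}

The stop class's handler happens to do nothing, but the same rule is documented across the task_tick implementations touched by this series once the 1Hz tick can be offloaded.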
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,6 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/sched/signal.h> 2/*
3#include <linux/swait.h> 3 * <linux/swait.h> (simple wait queues ) implementation:
4 */
5#include "sched.h"
4 6
5void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 7void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
6 struct lock_class_key *key) 8 struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..64cc564f5255 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
2/* 2/*
3 * Scheduler topology setup/handling methods 3 * Scheduler topology setup/handling methods
4 */ 4 */
5#include <linux/sched.h>
6#include <linux/mutex.h>
7#include <linux/sched/isolation.h>
8
9#include "sched.h" 5#include "sched.h"
10 6
11DEFINE_MUTEX(sched_domains_mutex); 7DEFINE_MUTEX(sched_domains_mutex);
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
41 if (!(sd->flags & SD_LOAD_BALANCE)) { 37 if (!(sd->flags & SD_LOAD_BALANCE)) {
42 printk("does not load-balance\n"); 38 printk("does not load-balance\n");
43 if (sd->parent) 39 if (sd->parent)
44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
45 " has parent");
46 return -1; 41 return -1;
47 } 42 }
48 43
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
50 cpumask_pr_args(sched_domain_span(sd)), sd->name); 45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
51 46
52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
53 printk(KERN_ERR "ERROR: domain->span does not contain " 48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
54 "CPU%d\n", cpu);
55 } 49 }
56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) { 50 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
57 printk(KERN_ERR "ERROR: domain->groups does not contain" 51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58 " CPU%d\n", cpu);
59 } 52 }
60 53
61 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
115 108
116 if (sd->parent && 109 if (sd->parent &&
117 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
118 printk(KERN_ERR "ERROR: parent span is not a superset " 111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
119 "of domain->span\n");
120 return 0; 112 return 0;
121} 113}
122 114
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
595 * are not. 587 * are not.
596 * 588 *
597 * This leads to a few particularly weird cases where the sched_domain's are 589 * This leads to a few particularly weird cases where the sched_domain's are
598 * not of the same number for each cpu. Consider: 590 * not of the same number for each CPU. Consider:
599 * 591 *
600 * NUMA-2 0-3 0-3 592 * NUMA-2 0-3 0-3
601 * groups: {0-2},{1-3} {1-3},{0-2} 593 * groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +772,7 @@ fail:
780 * ^ ^ ^ ^ 772 * ^ ^ ^ ^
781 * `-' `-' 773 * `-' `-'
782 * 774 *
783 * The sched_domains are per-cpu and have a two way link (parent & child) and 775 * The sched_domains are per-CPU and have a two way link (parent & child) and
784 * denote the ever growing mask of CPUs belonging to that level of topology. 776 * denote the ever growing mask of CPUs belonging to that level of topology.
785 * 777 *
786 * Each sched_domain has a circular (double) linked list of sched_group's, each 778 * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1021 d->rd = alloc_rootdomain(); 1013 d->rd = alloc_rootdomain();
1022 if (!d->rd) 1014 if (!d->rd)
1023 return sa_sd; 1015 return sa_sd;
1016
1024 return sa_rootdomain; 1017 return sa_rootdomain;
1025} 1018}
1026 1019
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
1047} 1040}
1048 1041
1049#ifdef CONFIG_NUMA 1042#ifdef CONFIG_NUMA
1050static int sched_domains_numa_levels;
1051enum numa_topology_type sched_numa_topology_type; 1043enum numa_topology_type sched_numa_topology_type;
1052static int *sched_domains_numa_distance; 1044
1053int sched_max_numa_distance; 1045static int sched_domains_numa_levels;
1054static struct cpumask ***sched_domains_numa_masks; 1046static int sched_domains_curr_level;
1055static int sched_domains_curr_level; 1047
1048int sched_max_numa_distance;
1049static int *sched_domains_numa_distance;
1050static struct cpumask ***sched_domains_numa_masks;
1056#endif 1051#endif
1057 1052
1058/* 1053/*
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
1074 * SD_ASYM_PACKING - describes SMT quirks 1069 * SD_ASYM_PACKING - describes SMT quirks
1075 */ 1070 */
1076#define TOPOLOGY_SD_FLAGS \ 1071#define TOPOLOGY_SD_FLAGS \
1077 (SD_SHARE_CPUCAPACITY | \ 1072 (SD_SHARE_CPUCAPACITY | \
1078 SD_SHARE_PKG_RESOURCES | \ 1073 SD_SHARE_PKG_RESOURCES | \
1079 SD_NUMA | \ 1074 SD_NUMA | \
1080 SD_ASYM_PACKING | \ 1075 SD_ASYM_PACKING | \
1081 SD_ASYM_CPUCAPACITY | \ 1076 SD_ASYM_CPUCAPACITY | \
1082 SD_SHARE_POWERDOMAIN) 1077 SD_SHARE_POWERDOMAIN)
1083 1078
1084static struct sched_domain * 1079static struct sched_domain *
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
1628 pr_err(" the %s domain not a subset of the %s domain\n", 1623 pr_err(" the %s domain not a subset of the %s domain\n",
1629 child->name, sd->name); 1624 child->name, sd->name);
1630#endif 1625#endif
1631 /* Fixup, ensure @sd has at least @child cpus. */ 1626 /* Fixup, ensure @sd has at least @child CPUs. */
1632 cpumask_or(sched_domain_span(sd), 1627 cpumask_or(sched_domain_span(sd),
1633 sched_domain_span(sd), 1628 sched_domain_span(sd),
1634 sched_domain_span(child)); 1629 sched_domain_span(child));
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1720 ret = 0; 1715 ret = 0;
1721error: 1716error:
1722 __free_domain_allocs(&d, alloc_state, cpu_map); 1717 __free_domain_allocs(&d, alloc_state, cpu_map);
1718
1723 return ret; 1719 return ret;
1724} 1720}
1725 1721
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1824 return 1; 1820 return 1;
1825 1821
1826 tmp = SD_ATTR_INIT; 1822 tmp = SD_ATTR_INIT;
1823
1827 return !memcmp(cur ? (cur + idx_cur) : &tmp, 1824 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1828 new ? (new + idx_new) : &tmp, 1825 new ? (new + idx_new) : &tmp,
1829 sizeof(struct sched_domain_attr)); 1826 sizeof(struct sched_domain_attr));
@@ -1929,4 +1926,3 @@ match2:
1929 1926
1930 mutex_unlock(&sched_domains_mutex); 1927 mutex_unlock(&sched_domains_mutex);
1931} 1928}
1932
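
sched_domain_debug_one() above validates the domain hierarchy: every domain's span must contain the CPU being inspected, and a parent domain's span must be a superset of its child's. A self-contained sketch of those two invariants using plain bitmasks (not the kernel cpumask API; the spans below are made up for illustration):

/*
 * Minimal standalone illustration (plain bitmasks, not the kernel cpumask
 * API) of the hierarchy invariants checked above: each sched_domain's span
 * must contain the CPU, and a parent's span must be a superset of its
 * child's span.
 */
#include <stdbool.h>
#include <stdio.h>

static bool span_has_cpu(unsigned long span, int cpu)
{
	return span & (1UL << cpu);
}

static bool span_subset(unsigned long child, unsigned long parent)
{
	return (child & ~parent) == 0;
}

int main(void)
{
	unsigned long smt_span  = 0x3;	/* CPUs 0-1, e.g. one core's threads */
	unsigned long numa_span = 0xf;	/* CPUs 0-3, the whole node */
	int cpu = 1;

	if (!span_has_cpu(smt_span, cpu))
		printf("ERROR: domain->span does not contain CPU%d\n", cpu);
	if (!span_subset(smt_span, numa_span))
		printf("ERROR: parent span is not a superset of domain->span\n");
	else
		printf("CPU%d: SMT span 0x%lx nests inside NUMA span 0x%lx\n",
		       cpu, smt_span, numa_span);
	return 0;
}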
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include "sched.h"
7#include <linux/export.h>
8#include <linux/sched/signal.h>
9#include <linux/sched/debug.h>
10#include <linux/mm.h>
11#include <linux/wait.h>
12#include <linux/hash.h>
13#include <linux/kthread.h>
14 7
15void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) 8void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
16{ 9{
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
107 break; 100 break;
108 } 101 }
109 } 102 }
103
110 return nr_exclusive; 104 return nr_exclusive;
111} 105}
112 106
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
317 spin_unlock(&wq->lock); 311 spin_unlock(&wq->lock);
318 schedule(); 312 schedule();
319 spin_lock(&wq->lock); 313 spin_lock(&wq->lock);
314
320 return 0; 315 return 0;
321} 316}
322EXPORT_SYMBOL(do_wait_intr); 317EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
333 spin_unlock_irq(&wq->lock); 328 spin_unlock_irq(&wq->lock);
334 schedule(); 329 schedule();
335 spin_lock_irq(&wq->lock); 330 spin_lock_irq(&wq->lock);
331
336 return 0; 332 return 0;
337} 333}
338EXPORT_SYMBOL(do_wait_intr_irq); 334EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
378 374
379 if (ret) 375 if (ret)
380 list_del_init(&wq_entry->entry); 376 list_del_init(&wq_entry->entry);
377
381 return ret; 378 return ret;
382} 379}
383EXPORT_SYMBOL(autoremove_wake_function); 380EXPORT_SYMBOL(autoremove_wake_function);
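
Beyond the include consolidation, the wait.c hunks touch autoremove_wake_function(), whose pattern is: attempt the wake, and only on success unlink the wait entry so a later finish_wait() finds nothing left to clean up. A standalone sketch of that pattern, with hypothetical list and waiter types:

/*
 * Standalone sketch (hypothetical types, not kernel code) of the shape of
 * autoremove_wake_function() above: try to wake the waiter, and only if
 * that succeeded unlink the entry from the wait list.
 */
#include <stdbool.h>
#include <stdio.h>

struct wait_entry {
	struct wait_entry *prev, *next;
	const char *name;
	bool woken;
};

static void list_del_init_sketch(struct wait_entry *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	e->prev = e->next = e;
}

static bool try_wake(struct wait_entry *e)
{
	if (e->woken)
		return false;		/* already running, nothing to do */
	e->woken = true;
	return true;
}

/* Mirrors autoremove_wake_function(): wake first, then self-unlink. */
static bool autoremove_wake_sketch(struct wait_entry *e)
{
	bool ret = try_wake(e);

	if (ret)
		list_del_init_sketch(e);
	return ret;
}

int main(void)
{
	struct wait_entry head = { &head, &head, "head", true };
	struct wait_entry w = { &head, &head, "waiter", false };

	head.next = head.prev = &w;	/* queue the single waiter */
	printf("woken: %d, queue empty: %d\n",
	       autoremove_wake_sketch(&w), head.next == &head);
	return 0;
}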
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..4239c78f5cd3 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
1/* 1/*
2 * The implementation of the wait_bit*() and related waiting APIs: 2 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 3 */
4#include <linux/wait_bit.h> 4#include "sched.h"
5#include <linux/sched/signal.h>
6#include <linux/sched/debug.h>
7#include <linux/hash.h>
8 5
9#define WAIT_TABLE_BITS 8 6#define WAIT_TABLE_BITS 8
10#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) 7#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
29 wait_bit->key.bit_nr != key->bit_nr || 26 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags)) 27 test_bit(key->bit_nr, key->flags))
31 return 0; 28 return 0;
32 else 29
33 return autoremove_wake_function(wq_entry, mode, sync, key); 30 return autoremove_wake_function(wq_entry, mode, sync, key);
34} 31}
35EXPORT_SYMBOL(wake_bit_function); 32EXPORT_SYMBOL(wake_bit_function);
36 33
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) 47 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51 ret = (*action)(&wbq_entry->key, mode); 48 ret = (*action)(&wbq_entry->key, mode);
52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); 49 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
50
53 finish_wait(wq_head, &wbq_entry->wq_entry); 51 finish_wait(wq_head, &wbq_entry->wq_entry);
52
54 return ret; 53 return ret;
55} 54}
56EXPORT_SYMBOL(__wait_on_bit); 55EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
73 DEFINE_WAIT_BIT(wq_entry, word, bit); 72 DEFINE_WAIT_BIT(wq_entry, word, bit);
74 73
75 wq_entry.key.timeout = jiffies + timeout; 74 wq_entry.key.timeout = jiffies + timeout;
75
76 return __wait_on_bit(wq_head, &wq_entry, action, mode); 76 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77} 77}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); 78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{ 121{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
123
123 if (waitqueue_active(wq_head)) 124 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key); 125 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125} 126}
@@ -157,6 +158,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{ 158{
158 if (BITS_PER_LONG == 64) { 159 if (BITS_PER_LONG == 64) {
159 unsigned long q = (unsigned long)p; 160 unsigned long q = (unsigned long)p;
161
160 return bit_waitqueue((void *)(q & ~1), q & 1); 162 return bit_waitqueue((void *)(q & ~1), q & 1);
161 } 163 }
162 return bit_waitqueue(p, 0); 164 return bit_waitqueue(p, 0);
@@ -173,6 +175,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
173 wait_bit->key.bit_nr != key->bit_nr || 175 wait_bit->key.bit_nr != key->bit_nr ||
174 atomic_read(val) != 0) 176 atomic_read(val) != 0)
175 return 0; 177 return 0;
178
176 return autoremove_wake_function(wq_entry, mode, sync, key); 179 return autoremove_wake_function(wq_entry, mode, sync, key);
177} 180}
178 181
@@ -196,6 +199,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
196 ret = (*action)(val, mode); 199 ret = (*action)(val, mode);
197 } while (!ret && atomic_read(val) != 0); 200 } while (!ret && atomic_read(val) != 0);
198 finish_wait(wq_head, &wbq_entry->wq_entry); 201 finish_wait(wq_head, &wbq_entry->wq_entry);
202
199 return ret; 203 return ret;
200} 204}
201 205
@@ -226,6 +230,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
226 schedule(); 230 schedule();
227 if (signal_pending_state(mode, current)) 231 if (signal_pending_state(mode, current))
228 return -EINTR; 232 return -EINTR;
233
229 return 0; 234 return 0;
230} 235}
231EXPORT_SYMBOL(atomic_t_wait); 236EXPORT_SYMBOL(atomic_t_wait);
@@ -250,6 +255,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
250 schedule(); 255 schedule();
251 if (signal_pending_state(mode, current)) 256 if (signal_pending_state(mode, current))
252 return -EINTR; 257 return -EINTR;
258
253 return 0; 259 return 0;
254} 260}
255EXPORT_SYMBOL(bit_wait); 261EXPORT_SYMBOL(bit_wait);
@@ -259,6 +265,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
259 io_schedule(); 265 io_schedule();
260 if (signal_pending_state(mode, current)) 266 if (signal_pending_state(mode, current))
261 return -EINTR; 267 return -EINTR;
268
262 return 0; 269 return 0;
263} 270}
264EXPORT_SYMBOL(bit_wait_io); 271EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +273,13 @@ EXPORT_SYMBOL(bit_wait_io);
266__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) 273__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
267{ 274{
268 unsigned long now = READ_ONCE(jiffies); 275 unsigned long now = READ_ONCE(jiffies);
276
269 if (time_after_eq(now, word->timeout)) 277 if (time_after_eq(now, word->timeout))
270 return -EAGAIN; 278 return -EAGAIN;
271 schedule_timeout(word->timeout - now); 279 schedule_timeout(word->timeout - now);
272 if (signal_pending_state(mode, current)) 280 if (signal_pending_state(mode, current))
273 return -EINTR; 281 return -EINTR;
282
274 return 0; 283 return 0;
275} 284}
276EXPORT_SYMBOL_GPL(bit_wait_timeout); 285EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +287,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
278__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 287__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
279{ 288{
280 unsigned long now = READ_ONCE(jiffies); 289 unsigned long now = READ_ONCE(jiffies);
290
281 if (time_after_eq(now, word->timeout)) 291 if (time_after_eq(now, word->timeout))
282 return -EAGAIN; 292 return -EAGAIN;
283 io_schedule_timeout(word->timeout - now); 293 io_schedule_timeout(word->timeout - now);
284 if (signal_pending_state(mode, current)) 294 if (signal_pending_state(mode, current))
285 return -EINTR; 295 return -EINTR;
296
286 return 0; 297 return 0;
287} 298}
288EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 299EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
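
The bit_wait_timeout() and bit_wait_io_timeout() bodies above follow one pattern: snapshot jiffies once, return -EAGAIN if the deadline has already passed (using the wrap-safe time_after_eq() comparison), otherwise sleep only for the remaining ticks. A standalone sketch of that arithmetic, with a hand-rolled stand-in for time_after_eq():

/*
 * Standalone sketch (not kernel code) of the timeout handling pattern in
 * bit_wait_timeout() above: take one snapshot of the clock, bail out with
 * -EAGAIN if the deadline already passed, otherwise sleep only for the
 * remaining time.  time_after_eq() is modelled with wrap-safe signed math.
 */
#include <errno.h>
#include <stdio.h>

typedef unsigned long jiffies_t;

static int time_after_eq_sketch(jiffies_t a, jiffies_t b)
{
	return (long)(a - b) >= 0;	/* wrap-safe, like the kernel macro */
}

static int wait_with_deadline(jiffies_t now, jiffies_t timeout)
{
	if (time_after_eq_sketch(now, timeout))
		return -EAGAIN;		/* deadline already passed */
	printf("would sleep for %lu ticks\n", timeout - now);
	return 0;
}

int main(void)
{
	printf("%d\n", wait_with_deadline(100, 150));	/* sleeps 50 ticks */
	printf("%d\n", wait_with_deadline(200, 150));	/* -EAGAIN */
	return 0;
}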
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ccd3782da0bf..5d4a0342f934 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -463,11 +463,18 @@ static int __init setup_tick_nohz(char *str)
463 463
464__setup("nohz=", setup_tick_nohz); 464__setup("nohz=", setup_tick_nohz);
465 465
466int tick_nohz_tick_stopped(void) 466bool tick_nohz_tick_stopped(void)
467{ 467{
468 return __this_cpu_read(tick_cpu_sched.tick_stopped); 468 return __this_cpu_read(tick_cpu_sched.tick_stopped);
469} 469}
470 470
471bool tick_nohz_tick_stopped_cpu(int cpu)
472{
473 struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
474
475 return ts->tick_stopped;
476}
477
471/** 478/**
472 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 479 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
473 * 480 *
@@ -723,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
723 delta = KTIME_MAX; 730 delta = KTIME_MAX;
724 } 731 }
725 732
726#ifdef CONFIG_NO_HZ_FULL
727 /* Limit the tick delta to the maximum scheduler deferment */
728 if (!ts->inidle)
729 delta = min(delta, scheduler_tick_max_deferment());
730#endif
731
732 /* Calculate the next expiry time */ 733 /* Calculate the next expiry time */
733 if (delta < (KTIME_MAX - basemono)) 734 if (delta < (KTIME_MAX - basemono))
734 expires = basemono + delta; 735 expires = basemono + delta;
@@ -935,13 +936,6 @@ void tick_nohz_idle_enter(void)
935 struct tick_sched *ts; 936 struct tick_sched *ts;
936 937
937 lockdep_assert_irqs_enabled(); 938 lockdep_assert_irqs_enabled();
938 /*
939 * Update the idle state in the scheduler domain hierarchy
940 * when tick_nohz_stop_sched_tick() is called from the idle loop.
941 * State will be updated to busy during the first busy tick after
942 * exiting idle.
943 */
944 set_cpu_sd_state_idle();
945 939
946 local_irq_disable(); 940 local_irq_disable();
947 941
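
tick_nohz_tick_stopped() answers the question for the local CPU via __this_cpu_read(); the new tick_nohz_tick_stopped_cpu() lets any CPU inspect another CPU's tick_cpu_sched state through per_cpu_ptr(). A toy, array-based analogue (all names below are illustrative, not the kernel's per-CPU machinery):

/*
 * Toy analogue (plain array, hypothetical names) of the new helper above:
 * the local query reads this CPU's slot, the new helper reads any CPU's.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH 4

struct tick_state { bool tick_stopped; };

static struct tick_state tick_state[NR_CPUS_SKETCH];

static bool tick_stopped_local(int this_cpu)
{
	return tick_state[this_cpu].tick_stopped;	/* like __this_cpu_read() */
}

static bool tick_stopped_cpu(int cpu)
{
	return tick_state[cpu].tick_stopped;		/* like per_cpu_ptr() */
}

int main(void)
{
	tick_state[2].tick_stopped = true;

	printf("CPU0 local: %d\n", tick_stopped_local(0));
	printf("CPU2 remote: %d\n", tick_stopped_cpu(2));
	return 0;
}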
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6ec6ba65127b..254e636a3d6b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
5573int __init workqueue_init_early(void) 5573int __init workqueue_init_early(void)
5574{ 5574{
5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5576 int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
5576 int i, cpu; 5577 int i, cpu;
5577 5578
5578 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5579 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5579 5580
5580 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 5581 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
5581 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); 5582 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
5582 5583
5583 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5584 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5584 5585
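
The workqueue change widens the housekeeping query from HK_FLAG_DOMAIN alone to HK_FLAG_DOMAIN | HK_FLAG_WQ, so the unbound-workqueue mask also excludes CPUs isolated from workqueue processing. A simplified standalone model of that flag combination (the kernel keeps a single global housekeeping mask; the masks and flag names below are invented for illustration):

/*
 * Simplified model (invented names and masks) of the housekeeping lookup
 * used above: if any requested facility is isolated, only housekeeping
 * CPUs are handed back for unbound workqueue placement.
 */
#include <stdio.h>

#define HK_DOMAIN_SKETCH (1u << 0)
#define HK_WQ_SKETCH     (1u << 1)

/* Pretend the boot parameters isolated CPUs 2-3 from domains and wq work. */
static unsigned int  housekeeping_flags = HK_DOMAIN_SKETCH | HK_WQ_SKETCH;
static unsigned long housekeeping_mask  = 0x3;	/* CPUs 0-1 keep housekeeping */
static unsigned long possible_mask      = 0xf;	/* CPUs 0-3 exist */

static unsigned long housekeeping_cpumask_sketch(unsigned int flags)
{
	/* Any requested facility isolated? Then only housekeeping CPUs qualify. */
	if (housekeeping_flags & flags)
		return housekeeping_mask;
	return possible_mask;
}

int main(void)
{
	unsigned int hk_flags = HK_DOMAIN_SKETCH | HK_WQ_SKETCH;

	printf("wq_unbound_cpumask = 0x%lx\n",
	       housekeeping_cpumask_sketch(hk_flags));
	return 0;
}

Roughly, requesting both flags keeps unbound work on housekeeping CPUs whenever either facility was isolated on the command line, which is the effect the one-line hunk above is after.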