author     Linus Torvalds <torvalds@linux-foundation.org>  2018-04-02 14:49:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-04-02 14:49:41 -0400
commit     46e0d28bdb8e6d00e27a0fe9e1d15df6098f0ffb (patch)
tree       d5cb66fbd85b3d5c3220aacd2d9a60f9a515903a
parent     86bbbebac1933e6e95e8234c4f7d220c5ddd38bc (diff)
parent     b720342849fe685310fca01748a32730a6eca5aa (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main scheduler changes in this cycle were:
- NUMA balancing improvements (Mel Gorman)
- Further load tracking improvements (Patrick Bellasi)
- Various NOHZ balancing cleanups and optimizations (Peter Zijlstra)
- Improve blocked load handling, in particular we can now reduce and
eventually stop periodic load updates on 'very idle' CPUs. (Vincent
Guittot)
- On isolated CPUs offload the final 1Hz scheduler tick as well, plus
related cleanups and reorganization. (Frederic Weisbecker)
- Core scheduler code cleanups (Ingo Molnar)"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
sched/core: Update preempt_notifier_key to modern API
sched/cpufreq: Rate limits for SCHED_DEADLINE
sched/fair: Update util_est only on util_avg updates
sched/cpufreq/schedutil: Use util_est for OPP selection
sched/fair: Use util_est in LB and WU paths
sched/fair: Add util_est on top of PELT
sched/core: Remove TASK_ALL
sched/completions: Use bool in try_wait_for_completion()
sched/fair: Update blocked load when newly idle
sched/fair: Move idle_balance()
sched/nohz: Merge CONFIG_NO_HZ_COMMON blocks
sched/fair: Move rebalance_domains()
sched/nohz: Optimize nohz_idle_balance()
sched/fair: Reduce the periodic update duration
sched/nohz: Stop NOHZ stats when decayed
sched/cpufreq: Provide migration hint
sched/nohz: Clean up nohz enter/exit
sched/fair: Update blocked load from NEWIDLE
sched/fair: Add NOHZ stats balancing
sched/fair: Restructure nohz_balance_kick()
...
41 files changed, 2082 insertions, 1529 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1d1d53f85ddd..50b9837e985b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1766,6 +1766,17 @@
1766 | 1766 | ||
1767 | nohz | 1767 | nohz |
1768 | Disable the tick when a single task runs. | 1768 | Disable the tick when a single task runs. |
1769 | |||
1770 | A residual 1Hz tick is offloaded to workqueues, which you | ||
1771 | need to affine to housekeeping through the global | ||
1772 | workqueue's affinity configured via the | ||
1773 | /sys/devices/virtual/workqueue/cpumask sysfs file, or | ||
1774 | by using the 'domain' flag described below. | ||
1775 | |||
1776 | NOTE: by default the global workqueue runs on all CPUs, | ||
1777 | so to protect individual CPUs the 'cpumask' file has to | ||
1778 | be configured manually after bootup. | ||
1779 | |||
1769 | domain | 1780 | domain |
1770 | Isolate from the general SMP balancing and scheduling | 1781 | Isolate from the general SMP balancing and scheduling |
1771 | algorithms. Note that performing domain isolation this way | 1782 | algorithms. Note that performing domain isolation this way |
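The new nohz text above relies on the administrator re-affining the global (unbound) workqueue to housekeeping CPUs after boot, via /sys/devices/virtual/workqueue/cpumask. As a rough illustration of that step, a small userspace helper along these lines could write the mask; the value "3" (CPUs 0-1) is only an example and has to match whatever CPUs were left out of nohz_full=:

/*
 * Illustrative userspace helper: restrict unbound workqueues to
 * housekeeping CPUs, as described in the documentation above.
 * The mask "3" (CPUs 0-1) is an example value only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/devices/virtual/workqueue/cpumask";
	const char *mask = "3\n";	/* example: CPUs 0-1 stay housekeeping */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, mask, strlen(mask)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}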
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b161ef8a902e..f228c6033832 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -93,7 +93,6 @@ struct task_group;
93 | 93 | ||
94 | /* Convenience macros for the sake of wake_up(): */ | 94 | /* Convenience macros for the sake of wake_up(): */ |
95 | #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) | 95 | #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) |
96 | #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) | ||
97 | 96 | ||
98 | /* get_task_state(): */ | 97 | /* get_task_state(): */ |
99 | #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ | 98 | #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ |
@@ -275,6 +274,34 @@ struct load_weight { | |||
275 | u32 inv_weight; | 274 | u32 inv_weight; |
276 | }; | 275 | }; |
277 | 276 | ||
277 | /** | ||
278 | * struct util_est - Estimation utilization of FAIR tasks | ||
279 | * @enqueued: instantaneous estimated utilization of a task/cpu | ||
280 | * @ewma: the Exponential Weighted Moving Average (EWMA) | ||
281 | * utilization of a task | ||
282 | * | ||
283 | * Support data structure to track an Exponential Weighted Moving Average | ||
284 | * (EWMA) of a FAIR task's utilization. New samples are added to the moving | ||
285 | * average each time a task completes an activation. Sample's weight is chosen | ||
286 | * so that the EWMA will be relatively insensitive to transient changes to the | ||
287 | * task's workload. | ||
288 | * | ||
289 | * The enqueued attribute has a slightly different meaning for tasks and cpus: | ||
290 | * - task: the task's util_avg at last task dequeue time | ||
291 | * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU | ||
292 | * Thus, the util_est.enqueued of a task represents the contribution on the | ||
293 | * estimated utilization of the CPU where that task is currently enqueued. | ||
294 | * | ||
295 | * Only for tasks we track a moving average of the past instantaneous | ||
296 | * estimated utilization. This allows to absorb sporadic drops in utilization | ||
297 | * of an otherwise almost periodic task. | ||
298 | */ | ||
299 | struct util_est { | ||
300 | unsigned int enqueued; | ||
301 | unsigned int ewma; | ||
302 | #define UTIL_EST_WEIGHT_SHIFT 2 | ||
303 | }; | ||
304 | |||
278 | /* | 305 | /* |
279 | * The load_avg/util_avg accumulates an infinite geometric series | 306 | * The load_avg/util_avg accumulates an infinite geometric series |
280 | * (see __update_load_avg() in kernel/sched/fair.c). | 307 | * (see __update_load_avg() in kernel/sched/fair.c). |
@@ -336,6 +363,7 @@ struct sched_avg { | |||
336 | unsigned long load_avg; | 363 | unsigned long load_avg; |
337 | unsigned long runnable_load_avg; | 364 | unsigned long runnable_load_avg; |
338 | unsigned long util_avg; | 365 | unsigned long util_avg; |
366 | struct util_est util_est; | ||
339 | }; | 367 | }; |
340 | 368 | ||
341 | struct sched_statistics { | 369 | struct sched_statistics { |
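The util_est comment above describes folding a new utilization sample into an EWMA with weight 1/2^UTIL_EST_WEIGHT_SHIFT, i.e. 1/4. A minimal arithmetic sketch of such an update (it mirrors the idea only, not the exact helper this series adds to kernel/sched/fair.c):

/*
 * Sketch: blend a new sample into the moving average with weight 1/4,
 * so the estimate is largely insensitive to transient dips, as the
 * struct util_est comment above describes.
 */
#define UTIL_EST_WEIGHT_SHIFT	2

static unsigned int ewma_add_sample(unsigned int ewma, unsigned int sample)
{
	/* new = old - old/4 + sample/4, i.e. roughly old*3/4 + sample/4 */
	return ewma - (ewma >> UTIL_EST_WEIGHT_SHIFT)
		    + (sample >> UTIL_EST_WEIGHT_SHIFT);
}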
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 0b55834efd46..59667444669f 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -8,9 +8,8 @@
8 | * Interface between cpufreq drivers and the scheduler: | 8 | * Interface between cpufreq drivers and the scheduler: |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #define SCHED_CPUFREQ_RT (1U << 0) | 11 | #define SCHED_CPUFREQ_IOWAIT (1U << 0) |
12 | #define SCHED_CPUFREQ_DL (1U << 1) | 12 | #define SCHED_CPUFREQ_MIGRATION (1U << 1) |
13 | #define SCHED_CPUFREQ_IOWAIT (1U << 2) | ||
14 | 13 | ||
15 | #ifdef CONFIG_CPU_FREQ | 14 | #ifdef CONFIG_CPU_FREQ |
16 | struct update_util_data { | 15 | struct update_util_data { |
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index a5bc8728ead7..0cb034331cbb 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,8 +1,4 @@
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_SCHED_DEADLINE_H | ||
3 | #define _LINUX_SCHED_DEADLINE_H | ||
4 | |||
5 | #include <linux/sched.h> | ||
6 | 2 | ||
7 | /* | 3 | /* |
8 | * SCHED_DEADLINE tasks has negative priorities, reflecting | 4 | * SCHED_DEADLINE tasks has negative priorities, reflecting |
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b) | |||
28 | { | 24 | { |
29 | return (s64)(a - b) < 0; | 25 | return (s64)(a - b) < 0; |
30 | } | 26 | } |
31 | |||
32 | #endif /* _LINUX_SCHED_DEADLINE_H */ | ||
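dl_time_before() above orders two u64 deadlines through a signed difference, which stays correct even if the underlying counter wraps. A small self-contained illustration of the same trick, with made-up values:

#include <assert.h>
#include <stdint.h>

/* Same trick as dl_time_before(): a signed difference handles wraparound. */
static int time_before64(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;	/* example value just before wraparound */

	assert(time_before64(100, 200));
	/* After the counter wraps, 3 is "later" than UINT64_MAX - 5: */
	assert(time_before64(near_wrap, 3));
	/* A plain "<" comparison would get this case backwards: */
	assert(!(near_wrap < 3));
	return 0;
}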
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431c8060..4a6582c27dea 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags {
12 | HK_FLAG_SCHED = (1 << 3), | 12 | HK_FLAG_SCHED = (1 << 3), |
13 | HK_FLAG_TICK = (1 << 4), | 13 | HK_FLAG_TICK = (1 << 4), |
14 | HK_FLAG_DOMAIN = (1 << 5), | 14 | HK_FLAG_DOMAIN = (1 << 5), |
15 | HK_FLAG_WQ = (1 << 6), | ||
15 | }; | 16 | }; |
16 | 17 | ||
17 | #ifdef CONFIG_CPU_ISOLATION | 18 | #ifdef CONFIG_CPU_ISOLATION |
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 3d3a97d9399d..b36f4cf38111 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -16,11 +16,9 @@ static inline void cpu_load_update_nohz_stop(void) { }
16 | 16 | ||
17 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 17 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) |
18 | extern void nohz_balance_enter_idle(int cpu); | 18 | extern void nohz_balance_enter_idle(int cpu); |
19 | extern void set_cpu_sd_state_idle(void); | ||
20 | extern int get_nohz_timer_target(void); | 19 | extern int get_nohz_timer_target(void); |
21 | #else | 20 | #else |
22 | static inline void nohz_balance_enter_idle(int cpu) { } | 21 | static inline void nohz_balance_enter_idle(int cpu) { } |
23 | static inline void set_cpu_sd_state_idle(void) { } | ||
24 | #endif | 22 | #endif |
25 | 23 | ||
26 | #ifdef CONFIG_NO_HZ_COMMON | 24 | #ifdef CONFIG_NO_HZ_COMMON |
@@ -37,8 +35,4 @@ extern void wake_up_nohz_cpu(int cpu); | |||
37 | static inline void wake_up_nohz_cpu(int cpu) { } | 35 | static inline void wake_up_nohz_cpu(int cpu) { } |
38 | #endif | 36 | #endif |
39 | 37 | ||
40 | #ifdef CONFIG_NO_HZ_FULL | ||
41 | extern u64 scheduler_tick_max_deferment(void); | ||
42 | #endif | ||
43 | |||
44 | #endif /* _LINUX_SCHED_NOHZ_H */ | 38 | #endif /* _LINUX_SCHED_NOHZ_H */ |
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7cc35921218e..7f8c9a127f5a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -113,7 +113,8 @@ enum tick_dep_bits {
113 | 113 | ||
114 | #ifdef CONFIG_NO_HZ_COMMON | 114 | #ifdef CONFIG_NO_HZ_COMMON |
115 | extern bool tick_nohz_enabled; | 115 | extern bool tick_nohz_enabled; |
116 | extern int tick_nohz_tick_stopped(void); | 116 | extern bool tick_nohz_tick_stopped(void); |
117 | extern bool tick_nohz_tick_stopped_cpu(int cpu); | ||
117 | extern void tick_nohz_idle_enter(void); | 118 | extern void tick_nohz_idle_enter(void); |
118 | extern void tick_nohz_idle_exit(void); | 119 | extern void tick_nohz_idle_exit(void); |
119 | extern void tick_nohz_irq_exit(void); | 120 | extern void tick_nohz_irq_exit(void); |
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); | |||
125 | #else /* !CONFIG_NO_HZ_COMMON */ | 126 | #else /* !CONFIG_NO_HZ_COMMON */ |
126 | #define tick_nohz_enabled (0) | 127 | #define tick_nohz_enabled (0) |
127 | static inline int tick_nohz_tick_stopped(void) { return 0; } | 128 | static inline int tick_nohz_tick_stopped(void) { return 0; } |
129 | static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } | ||
128 | static inline void tick_nohz_idle_enter(void) { } | 130 | static inline void tick_nohz_idle_enter(void) { } |
129 | static inline void tick_nohz_idle_exit(void) { } | 131 | static inline void tick_nohz_idle_exit(void) { } |
130 | 132 | ||
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
17 | endif | 17 | endif |
18 | 18 | ||
19 | obj-y += core.o loadavg.o clock.o cputime.o | 19 | obj-y += core.o loadavg.o clock.o cputime.o |
20 | obj-y += idle_task.o fair.o rt.o deadline.o | 20 | obj-y += idle.o fair.o rt.o deadline.o |
21 | obj-y += wait.o wait_bit.o swait.o completion.o idle.o | 21 | obj-y += wait.o wait_bit.o swait.o completion.o |
22 | |||
22 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o | 23 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o |
23 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | 24 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
24 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 25 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/proc_fs.h> | 2 | /* |
3 | #include <linux/seq_file.h> | 3 | * Auto-group scheduling implementation: |
4 | #include <linux/utsname.h> | 4 | */ |
5 | #include <linux/security.h> | ||
6 | #include <linux/export.h> | ||
7 | |||
8 | #include "sched.h" | 5 | #include "sched.h" |
9 | 6 | ||
10 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 7 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
168 | autogroup_kref_put(prev); | 165 | autogroup_kref_put(prev); |
169 | } | 166 | } |
170 | 167 | ||
171 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | 168 | /* Allocates GFP_KERNEL, cannot be called under any spinlock: */ |
172 | void sched_autogroup_create_attach(struct task_struct *p) | 169 | void sched_autogroup_create_attach(struct task_struct *p) |
173 | { | 170 | { |
174 | struct autogroup *ag = autogroup_create(); | 171 | struct autogroup *ag = autogroup_create(); |
175 | 172 | ||
176 | autogroup_move_group(p, ag); | 173 | autogroup_move_group(p, ag); |
177 | /* drop extra reference added by autogroup_create() */ | 174 | |
175 | /* Drop extra reference added by autogroup_create(): */ | ||
178 | autogroup_kref_put(ag); | 176 | autogroup_kref_put(ag); |
179 | } | 177 | } |
180 | EXPORT_SYMBOL(sched_autogroup_create_attach); | 178 | EXPORT_SYMBOL(sched_autogroup_create_attach); |
181 | 179 | ||
182 | /* Cannot be called under siglock. Currently has no users */ | 180 | /* Cannot be called under siglock. Currently has no users: */ |
183 | void sched_autogroup_detach(struct task_struct *p) | 181 | void sched_autogroup_detach(struct task_struct *p) |
184 | { | 182 | { |
185 | autogroup_move_group(p, &autogroup_default); | 183 | autogroup_move_group(p, &autogroup_default); |
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str) | |||
202 | 200 | ||
203 | return 1; | 201 | return 1; |
204 | } | 202 | } |
205 | |||
206 | __setup("noautogroup", setup_autogroup); | 203 | __setup("noautogroup", setup_autogroup); |
207 | 204 | ||
208 | #ifdef CONFIG_PROC_FS | 205 | #ifdef CONFIG_PROC_FS |
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) | |||
224 | if (nice < 0 && !can_nice(current, nice)) | 221 | if (nice < 0 && !can_nice(current, nice)) |
225 | return -EPERM; | 222 | return -EPERM; |
226 | 223 | ||
227 | /* this is a heavy operation taking global locks.. */ | 224 | /* This is a heavy operation, taking global locks.. */ |
228 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | 225 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) |
229 | return -EAGAIN; | 226 | return -EAGAIN; |
230 | 227 | ||
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
267 | 264 | ||
268 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 265 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
269 | } | 266 | } |
270 | #endif /* CONFIG_SCHED_DEBUG */ | 267 | #endif |
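The hunk above keeps the jiffies-based throttle in proc_sched_autogroup_set_nice(): unprivileged callers are bounced with -EAGAIN if they retry too soon. A stripped-down sketch of that pattern, with illustrative names and an example period:

/*
 * Sketch of the jiffies-based rate limit seen above: unprivileged
 * callers may only go through once per period. "example_*" names and
 * the 100ms period are illustrative, not taken from the patch.
 */
static unsigned long example_next_allowed;	/* jiffies timestamp */

static int example_rate_limited_op(void)
{
	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, example_next_allowed))
		return -EAGAIN;

	example_next_allowed = jiffies + HZ / 10;	/* example period: ~100ms */

	/* ... do the heavy, lock-taking work here ... */
	return 0;
}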
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifdef CONFIG_SCHED_AUTOGROUP | 2 | #ifdef CONFIG_SCHED_AUTOGROUP |
3 | 3 | ||
4 | #include <linux/kref.h> | ||
5 | #include <linux/rwsem.h> | ||
6 | #include <linux/sched/autogroup.h> | ||
7 | |||
8 | struct autogroup { | 4 | struct autogroup { |
9 | /* | 5 | /* |
10 | * reference doesn't mean how many thread attach to this | 6 | * Reference doesn't mean how many threads attach to this |
11 | * autogroup now. It just stands for the number of task | 7 | * autogroup now. It just stands for the number of tasks |
12 | * could use this autogroup. | 8 | * which could use this autogroup. |
13 | */ | 9 | */ |
14 | struct kref kref; | 10 | struct kref kref; |
15 | struct task_group *tg; | 11 | struct task_group *tg; |
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) | |||
56 | return tg; | 52 | return tg; |
57 | } | 53 | } |
58 | 54 | ||
59 | #ifdef CONFIG_SCHED_DEBUG | ||
60 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 55 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
61 | { | 56 | { |
62 | return 0; | 57 | return 0; |
63 | } | 58 | } |
64 | #endif | ||
65 | 59 | ||
66 | #endif /* CONFIG_SCHED_AUTOGROUP */ | 60 | #endif /* CONFIG_SCHED_AUTOGROUP */ |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
1 | /* | 1 | /* |
2 | * sched_clock for unstable cpu clocks | 2 | * sched_clock() for unstable CPU clocks |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra | 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra |
5 | * | 5 | * |
@@ -11,7 +11,7 @@ | |||
11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
12 | * | 12 | * |
13 | * | 13 | * |
14 | * What: | 14 | * What this file implements: |
15 | * | 15 | * |
16 | * cpu_clock(i) provides a fast (execution time) high resolution | 16 | * cpu_clock(i) provides a fast (execution time) high resolution |
17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | 17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) |
@@ -26,11 +26,11 @@ | |||
26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
27 | * | 27 | * |
28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
29 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current CPU. |
30 | * | 30 | * |
31 | * sched_clock_cpu(i) | 31 | * sched_clock_cpu(i) |
32 | * | 32 | * |
33 | * How: | 33 | * How it is implemented: |
34 | * | 34 | * |
35 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | 36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the |
@@ -52,19 +52,7 @@ | |||
52 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
53 | * | 53 | * |
54 | */ | 54 | */ |
55 | #include <linux/spinlock.h> | 55 | #include "sched.h" |
56 | #include <linux/hardirq.h> | ||
57 | #include <linux/export.h> | ||
58 | #include <linux/percpu.h> | ||
59 | #include <linux/ktime.h> | ||
60 | #include <linux/sched.h> | ||
61 | #include <linux/nmi.h> | ||
62 | #include <linux/sched/clock.h> | ||
63 | #include <linux/static_key.h> | ||
64 | #include <linux/workqueue.h> | ||
65 | #include <linux/compiler.h> | ||
66 | #include <linux/tick.h> | ||
67 | #include <linux/init.h> | ||
68 | 56 | ||
69 | /* | 57 | /* |
70 | * Scheduler clock - returns current time in nanosec units. | 58 | * Scheduler clock - returns current time in nanosec units. |
@@ -302,21 +290,21 @@ again: | |||
302 | * cmpxchg64 below only protects one readout. | 290 | * cmpxchg64 below only protects one readout. |
303 | * | 291 | * |
304 | * We must reread via sched_clock_local() in the retry case on | 292 | * We must reread via sched_clock_local() in the retry case on |
305 | * 32bit as an NMI could use sched_clock_local() via the | 293 | * 32-bit kernels as an NMI could use sched_clock_local() via the |
306 | * tracer and hit between the readout of | 294 | * tracer and hit between the readout of |
307 | * the low32bit and the high 32bit portion. | 295 | * the low 32-bit and the high 32-bit portion. |
308 | */ | 296 | */ |
309 | this_clock = sched_clock_local(my_scd); | 297 | this_clock = sched_clock_local(my_scd); |
310 | /* | 298 | /* |
311 | * We must enforce atomic readout on 32bit, otherwise the | 299 | * We must enforce atomic readout on 32-bit, otherwise the |
312 | * update on the remote cpu can hit inbetween the readout of | 300 | * update on the remote CPU can hit inbetween the readout of |
313 | * the low32bit and the high 32bit portion. | 301 | * the low 32-bit and the high 32-bit portion. |
314 | */ | 302 | */ |
315 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | 303 | remote_clock = cmpxchg64(&scd->clock, 0, 0); |
316 | #else | 304 | #else |
317 | /* | 305 | /* |
318 | * On 64bit the read of [my]scd->clock is atomic versus the | 306 | * On 64-bit kernels the read of [my]scd->clock is atomic versus the |
319 | * update, so we can avoid the above 32bit dance. | 307 | * update, so we can avoid the above 32-bit dance. |
320 | */ | 308 | */ |
321 | sched_clock_local(my_scd); | 309 | sched_clock_local(my_scd); |
322 | again: | 310 | again: |
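The comments above explain why a remote scd->clock cannot simply be loaded on 32-bit kernels and why cmpxchg64(&scd->clock, 0, 0) is used as an atomic read. A brief kernel-style sketch of that idiom (illustrative helper, not part of the patch):

/*
 * Sketch of the 32-bit readout idiom used above: cmpxchg64(ptr, 0, 0)
 * only stores when the value is already 0 (a no-op), but it always
 * returns the old value atomically -- so it doubles as an atomic
 * 64-bit load on architectures where a plain u64 load could tear.
 */
static u64 example_read_clock(u64 *clock)
{
#if BITS_PER_LONG == 32
	return cmpxchg64(clock, 0, 0);	/* atomic read; never changes the value */
#else
	return READ_ONCE(*clock);	/* 64-bit loads are already atomic here */
#endif
}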
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..e426b0cb9ac6 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
11 | * typically be used for exclusion which gives rise to priority inversion. | 11 | * typically be used for exclusion which gives rise to priority inversion. |
12 | * Waiting for completion is a typically sync point, but not an exclusion point. | 12 | * Waiting for completion is a typically sync point, but not an exclusion point. |
13 | */ | 13 | */ |
14 | 14 | #include "sched.h" | |
15 | #include <linux/sched/signal.h> | ||
16 | #include <linux/sched/debug.h> | ||
17 | #include <linux/completion.h> | ||
18 | 15 | ||
19 | /** | 16 | /** |
20 | * complete: - signals a single thread waiting on this completion | 17 | * complete: - signals a single thread waiting on this completion |
@@ -283,7 +280,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); | |||
283 | bool try_wait_for_completion(struct completion *x) | 280 | bool try_wait_for_completion(struct completion *x) |
284 | { | 281 | { |
285 | unsigned long flags; | 282 | unsigned long flags; |
286 | int ret = 1; | 283 | bool ret = true; |
287 | 284 | ||
288 | /* | 285 | /* |
289 | * Since x->done will need to be locked only | 286 | * Since x->done will need to be locked only |
@@ -292,11 +289,11 @@ bool try_wait_for_completion(struct completion *x) | |||
292 | * return early in the blocking case. | 289 | * return early in the blocking case. |
293 | */ | 290 | */ |
294 | if (!READ_ONCE(x->done)) | 291 | if (!READ_ONCE(x->done)) |
295 | return 0; | 292 | return false; |
296 | 293 | ||
297 | spin_lock_irqsave(&x->wait.lock, flags); | 294 | spin_lock_irqsave(&x->wait.lock, flags); |
298 | if (!x->done) | 295 | if (!x->done) |
299 | ret = 0; | 296 | ret = false; |
300 | else if (x->done != UINT_MAX) | 297 | else if (x->done != UINT_MAX) |
301 | x->done--; | 298 | x->done--; |
302 | spin_unlock_irqrestore(&x->wait.lock, flags); | 299 | spin_unlock_irqrestore(&x->wait.lock, flags); |
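try_wait_for_completion() above is the non-blocking side of the completion API and now reports its result as bool. A short usage sketch with illustrative names, assuming a producer/consumer split:

/*
 * Sketch of how the completion API above is typically used: one side
 * signals with complete(), the other either blocks or, as here, polls
 * with the non-blocking try_wait_for_completion(). Illustrative only.
 */
static DECLARE_COMPLETION(example_done);

static void example_producer(void)
{
	/* ... finish the work ... */
	complete(&example_done);
}

static bool example_poll(void)
{
	/* True only if the completion has already been signalled. */
	return try_wait_for_completion(&example_done);
}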
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c94895bc5a2c..de440456f15c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | */ | 7 | */ |
8 | #include <linux/sched.h> | 8 | #include "sched.h" |
9 | #include <linux/sched/clock.h> | ||
10 | #include <uapi/linux/sched/types.h> | ||
11 | #include <linux/sched/loadavg.h> | ||
12 | #include <linux/sched/hotplug.h> | ||
13 | #include <linux/wait_bit.h> | ||
14 | #include <linux/cpuset.h> | ||
15 | #include <linux/delayacct.h> | ||
16 | #include <linux/init_task.h> | ||
17 | #include <linux/context_tracking.h> | ||
18 | #include <linux/rcupdate_wait.h> | ||
19 | #include <linux/compat.h> | ||
20 | |||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/mmu_context.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/nmi.h> | ||
26 | #include <linux/prefetch.h> | ||
27 | #include <linux/profile.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/syscalls.h> | ||
30 | #include <linux/sched/isolation.h> | ||
31 | 9 | ||
32 | #include <asm/switch_to.h> | 10 | #include <asm/switch_to.h> |
33 | #include <asm/tlb.h> | 11 | #include <asm/tlb.h> |
34 | #ifdef CONFIG_PARAVIRT | ||
35 | #include <asm/paravirt.h> | ||
36 | #endif | ||
37 | 12 | ||
38 | #include "sched.h" | ||
39 | #include "../workqueue_internal.h" | 13 | #include "../workqueue_internal.h" |
40 | #include "../smpboot.h" | 14 | #include "../smpboot.h" |
41 | 15 | ||
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
135 | * [L] ->on_rq | 109 | * [L] ->on_rq |
136 | * RELEASE (rq->lock) | 110 | * RELEASE (rq->lock) |
137 | * | 111 | * |
138 | * If we observe the old cpu in task_rq_lock, the acquire of | 112 | * If we observe the old CPU in task_rq_lock, the acquire of |
139 | * the old rq->lock will fully serialize against the stores. | 113 | * the old rq->lock will fully serialize against the stores. |
140 | * | 114 | * |
141 | * If we observe the new CPU in task_rq_lock, the acquire will | 115 | * If we observe the new CPU in task_rq_lock, the acquire will |
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) | |||
333 | } | 307 | } |
334 | #endif /* CONFIG_SMP */ | 308 | #endif /* CONFIG_SMP */ |
335 | 309 | ||
336 | static void init_rq_hrtick(struct rq *rq) | 310 | static void hrtick_rq_init(struct rq *rq) |
337 | { | 311 | { |
338 | #ifdef CONFIG_SMP | 312 | #ifdef CONFIG_SMP |
339 | rq->hrtick_csd_pending = 0; | 313 | rq->hrtick_csd_pending = 0; |
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) | |||
351 | { | 325 | { |
352 | } | 326 | } |
353 | 327 | ||
354 | static inline void init_rq_hrtick(struct rq *rq) | 328 | static inline void hrtick_rq_init(struct rq *rq) |
355 | { | 329 | { |
356 | } | 330 | } |
357 | #endif /* CONFIG_SCHED_HRTICK */ | 331 | #endif /* CONFIG_SCHED_HRTICK */ |
@@ -609,7 +583,7 @@ static inline bool got_nohz_idle_kick(void) | |||
609 | { | 583 | { |
610 | int cpu = smp_processor_id(); | 584 | int cpu = smp_processor_id(); |
611 | 585 | ||
612 | if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) | 586 | if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) |
613 | return false; | 587 | return false; |
614 | 588 | ||
615 | if (idle_cpu(cpu) && !need_resched()) | 589 | if (idle_cpu(cpu) && !need_resched()) |
@@ -619,7 +593,7 @@ static inline bool got_nohz_idle_kick(void) | |||
619 | * We can't run Idle Load Balance on this CPU for this time so we | 593 | * We can't run Idle Load Balance on this CPU for this time so we |
620 | * cancel it and clear NOHZ_BALANCE_KICK | 594 | * cancel it and clear NOHZ_BALANCE_KICK |
621 | */ | 595 | */ |
622 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | 596 | atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); |
623 | return false; | 597 | return false; |
624 | } | 598 | } |
625 | 599 | ||
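The hunk above replaces test_bit()/clear_bit() on nohz_flags() with plain atomic ops on an atomic_t and a NOHZ_KICK_MASK. A minimal sketch of that flag-word pattern, with illustrative names:

/*
 * Sketch of the atomic flag-word pattern the hunk above switches to:
 * flags live in an atomic_t and are tested and cleared with atomic_read()
 * and atomic_andnot() instead of the bitmap helpers. Names are illustrative.
 */
#define EXAMPLE_KICK		0x1
#define EXAMPLE_STATS		0x2
#define EXAMPLE_KICK_MASK	(EXAMPLE_KICK | EXAMPLE_STATS)

static atomic_t example_flags = ATOMIC_INIT(0);

static bool example_wants_kick(void)
{
	if (!(atomic_read(&example_flags) & EXAMPLE_KICK_MASK))
		return false;

	/* Consume the request: clear all kick bits in one atomic step. */
	atomic_andnot(EXAMPLE_KICK_MASK, &example_flags);
	return true;
}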
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1457 | * | 1431 | * |
1458 | * - cpu_active must be a subset of cpu_online | 1432 | * - cpu_active must be a subset of cpu_online |
1459 | * | 1433 | * |
1460 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1434 | * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, |
1461 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1435 | * see __set_cpus_allowed_ptr(). At this point the newly online |
1462 | * CPU isn't yet part of the sched domains, and balancing will not | 1436 | * CPU isn't yet part of the sched domains, and balancing will not |
1463 | * see it. | 1437 | * see it. |
@@ -2488,17 +2462,17 @@ void wake_up_new_task(struct task_struct *p) | |||
2488 | 2462 | ||
2489 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2463 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2490 | 2464 | ||
2491 | static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; | 2465 | static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); |
2492 | 2466 | ||
2493 | void preempt_notifier_inc(void) | 2467 | void preempt_notifier_inc(void) |
2494 | { | 2468 | { |
2495 | static_key_slow_inc(&preempt_notifier_key); | 2469 | static_branch_inc(&preempt_notifier_key); |
2496 | } | 2470 | } |
2497 | EXPORT_SYMBOL_GPL(preempt_notifier_inc); | 2471 | EXPORT_SYMBOL_GPL(preempt_notifier_inc); |
2498 | 2472 | ||
2499 | void preempt_notifier_dec(void) | 2473 | void preempt_notifier_dec(void) |
2500 | { | 2474 | { |
2501 | static_key_slow_dec(&preempt_notifier_key); | 2475 | static_branch_dec(&preempt_notifier_key); |
2502 | } | 2476 | } |
2503 | EXPORT_SYMBOL_GPL(preempt_notifier_dec); | 2477 | EXPORT_SYMBOL_GPL(preempt_notifier_dec); |
2504 | 2478 | ||
@@ -2508,7 +2482,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_dec); | |||
2508 | */ | 2482 | */ |
2509 | void preempt_notifier_register(struct preempt_notifier *notifier) | 2483 | void preempt_notifier_register(struct preempt_notifier *notifier) |
2510 | { | 2484 | { |
2511 | if (!static_key_false(&preempt_notifier_key)) | 2485 | if (!static_branch_unlikely(&preempt_notifier_key)) |
2512 | WARN(1, "registering preempt_notifier while notifiers disabled\n"); | 2486 | WARN(1, "registering preempt_notifier while notifiers disabled\n"); |
2513 | 2487 | ||
2514 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); | 2488 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); |
@@ -2537,7 +2511,7 @@ static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) | |||
2537 | 2511 | ||
2538 | static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 2512 | static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
2539 | { | 2513 | { |
2540 | if (static_key_false(&preempt_notifier_key)) | 2514 | if (static_branch_unlikely(&preempt_notifier_key)) |
2541 | __fire_sched_in_preempt_notifiers(curr); | 2515 | __fire_sched_in_preempt_notifiers(curr); |
2542 | } | 2516 | } |
2543 | 2517 | ||
@@ -2555,7 +2529,7 @@ static __always_inline void | |||
2555 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 2529 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
2556 | struct task_struct *next) | 2530 | struct task_struct *next) |
2557 | { | 2531 | { |
2558 | if (static_key_false(&preempt_notifier_key)) | 2532 | if (static_branch_unlikely(&preempt_notifier_key)) |
2559 | __fire_sched_out_preempt_notifiers(curr, next); | 2533 | __fire_sched_out_preempt_notifiers(curr, next); |
2560 | } | 2534 | } |
2561 | 2535 | ||
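The hunks above move preempt_notifier_key from the old struct static_key interface to the modern DEFINE_STATIC_KEY_FALSE()/static_branch_*() API. In isolation, that pattern looks roughly like this (names are illustrative, not from the patch):

/*
 * Sketch of the modern static-key pattern used above: the branch is
 * patched out at runtime until someone increments the key, so the
 * common path costs almost nothing. "example_*" names are illustrative.
 */
static DEFINE_STATIC_KEY_FALSE(example_key);

static void example_fast_path(void)
{
	if (static_branch_unlikely(&example_key))
		pr_info("slow path enabled\n");	/* only after static_branch_inc() */
}

static void example_enable(void)
{
	static_branch_inc(&example_key);	/* nestable, like preempt_notifier_inc() */
}

static void example_disable(void)
{
	static_branch_dec(&example_key);
}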
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) | |||
2629 | raw_spin_unlock_irq(&rq->lock); | 2603 | raw_spin_unlock_irq(&rq->lock); |
2630 | } | 2604 | } |
2631 | 2605 | ||
2606 | /* | ||
2607 | * NOP if the arch has not defined these: | ||
2608 | */ | ||
2609 | |||
2610 | #ifndef prepare_arch_switch | ||
2611 | # define prepare_arch_switch(next) do { } while (0) | ||
2612 | #endif | ||
2613 | |||
2614 | #ifndef finish_arch_post_lock_switch | ||
2615 | # define finish_arch_post_lock_switch() do { } while (0) | ||
2616 | #endif | ||
2617 | |||
2632 | /** | 2618 | /** |
2633 | * prepare_task_switch - prepare to switch tasks | 2619 | * prepare_task_switch - prepare to switch tasks |
2634 | * @rq: the runqueue preparing to switch | 2620 | * @rq: the runqueue preparing to switch |
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3037 | 3023 | ||
3038 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) | 3024 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) |
3039 | /* | 3025 | /* |
3040 | * 64-bit doesn't need locks to atomically read a 64bit value. | 3026 | * 64-bit doesn't need locks to atomically read a 64-bit value. |
3041 | * So we have a optimization chance when the task's delta_exec is 0. | 3027 | * So we have a optimization chance when the task's delta_exec is 0. |
3042 | * Reading ->on_cpu is racy, but this is ok. | 3028 | * Reading ->on_cpu is racy, but this is ok. |
3043 | * | 3029 | * |
@@ -3096,35 +3082,99 @@ void scheduler_tick(void) | |||
3096 | rq->idle_balance = idle_cpu(cpu); | 3082 | rq->idle_balance = idle_cpu(cpu); |
3097 | trigger_load_balance(rq); | 3083 | trigger_load_balance(rq); |
3098 | #endif | 3084 | #endif |
3099 | rq_last_tick_reset(rq); | ||
3100 | } | 3085 | } |
3101 | 3086 | ||
3102 | #ifdef CONFIG_NO_HZ_FULL | 3087 | #ifdef CONFIG_NO_HZ_FULL |
3103 | /** | 3088 | |
3104 | * scheduler_tick_max_deferment | 3089 | struct tick_work { |
3105 | * | 3090 | int cpu; |
3106 | * Keep at least one tick per second when a single | 3091 | struct delayed_work work; |
3107 | * active task is running because the scheduler doesn't | 3092 | }; |
3108 | * yet completely support full dynticks environment. | 3093 | |
3109 | * | 3094 | static struct tick_work __percpu *tick_work_cpu; |
3110 | * This makes sure that uptime, CFS vruntime, load | 3095 | |
3111 | * balancing, etc... continue to move forward, even | 3096 | static void sched_tick_remote(struct work_struct *work) |
3112 | * with a very low granularity. | ||
3113 | * | ||
3114 | * Return: Maximum deferment in nanoseconds. | ||
3115 | */ | ||
3116 | u64 scheduler_tick_max_deferment(void) | ||
3117 | { | 3097 | { |
3118 | struct rq *rq = this_rq(); | 3098 | struct delayed_work *dwork = to_delayed_work(work); |
3119 | unsigned long next, now = READ_ONCE(jiffies); | 3099 | struct tick_work *twork = container_of(dwork, struct tick_work, work); |
3100 | int cpu = twork->cpu; | ||
3101 | struct rq *rq = cpu_rq(cpu); | ||
3102 | struct rq_flags rf; | ||
3120 | 3103 | ||
3121 | next = rq->last_sched_tick + HZ; | 3104 | /* |
3105 | * Handle the tick only if it appears the remote CPU is running in full | ||
3106 | * dynticks mode. The check is racy by nature, but missing a tick or | ||
3107 | * having one too much is no big deal because the scheduler tick updates | ||
3108 | * statistics and checks timeslices in a time-independent way, regardless | ||
3109 | * of when exactly it is running. | ||
3110 | */ | ||
3111 | if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { | ||
3112 | struct task_struct *curr; | ||
3113 | u64 delta; | ||
3122 | 3114 | ||
3123 | if (time_before_eq(next, now)) | 3115 | rq_lock_irq(rq, &rf); |
3124 | return 0; | 3116 | update_rq_clock(rq); |
3117 | curr = rq->curr; | ||
3118 | delta = rq_clock_task(rq) - curr->se.exec_start; | ||
3125 | 3119 | ||
3126 | return jiffies_to_nsecs(next - now); | 3120 | /* |
3121 | * Make sure the next tick runs within a reasonable | ||
3122 | * amount of time. | ||
3123 | */ | ||
3124 | WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); | ||
3125 | curr->sched_class->task_tick(rq, curr, 0); | ||
3126 | rq_unlock_irq(rq, &rf); | ||
3127 | } | ||
3128 | |||
3129 | /* | ||
3130 | * Run the remote tick once per second (1Hz). This arbitrary | ||
3131 | * frequency is large enough to avoid overload but short enough | ||
3132 | * to keep scheduler internal stats reasonably up to date. | ||
3133 | */ | ||
3134 | queue_delayed_work(system_unbound_wq, dwork, HZ); | ||
3127 | } | 3135 | } |
3136 | |||
3137 | static void sched_tick_start(int cpu) | ||
3138 | { | ||
3139 | struct tick_work *twork; | ||
3140 | |||
3141 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
3142 | return; | ||
3143 | |||
3144 | WARN_ON_ONCE(!tick_work_cpu); | ||
3145 | |||
3146 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
3147 | twork->cpu = cpu; | ||
3148 | INIT_DELAYED_WORK(&twork->work, sched_tick_remote); | ||
3149 | queue_delayed_work(system_unbound_wq, &twork->work, HZ); | ||
3150 | } | ||
3151 | |||
3152 | #ifdef CONFIG_HOTPLUG_CPU | ||
3153 | static void sched_tick_stop(int cpu) | ||
3154 | { | ||
3155 | struct tick_work *twork; | ||
3156 | |||
3157 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
3158 | return; | ||
3159 | |||
3160 | WARN_ON_ONCE(!tick_work_cpu); | ||
3161 | |||
3162 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
3163 | cancel_delayed_work_sync(&twork->work); | ||
3164 | } | ||
3165 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
3166 | |||
3167 | int __init sched_tick_offload_init(void) | ||
3168 | { | ||
3169 | tick_work_cpu = alloc_percpu(struct tick_work); | ||
3170 | BUG_ON(!tick_work_cpu); | ||
3171 | |||
3172 | return 0; | ||
3173 | } | ||
3174 | |||
3175 | #else /* !CONFIG_NO_HZ_FULL */ | ||
3176 | static inline void sched_tick_start(int cpu) { } | ||
3177 | static inline void sched_tick_stop(int cpu) { } | ||
3128 | #endif | 3178 | #endif |
3129 | 3179 | ||
3130 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 3180 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
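sched_tick_remote() above keeps itself alive by re-queueing its own delayed work with a one-second delay on system_unbound_wq. Reduced to the bare pattern, and with illustrative names, that looks like:

/*
 * Sketch of the self-rearming delayed-work pattern used by
 * sched_tick_remote() above. "example_*" names are illustrative;
 * the real code also allocates one work item per isolated CPU.
 */
static struct delayed_work example_work;

static void example_tick(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);

	/* ... do the periodic bookkeeping here ... */

	/* Run again in about one second (1Hz), on an unbound worker: */
	queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void example_tick_start(void)
{
	INIT_DELAYED_WORK(&example_work, example_tick);
	queue_delayed_work(system_unbound_wq, &example_work, HZ);
}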
@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu) | |||
5786 | { | 5836 | { |
5787 | set_cpu_rq_start_time(cpu); | 5837 | set_cpu_rq_start_time(cpu); |
5788 | sched_rq_cpu_starting(cpu); | 5838 | sched_rq_cpu_starting(cpu); |
5839 | sched_tick_start(cpu); | ||
5789 | return 0; | 5840 | return 0; |
5790 | } | 5841 | } |
5791 | 5842 | ||
@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
5797 | 5848 | ||
5798 | /* Handle pending wakeups and then migrate everything off */ | 5849 | /* Handle pending wakeups and then migrate everything off */ |
5799 | sched_ttwu_pending(); | 5850 | sched_ttwu_pending(); |
5851 | sched_tick_stop(cpu); | ||
5800 | 5852 | ||
5801 | rq_lock_irqsave(rq, &rf); | 5853 | rq_lock_irqsave(rq, &rf); |
5802 | if (rq->rd) { | 5854 | if (rq->rd) { |
@@ -5809,7 +5861,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
5809 | 5861 | ||
5810 | calc_load_migrate(rq); | 5862 | calc_load_migrate(rq); |
5811 | update_max_interval(); | 5863 | update_max_interval(); |
5812 | nohz_balance_exit_idle(cpu); | 5864 | nohz_balance_exit_idle(rq); |
5813 | hrtick_clear(rq); | 5865 | hrtick_clear(rq); |
5814 | return 0; | 5866 | return 0; |
5815 | } | 5867 | } |
@@ -6022,13 +6074,11 @@ void __init sched_init(void) | |||
6022 | rq_attach_root(rq, &def_root_domain); | 6074 | rq_attach_root(rq, &def_root_domain); |
6023 | #ifdef CONFIG_NO_HZ_COMMON | 6075 | #ifdef CONFIG_NO_HZ_COMMON |
6024 | rq->last_load_update_tick = jiffies; | 6076 | rq->last_load_update_tick = jiffies; |
6025 | rq->nohz_flags = 0; | 6077 | rq->last_blocked_load_update_tick = jiffies; |
6026 | #endif | 6078 | atomic_set(&rq->nohz_flags, 0); |
6027 | #ifdef CONFIG_NO_HZ_FULL | ||
6028 | rq->last_sched_tick = 0; | ||
6029 | #endif | 6079 | #endif |
6030 | #endif /* CONFIG_SMP */ | 6080 | #endif /* CONFIG_SMP */ |
6031 | init_rq_hrtick(rq); | 6081 | hrtick_rq_init(rq); |
6032 | atomic_set(&rq->nr_iowait, 0); | 6082 | atomic_set(&rq->nr_iowait, 0); |
6033 | } | 6083 | } |
6034 | 6084 | ||
@@ -7027,3 +7077,5 @@ const u32 sched_prio_to_wmult[40] = { | |||
7027 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | 7077 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
7028 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 7078 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
7029 | }; | 7079 | }; |
7080 | |||
7081 | #undef CREATE_TRACE_POINTS | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/cgroup.h> | ||
3 | #include <linux/slab.h> | ||
4 | #include <linux/percpu.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/cpumask.h> | ||
7 | #include <linux/seq_file.h> | ||
8 | #include <linux/rcupdate.h> | ||
9 | #include <linux/kernel_stat.h> | ||
10 | #include <linux/err.h> | ||
11 | |||
12 | #include "sched.h" | ||
13 | |||
14 | /* | 2 | /* |
15 | * CPU accounting code for task groups. | 3 | * CPU accounting code for task groups. |
16 | * | 4 | * |
17 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | 5 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh |
18 | * (balbir@in.ibm.com). | 6 | * (balbir@in.ibm.com). |
19 | */ | 7 | */ |
8 | #include "sched.h" | ||
20 | 9 | ||
21 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 10 | /* Time spent by the tasks of the CPU accounting group executing in ... */ |
22 | enum cpuacct_stat_index { | 11 | enum cpuacct_stat_index { |
23 | CPUACCT_STAT_USER, /* ... user mode */ | 12 | CPUACCT_STAT_USER, /* ... user mode */ |
24 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 13 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ |
@@ -35,12 +24,12 @@ struct cpuacct_usage { | |||
35 | u64 usages[CPUACCT_STAT_NSTATS]; | 24 | u64 usages[CPUACCT_STAT_NSTATS]; |
36 | }; | 25 | }; |
37 | 26 | ||
38 | /* track cpu usage of a group of tasks and its child groups */ | 27 | /* track CPU usage of a group of tasks and its child groups */ |
39 | struct cpuacct { | 28 | struct cpuacct { |
40 | struct cgroup_subsys_state css; | 29 | struct cgroup_subsys_state css; |
41 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 30 | /* cpuusage holds pointer to a u64-type object on every CPU */ |
42 | struct cpuacct_usage __percpu *cpuusage; | 31 | struct cpuacct_usage __percpu *cpuusage; |
43 | struct kernel_cpustat __percpu *cpustat; | 32 | struct kernel_cpustat __percpu *cpustat; |
44 | }; | 33 | }; |
45 | 34 | ||
46 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | 35 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | |||
48 | return css ? container_of(css, struct cpuacct, css) : NULL; | 37 | return css ? container_of(css, struct cpuacct, css) : NULL; |
49 | } | 38 | } |
50 | 39 | ||
51 | /* return cpu accounting group to which this task belongs */ | 40 | /* Return CPU accounting group to which this task belongs */ |
52 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 41 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
53 | { | 42 | { |
54 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); | 43 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); |
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { | |||
65 | .cpuusage = &root_cpuacct_cpuusage, | 54 | .cpuusage = &root_cpuacct_cpuusage, |
66 | }; | 55 | }; |
67 | 56 | ||
68 | /* create a new cpu accounting group */ | 57 | /* Create a new CPU accounting group */ |
69 | static struct cgroup_subsys_state * | 58 | static struct cgroup_subsys_state * |
70 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | 59 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) |
71 | { | 60 | { |
@@ -96,7 +85,7 @@ out: | |||
96 | return ERR_PTR(-ENOMEM); | 85 | return ERR_PTR(-ENOMEM); |
97 | } | 86 | } |
98 | 87 | ||
99 | /* destroy an existing cpu accounting group */ | 88 | /* Destroy an existing CPU accounting group */ |
100 | static void cpuacct_css_free(struct cgroup_subsys_state *css) | 89 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
101 | { | 90 | { |
102 | struct cpuacct *ca = css_ca(css); | 91 | struct cpuacct *ca = css_ca(css); |
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
162 | #endif | 151 | #endif |
163 | } | 152 | } |
164 | 153 | ||
165 | /* return total cpu usage (in nanoseconds) of a group */ | 154 | /* Return total CPU usage (in nanoseconds) of a group */ |
166 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 155 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
167 | enum cpuacct_stat_index index) | 156 | enum cpuacct_stat_index index) |
168 | { | 157 | { |
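The cpuacct comments above note that cpuusage points at one u64 per CPU and that reads sum the per-CPU values. A rough sketch of that layout with illustrative names (the real code additionally splits user and system time):

/*
 * Sketch of the per-CPU counter layout described above: one u64 per CPU,
 * typically allocated with alloc_percpu() and summed across CPUs on read.
 * "example_*" names are illustrative.
 */
struct example_acct {
	u64 __percpu *cpuusage;
};

static u64 example_total_usage(struct example_acct *acct)
{
	u64 total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += *per_cpu_ptr(acct->cpuusage, cpu);

	return total;
}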
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
10 | * as published by the Free Software Foundation; version 2 | 10 | * as published by the Free Software Foundation; version 2 |
11 | * of the License. | 11 | * of the License. |
12 | */ | 12 | */ |
13 | 13 | #include "sched.h" | |
14 | #include <linux/gfp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include "cpudeadline.h" | ||
18 | 14 | ||
19 | static inline int parent(int i) | 15 | static inline int parent(int i) |
20 | { | 16 | { |
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) | |||
42 | return; | 38 | return; |
43 | 39 | ||
44 | /* adapted from lib/prio_heap.c */ | 40 | /* adapted from lib/prio_heap.c */ |
45 | while(1) { | 41 | while (1) { |
46 | u64 largest_dl; | 42 | u64 largest_dl; |
43 | |||
47 | l = left_child(idx); | 44 | l = left_child(idx); |
48 | r = right_child(idx); | 45 | r = right_child(idx); |
49 | largest = idx; | 46 | largest = idx; |
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
131 | return 1; | 128 | return 1; |
132 | } else { | 129 | } else { |
133 | int best_cpu = cpudl_maximum(cp); | 130 | int best_cpu = cpudl_maximum(cp); |
131 | |||
134 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 132 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); |
135 | 133 | ||
136 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | 134 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && |
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
145 | } | 143 | } |
146 | 144 | ||
147 | /* | 145 | /* |
148 | * cpudl_clear - remove a cpu from the cpudl max-heap | 146 | * cpudl_clear - remove a CPU from the cpudl max-heap |
149 | * @cp: the cpudl max-heap context | 147 | * @cp: the cpudl max-heap context |
150 | * @cpu: the target cpu | 148 | * @cpu: the target CPU |
151 | * | 149 | * |
152 | * Notes: assumes cpu_rq(cpu)->lock is locked | 150 | * Notes: assumes cpu_rq(cpu)->lock is locked |
153 | * | 151 | * |
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) | |||
186 | /* | 184 | /* |
187 | * cpudl_set - update the cpudl max-heap | 185 | * cpudl_set - update the cpudl max-heap |
188 | * @cp: the cpudl max-heap context | 186 | * @cp: the cpudl max-heap context |
189 | * @cpu: the target cpu | 187 | * @cpu: the target CPU |
190 | * @dl: the new earliest deadline for this cpu | 188 | * @dl: the new earliest deadline for this CPU |
191 | * | 189 | * |
192 | * Notes: assumes cpu_rq(cpu)->lock is locked | 190 | * Notes: assumes cpu_rq(cpu)->lock is locked |
193 | * | 191 | * |
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
205 | old_idx = cp->elements[cpu].idx; | 203 | old_idx = cp->elements[cpu].idx; |
206 | if (old_idx == IDX_INVALID) { | 204 | if (old_idx == IDX_INVALID) { |
207 | int new_idx = cp->size++; | 205 | int new_idx = cp->size++; |
206 | |||
208 | cp->elements[new_idx].dl = dl; | 207 | cp->elements[new_idx].dl = dl; |
209 | cp->elements[new_idx].cpu = cpu; | 208 | cp->elements[new_idx].cpu = cpu; |
210 | cp->elements[cpu].idx = new_idx; | 209 | cp->elements[cpu].idx = new_idx; |
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
221 | /* | 220 | /* |
222 | * cpudl_set_freecpu - Set the cpudl.free_cpus | 221 | * cpudl_set_freecpu - Set the cpudl.free_cpus |
223 | * @cp: the cpudl max-heap context | 222 | * @cp: the cpudl max-heap context |
224 | * @cpu: rd attached cpu | 223 | * @cpu: rd attached CPU |
225 | */ | 224 | */ |
226 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) | 225 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) |
227 | { | 226 | { |
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) | |||
231 | /* | 230 | /* |
232 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus | 231 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus |
233 | * @cp: the cpudl max-heap context | 232 | * @cp: the cpudl max-heap context |
234 | * @cpu: rd attached cpu | 233 | * @cpu: rd attached CPU |
235 | */ | 234 | */ |
236 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) | 235 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) |
237 | { | 236 | { |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_CPUDL_H | ||
3 | #define _LINUX_CPUDL_H | ||
4 | 2 | ||
5 | #include <linux/sched.h> | 3 | #define IDX_INVALID -1 |
6 | #include <linux/sched/deadline.h> | ||
7 | |||
8 | #define IDX_INVALID -1 | ||
9 | 4 | ||
10 | struct cpudl_item { | 5 | struct cpudl_item { |
11 | u64 dl; | 6 | u64 dl; |
12 | int cpu; | 7 | int cpu; |
13 | int idx; | 8 | int idx; |
14 | }; | 9 | }; |
15 | 10 | ||
16 | struct cpudl { | 11 | struct cpudl { |
17 | raw_spinlock_t lock; | 12 | raw_spinlock_t lock; |
18 | int size; | 13 | int size; |
19 | cpumask_var_t free_cpus; | 14 | cpumask_var_t free_cpus; |
20 | struct cpudl_item *elements; | 15 | struct cpudl_item *elements; |
21 | }; | 16 | }; |
22 | 17 | ||
23 | |||
24 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
25 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 19 | int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); |
26 | struct cpumask *later_mask); | ||
27 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); | 20 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
28 | void cpudl_clear(struct cpudl *cp, int cpu); | 21 | void cpudl_clear(struct cpudl *cp, int cpu); |
29 | int cpudl_init(struct cpudl *cp); | 22 | int cpudl_init(struct cpudl *cp); |
30 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 23 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
31 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 24 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
32 | void cpudl_cleanup(struct cpudl *cp); | 25 | void cpudl_cleanup(struct cpudl *cp); |
33 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
34 | |||
35 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
8 | * it under the terms of the GNU General Public License version 2 as | 8 | * it under the terms of the GNU General Public License version 2 as |
9 | * published by the Free Software Foundation. | 9 | * published by the Free Software Foundation. |
10 | */ | 10 | */ |
11 | |||
12 | #include "sched.h" | 11 | #include "sched.h" |
13 | 12 | ||
14 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | 13 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..2b124811947d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,56 @@
11 | 11 | ||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | 13 | ||
14 | #include <linux/cpufreq.h> | ||
15 | #include <linux/kthread.h> | ||
16 | #include <uapi/linux/sched/types.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <trace/events/power.h> | ||
19 | |||
20 | #include "sched.h" | 14 | #include "sched.h" |
21 | 15 | ||
16 | #include <trace/events/power.h> | ||
17 | |||
22 | struct sugov_tunables { | 18 | struct sugov_tunables { |
23 | struct gov_attr_set attr_set; | 19 | struct gov_attr_set attr_set; |
24 | unsigned int rate_limit_us; | 20 | unsigned int rate_limit_us; |
25 | }; | 21 | }; |
26 | 22 | ||
27 | struct sugov_policy { | 23 | struct sugov_policy { |
28 | struct cpufreq_policy *policy; | 24 | struct cpufreq_policy *policy; |
29 | 25 | ||
30 | struct sugov_tunables *tunables; | 26 | struct sugov_tunables *tunables; |
31 | struct list_head tunables_hook; | 27 | struct list_head tunables_hook; |
32 | 28 | ||
33 | raw_spinlock_t update_lock; /* For shared policies */ | 29 | raw_spinlock_t update_lock; /* For shared policies */ |
34 | u64 last_freq_update_time; | 30 | u64 last_freq_update_time; |
35 | s64 freq_update_delay_ns; | 31 | s64 freq_update_delay_ns; |
36 | unsigned int next_freq; | 32 | unsigned int next_freq; |
37 | unsigned int cached_raw_freq; | 33 | unsigned int cached_raw_freq; |
38 | 34 | ||
39 | /* The next fields are only needed if fast switch cannot be used. */ | 35 | /* The next fields are only needed if fast switch cannot be used: */ |
40 | struct irq_work irq_work; | 36 | struct irq_work irq_work; |
41 | struct kthread_work work; | 37 | struct kthread_work work; |
42 | struct mutex work_lock; | 38 | struct mutex work_lock; |
43 | struct kthread_worker worker; | 39 | struct kthread_worker worker; |
44 | struct task_struct *thread; | 40 | struct task_struct *thread; |
45 | bool work_in_progress; | 41 | bool work_in_progress; |
46 | 42 | ||
47 | bool need_freq_update; | 43 | bool need_freq_update; |
48 | }; | 44 | }; |
49 | 45 | ||
50 | struct sugov_cpu { | 46 | struct sugov_cpu { |
51 | struct update_util_data update_util; | 47 | struct update_util_data update_util; |
52 | struct sugov_policy *sg_policy; | 48 | struct sugov_policy *sg_policy; |
53 | unsigned int cpu; | 49 | unsigned int cpu; |
54 | 50 | ||
55 | bool iowait_boost_pending; | 51 | bool iowait_boost_pending; |
56 | unsigned int iowait_boost; | 52 | unsigned int iowait_boost; |
57 | unsigned int iowait_boost_max; | 53 | unsigned int iowait_boost_max; |
58 | u64 last_update; | 54 | u64 last_update; |
59 | 55 | ||
60 | /* The fields below are only needed when sharing a policy. */ | 56 | /* The fields below are only needed when sharing a policy: */ |
61 | unsigned long util_cfs; | 57 | unsigned long util_cfs; |
62 | unsigned long util_dl; | 58 | unsigned long util_dl; |
63 | unsigned long max; | 59 | unsigned long max; |
64 | unsigned int flags; | ||
65 | 60 | ||
66 | /* The field below is for single-CPU policies only. */ | 61 | /* The field below is for single-CPU policies only: */ |
67 | #ifdef CONFIG_NO_HZ_COMMON | 62 | #ifdef CONFIG_NO_HZ_COMMON |
68 | unsigned long saved_idle_calls; | 63 | unsigned long saved_idle_calls; |
69 | #endif | 64 | #endif |
70 | }; | 65 | }; |
71 | 66 | ||
@@ -79,9 +74,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
79 | 74 | ||
80 | /* | 75 | /* |
81 | * Since cpufreq_update_util() is called with rq->lock held for | 76 | * Since cpufreq_update_util() is called with rq->lock held for |
82 | * the @target_cpu, our per-cpu data is fully serialized. | 77 | * the @target_cpu, our per-CPU data is fully serialized. |
83 | * | 78 | * |
84 | * However, drivers cannot in general deal with cross-cpu | 79 | * However, drivers cannot in general deal with cross-CPU |
85 | * requests, so while get_next_freq() will work, our | 80 | * requests, so while get_next_freq() will work, our |
86 | * sugov_update_commit() call may not for the fast switching platforms. | 81 | * sugov_update_commit() call may not for the fast switching platforms. |
87 | * | 82 | * |
@@ -111,6 +106,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
111 | } | 106 | } |
112 | 107 | ||
113 | delta_ns = time - sg_policy->last_freq_update_time; | 108 | delta_ns = time - sg_policy->last_freq_update_time; |
109 | |||
114 | return delta_ns >= sg_policy->freq_update_delay_ns; | 110 | return delta_ns >= sg_policy->freq_update_delay_ns; |
115 | } | 111 | } |
116 | 112 | ||
@@ -186,17 +182,28 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) | |||
186 | 182 | ||
187 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) | 183 | static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) |
188 | { | 184 | { |
185 | struct rq *rq = cpu_rq(sg_cpu->cpu); | ||
186 | unsigned long util; | ||
187 | |||
188 | if (rq->rt.rt_nr_running) { | ||
189 | util = sg_cpu->max; | ||
190 | } else { | ||
191 | util = sg_cpu->util_dl; | ||
192 | if (rq->cfs.h_nr_running) | ||
193 | util += sg_cpu->util_cfs; | ||
194 | } | ||
195 | |||
189 | /* | 196 | /* |
190 | * Ideally we would like to set util_dl as min/guaranteed freq and | 197 | * Ideally we would like to set util_dl as min/guaranteed freq and |
191 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet | 198 | * util_cfs + util_dl as requested freq. However, cpufreq is not yet |
192 | * ready for such an interface. So, we only do the latter for now. | 199 | * ready for such an interface. So, we only do the latter for now. |
193 | */ | 200 | */ |
194 | return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); | 201 | return min(util, sg_cpu->max); |
195 | } | 202 | } |
196 | 203 | ||
197 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) | 204 | static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) |
198 | { | 205 | { |
199 | if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { | 206 | if (flags & SCHED_CPUFREQ_IOWAIT) { |
200 | if (sg_cpu->iowait_boost_pending) | 207 | if (sg_cpu->iowait_boost_pending) |
201 | return; | 208 | return; |
202 | 209 | ||
@@ -260,43 +267,51 @@ static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) | |||
260 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } | 267 | static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } |
261 | #endif /* CONFIG_NO_HZ_COMMON */ | 268 | #endif /* CONFIG_NO_HZ_COMMON */ |
262 | 269 | ||
270 | /* | ||
271 | * Make sugov_should_update_freq() ignore the rate limit when DL | ||
272 | * has increased the utilization. | ||
273 | */ | ||
274 | static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) | ||
275 | { | ||
276 | if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) | ||
277 | sg_policy->need_freq_update = true; | ||
278 | } | ||
279 | |||
263 | static void sugov_update_single(struct update_util_data *hook, u64 time, | 280 | static void sugov_update_single(struct update_util_data *hook, u64 time, |
264 | unsigned int flags) | 281 | unsigned int flags) |
265 | { | 282 | { |
266 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 283 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
267 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 284 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
268 | struct cpufreq_policy *policy = sg_policy->policy; | ||
269 | unsigned long util, max; | 285 | unsigned long util, max; |
270 | unsigned int next_f; | 286 | unsigned int next_f; |
271 | bool busy; | 287 | bool busy; |
272 | 288 | ||
273 | sugov_set_iowait_boost(sg_cpu, time); | 289 | sugov_set_iowait_boost(sg_cpu, time, flags); |
274 | sg_cpu->last_update = time; | 290 | sg_cpu->last_update = time; |
275 | 291 | ||
292 | ignore_dl_rate_limit(sg_cpu, sg_policy); | ||
293 | |||
276 | if (!sugov_should_update_freq(sg_policy, time)) | 294 | if (!sugov_should_update_freq(sg_policy, time)) |
277 | return; | 295 | return; |
278 | 296 | ||
279 | busy = sugov_cpu_is_busy(sg_cpu); | 297 | busy = sugov_cpu_is_busy(sg_cpu); |
280 | 298 | ||
281 | if (flags & SCHED_CPUFREQ_RT) { | 299 | sugov_get_util(sg_cpu); |
282 | next_f = policy->cpuinfo.max_freq; | 300 | max = sg_cpu->max; |
283 | } else { | 301 | util = sugov_aggregate_util(sg_cpu); |
284 | sugov_get_util(sg_cpu); | 302 | sugov_iowait_boost(sg_cpu, &util, &max); |
285 | max = sg_cpu->max; | 303 | next_f = get_next_freq(sg_policy, util, max); |
286 | util = sugov_aggregate_util(sg_cpu); | 304 | /* |
287 | sugov_iowait_boost(sg_cpu, &util, &max); | 305 | * Do not reduce the frequency if the CPU has not been idle |
288 | next_f = get_next_freq(sg_policy, util, max); | 306 | * recently, as the reduction is likely to be premature then. |
289 | /* | 307 | */ |
290 | * Do not reduce the frequency if the CPU has not been idle | 308 | if (busy && next_f < sg_policy->next_freq) { |
291 | * recently, as the reduction is likely to be premature then. | 309 | next_f = sg_policy->next_freq; |
292 | */ | ||
293 | if (busy && next_f < sg_policy->next_freq) { | ||
294 | next_f = sg_policy->next_freq; | ||
295 | 310 | ||
296 | /* Reset cached freq as next_freq has changed */ | 311 | /* Reset cached freq as next_freq has changed */ |
297 | sg_policy->cached_raw_freq = 0; | 312 | sg_policy->cached_raw_freq = 0; |
298 | } | ||
299 | } | 313 | } |
314 | |||
300 | sugov_update_commit(sg_policy, time, next_f); | 315 | sugov_update_commit(sg_policy, time, next_f); |
301 | } | 316 | } |
302 | 317 | ||
@@ -312,6 +327,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
312 | unsigned long j_util, j_max; | 327 | unsigned long j_util, j_max; |
313 | s64 delta_ns; | 328 | s64 delta_ns; |
314 | 329 | ||
330 | sugov_get_util(j_sg_cpu); | ||
331 | |||
315 | /* | 332 | /* |
316 | * If the CFS CPU utilization was last updated before the | 333 | * If the CFS CPU utilization was last updated before the |
317 | * previous frequency update and the time elapsed between the | 334 | * previous frequency update and the time elapsed between the |
@@ -325,28 +342,22 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
325 | if (delta_ns > TICK_NSEC) { | 342 | if (delta_ns > TICK_NSEC) { |
326 | j_sg_cpu->iowait_boost = 0; | 343 | j_sg_cpu->iowait_boost = 0; |
327 | j_sg_cpu->iowait_boost_pending = false; | 344 | j_sg_cpu->iowait_boost_pending = false; |
328 | j_sg_cpu->util_cfs = 0; | ||
329 | if (j_sg_cpu->util_dl == 0) | ||
330 | continue; | ||
331 | } | 345 | } |
332 | if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) | ||
333 | return policy->cpuinfo.max_freq; | ||
334 | 346 | ||
335 | j_max = j_sg_cpu->max; | 347 | j_max = j_sg_cpu->max; |
336 | j_util = sugov_aggregate_util(j_sg_cpu); | 348 | j_util = sugov_aggregate_util(j_sg_cpu); |
349 | sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); | ||
337 | if (j_util * max > j_max * util) { | 350 | if (j_util * max > j_max * util) { |
338 | util = j_util; | 351 | util = j_util; |
339 | max = j_max; | 352 | max = j_max; |
340 | } | 353 | } |
341 | |||
342 | sugov_iowait_boost(j_sg_cpu, &util, &max); | ||
343 | } | 354 | } |
344 | 355 | ||
345 | return get_next_freq(sg_policy, util, max); | 356 | return get_next_freq(sg_policy, util, max); |
346 | } | 357 | } |
347 | 358 | ||
348 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 359 | static void |
349 | unsigned int flags) | 360 | sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) |
350 | { | 361 | { |
351 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 362 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
352 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 363 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
@@ -354,18 +365,13 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
354 | 365 | ||
355 | raw_spin_lock(&sg_policy->update_lock); | 366 | raw_spin_lock(&sg_policy->update_lock); |
356 | 367 | ||
357 | sugov_get_util(sg_cpu); | 368 | sugov_set_iowait_boost(sg_cpu, time, flags); |
358 | sg_cpu->flags = flags; | ||
359 | |||
360 | sugov_set_iowait_boost(sg_cpu, time); | ||
361 | sg_cpu->last_update = time; | 369 | sg_cpu->last_update = time; |
362 | 370 | ||
363 | if (sugov_should_update_freq(sg_policy, time)) { | 371 | ignore_dl_rate_limit(sg_cpu, sg_policy); |
364 | if (flags & SCHED_CPUFREQ_RT) | ||
365 | next_f = sg_policy->policy->cpuinfo.max_freq; | ||
366 | else | ||
367 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
368 | 372 | ||
373 | if (sugov_should_update_freq(sg_policy, time)) { | ||
374 | next_f = sugov_next_freq_shared(sg_cpu, time); | ||
369 | sugov_update_commit(sg_policy, time, next_f); | 375 | sugov_update_commit(sg_policy, time, next_f); |
370 | } | 376 | } |
371 | 377 | ||
@@ -423,8 +429,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) | |||
423 | return sprintf(buf, "%u\n", tunables->rate_limit_us); | 429 | return sprintf(buf, "%u\n", tunables->rate_limit_us); |
424 | } | 430 | } |
425 | 431 | ||
426 | static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, | 432 | static ssize_t |
427 | size_t count) | 433 | rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) |
428 | { | 434 | { |
429 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); | 435 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); |
430 | struct sugov_policy *sg_policy; | 436 | struct sugov_policy *sg_policy; |
@@ -479,11 +485,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) | |||
479 | { | 485 | { |
480 | struct task_struct *thread; | 486 | struct task_struct *thread; |
481 | struct sched_attr attr = { | 487 | struct sched_attr attr = { |
482 | .size = sizeof(struct sched_attr), | 488 | .size = sizeof(struct sched_attr), |
483 | .sched_policy = SCHED_DEADLINE, | 489 | .sched_policy = SCHED_DEADLINE, |
484 | .sched_flags = SCHED_FLAG_SUGOV, | 490 | .sched_flags = SCHED_FLAG_SUGOV, |
485 | .sched_nice = 0, | 491 | .sched_nice = 0, |
486 | .sched_priority = 0, | 492 | .sched_priority = 0, |
487 | /* | 493 | /* |
488 | * Fake (unused) bandwidth; workaround to "fix" | 494 | * Fake (unused) bandwidth; workaround to "fix" |
489 | * priority inheritance. | 495 | * priority inheritance. |
@@ -663,21 +669,20 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
663 | struct sugov_policy *sg_policy = policy->governor_data; | 669 | struct sugov_policy *sg_policy = policy->governor_data; |
664 | unsigned int cpu; | 670 | unsigned int cpu; |
665 | 671 | ||
666 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; | 672 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; |
667 | sg_policy->last_freq_update_time = 0; | 673 | sg_policy->last_freq_update_time = 0; |
668 | sg_policy->next_freq = UINT_MAX; | 674 | sg_policy->next_freq = UINT_MAX; |
669 | sg_policy->work_in_progress = false; | 675 | sg_policy->work_in_progress = false; |
670 | sg_policy->need_freq_update = false; | 676 | sg_policy->need_freq_update = false; |
671 | sg_policy->cached_raw_freq = 0; | 677 | sg_policy->cached_raw_freq = 0; |
672 | 678 | ||
673 | for_each_cpu(cpu, policy->cpus) { | 679 | for_each_cpu(cpu, policy->cpus) { |
674 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); | 680 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); |
675 | 681 | ||
676 | memset(sg_cpu, 0, sizeof(*sg_cpu)); | 682 | memset(sg_cpu, 0, sizeof(*sg_cpu)); |
677 | sg_cpu->cpu = cpu; | 683 | sg_cpu->cpu = cpu; |
678 | sg_cpu->sg_policy = sg_policy; | 684 | sg_cpu->sg_policy = sg_policy; |
679 | sg_cpu->flags = 0; | 685 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; |
680 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | ||
681 | } | 686 | } |
682 | 687 | ||
683 | for_each_cpu(cpu, policy->cpus) { | 688 | for_each_cpu(cpu, policy->cpus) { |
@@ -721,14 +726,14 @@ static void sugov_limits(struct cpufreq_policy *policy) | |||
721 | } | 726 | } |
722 | 727 | ||
723 | static struct cpufreq_governor schedutil_gov = { | 728 | static struct cpufreq_governor schedutil_gov = { |
724 | .name = "schedutil", | 729 | .name = "schedutil", |
725 | .owner = THIS_MODULE, | 730 | .owner = THIS_MODULE, |
726 | .dynamic_switching = true, | 731 | .dynamic_switching = true, |
727 | .init = sugov_init, | 732 | .init = sugov_init, |
728 | .exit = sugov_exit, | 733 | .exit = sugov_exit, |
729 | .start = sugov_start, | 734 | .start = sugov_start, |
730 | .stop = sugov_stop, | 735 | .stop = sugov_stop, |
731 | .limits = sugov_limits, | 736 | .limits = sugov_limits, |
732 | }; | 737 | }; |
733 | 738 | ||
734 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 739 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
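With the flag-based RT/DL special cases removed in the schedutil hunks above, sugov_aggregate_util() now looks at the runqueue itself: any runnable RT task pins the request at the CPU's maximum, otherwise the DL contribution is always counted and the CFS contribution only while CFS tasks are runnable, with the sum clamped to capacity. A minimal standalone sketch of that decision follows; the struct and helper names (cpu_signals, aggregate_util) and the example numbers are assumptions for illustration, not the kernel's own.

	#include <stdio.h>

	struct cpu_signals {
		unsigned long util_cfs;		/* CFS (PELT) utilization */
		unsigned long util_dl;		/* DEADLINE utilization */
		unsigned long max;		/* CPU capacity */
		unsigned int  rt_nr_running;	/* runnable RT tasks */
		unsigned int  cfs_nr_running;	/* runnable CFS tasks */
	};

	static unsigned long aggregate_util(const struct cpu_signals *s)
	{
		unsigned long util;

		if (s->rt_nr_running) {
			util = s->max;			/* RT running: ask for max */
		} else {
			util = s->util_dl;		/* DL is always counted */
			if (s->cfs_nr_running)
				util += s->util_cfs;	/* CFS only while runnable */
		}

		return util < s->max ? util : s->max;	/* clamp to capacity */
	}

	int main(void)
	{
		struct cpu_signals s = { .util_cfs = 300, .util_dl = 100, .max = 1024,
					 .rt_nr_running = 0, .cfs_nr_running = 2 };

		printf("requested util: %lu of %lu\n", aggregate_util(&s), s.max);
		return 0;
	}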
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba36b89..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -14,7 +14,7 @@ | |||
14 | * | 14 | * |
15 | * going from the lowest priority to the highest. CPUs in the INVALID state | 15 | * going from the lowest priority to the highest. CPUs in the INVALID state |
16 | * are not eligible for routing. The system maintains this state with | 16 | * are not eligible for routing. The system maintains this state with |
17 | * a 2 dimensional bitmap (the first for priority class, the second for cpus | 17 | * a 2 dimensional bitmap (the first for priority class, the second for CPUs |
18 | * in that class). Therefore a typical application without affinity | 18 | * in that class). Therefore a typical application without affinity |
19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | 19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit |
20 | * searches). For tasks with affinity restrictions, the algorithm has a | 20 | * searches). For tasks with affinity restrictions, the algorithm has a |
@@ -26,12 +26,7 @@ | |||
26 | * as published by the Free Software Foundation; version 2 | 26 | * as published by the Free Software Foundation; version 2 |
27 | * of the License. | 27 | * of the License. |
28 | */ | 28 | */ |
29 | 29 | #include "sched.h" | |
30 | #include <linux/gfp.h> | ||
31 | #include <linux/sched.h> | ||
32 | #include <linux/sched/rt.h> | ||
33 | #include <linux/slab.h> | ||
34 | #include "cpupri.h" | ||
35 | 30 | ||
36 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 31 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
37 | static int convert_prio(int prio) | 32 | static int convert_prio(int prio) |
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
128 | } | 123 | } |
129 | 124 | ||
130 | /** | 125 | /** |
131 | * cpupri_set - update the cpu priority setting | 126 | * cpupri_set - update the CPU priority setting |
132 | * @cp: The cpupri context | 127 | * @cp: The cpupri context |
133 | * @cpu: The target cpu | 128 | * @cpu: The target CPU |
134 | * @newpri: The priority (INVALID-RT99) to assign to this CPU | 129 | * @newpri: The priority (INVALID-RT99) to assign to this CPU |
135 | * | 130 | * |
136 | * Note: Assumes cpu_rq(cpu)->lock is locked | 131 | * Note: Assumes cpu_rq(cpu)->lock is locked |
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
151 | return; | 146 | return; |
152 | 147 | ||
153 | /* | 148 | /* |
154 | * If the cpu was currently mapped to a different value, we | 149 | * If the CPU was currently mapped to a different value, we |
155 | * need to map it to the new value then remove the old value. | 150 | * need to map it to the new value then remove the old value. |
156 | * Note, we must add the new value first, otherwise we risk the | 151 | * Note, we must add the new value first, otherwise we risk the |
157 | * cpu being missed by the priority loop in cpupri_find. | 152 | * cpu being missed by the priority loop in cpupri_find. |
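The cpupri.c header above describes converting the 140-value task->prio space into the 102-value cpupri space and then doing O(1) lookups over a per-priority bitmap. The sketch below reconstructs that conversion from the constants visible in the comments (MAX_RT_PRIO = 100, cpupri values 2-101 covering the RT range); convert_prio_sketch() is an illustrative stand-in under those assumptions, not a copy of the kernel function.

	#include <stdio.h>

	#define MAX_RT_PRIO	100	/* task->prio 0..99 are RT, 0 is highest */
	#define MAX_PRIO	140	/* prios 100..139 are CFS; 140 marks idle here */

	#define CPUPRI_INVALID	-1
	#define CPUPRI_IDLE	0
	#define CPUPRI_NORMAL	1
	/* cpupri 2..101 cover RT priorities; higher cpupri = higher RT priority */

	static int convert_prio_sketch(int prio)
	{
		if (prio == CPUPRI_INVALID)
			return CPUPRI_INVALID;
		if (prio == MAX_PRIO)
			return CPUPRI_IDLE;
		if (prio >= MAX_RT_PRIO)
			return CPUPRI_NORMAL;
		return MAX_RT_PRIO - prio + 1;	/* RT prio 99 -> 2, RT prio 0 -> 101 */
	}

	int main(void)
	{
		printf("prio 0 -> %d, prio 99 -> %d, prio 120 -> %d\n",
		       convert_prio_sketch(0), convert_prio_sketch(99),
		       convert_prio_sketch(120));
		return 0;
	}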
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
@@ -1,32 +1,25 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_CPUPRI_H | ||
3 | #define _LINUX_CPUPRI_H | ||
4 | |||
5 | #include <linux/sched.h> | ||
6 | 2 | ||
7 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 3 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
8 | 4 | ||
9 | #define CPUPRI_INVALID -1 | 5 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 6 | #define CPUPRI_IDLE 0 |
11 | #define CPUPRI_NORMAL 1 | 7 | #define CPUPRI_NORMAL 1 |
12 | /* values 2-101 are RT priorities 0-99 */ | 8 | /* values 2-101 are RT priorities 0-99 */ |
13 | 9 | ||
14 | struct cpupri_vec { | 10 | struct cpupri_vec { |
15 | atomic_t count; | 11 | atomic_t count; |
16 | cpumask_var_t mask; | 12 | cpumask_var_t mask; |
17 | }; | 13 | }; |
18 | 14 | ||
19 | struct cpupri { | 15 | struct cpupri { |
20 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 16 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
21 | int *cpu_to_pri; | 17 | int *cpu_to_pri; |
22 | }; | 18 | }; |
23 | 19 | ||
24 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
25 | int cpupri_find(struct cpupri *cp, | 21 | int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); |
26 | struct task_struct *p, struct cpumask *lowest_mask); | ||
27 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 22 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
28 | int cpupri_init(struct cpupri *cp); | 23 | int cpupri_init(struct cpupri *cp); |
29 | void cpupri_cleanup(struct cpupri *cp); | 24 | void cpupri_cleanup(struct cpupri *cp); |
30 | #endif | 25 | #endif |
31 | |||
32 | #endif /* _LINUX_CPUPRI_H */ | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9a4ec7..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -1,10 +1,6 @@ | |||
1 | #include <linux/export.h> | 1 | /* |
2 | #include <linux/sched.h> | 2 | * Simple CPU accounting cgroup controller |
3 | #include <linux/tsacct_kern.h> | 3 | */ |
4 | #include <linux/kernel_stat.h> | ||
5 | #include <linux/static_key.h> | ||
6 | #include <linux/context_tracking.h> | ||
7 | #include <linux/sched/cputime.h> | ||
8 | #include "sched.h" | 4 | #include "sched.h" |
9 | 5 | ||
10 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 6 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
113 | } | 109 | } |
114 | 110 | ||
115 | /* | 111 | /* |
116 | * Account user cpu time to a process. | 112 | * Account user CPU time to a process. |
117 | * @p: the process that the cpu time gets accounted to | 113 | * @p: the process that the CPU time gets accounted to |
118 | * @cputime: the cpu time spent in user space since the last update | 114 | * @cputime: the CPU time spent in user space since the last update |
119 | */ | 115 | */ |
120 | void account_user_time(struct task_struct *p, u64 cputime) | 116 | void account_user_time(struct task_struct *p, u64 cputime) |
121 | { | 117 | { |
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) | |||
135 | } | 131 | } |
136 | 132 | ||
137 | /* | 133 | /* |
138 | * Account guest cpu time to a process. | 134 | * Account guest CPU time to a process. |
139 | * @p: the process that the cpu time gets accounted to | 135 | * @p: the process that the CPU time gets accounted to |
140 | * @cputime: the cpu time spent in virtual machine since the last update | 136 | * @cputime: the CPU time spent in virtual machine since the last update |
141 | */ | 137 | */ |
142 | void account_guest_time(struct task_struct *p, u64 cputime) | 138 | void account_guest_time(struct task_struct *p, u64 cputime) |
143 | { | 139 | { |
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) | |||
159 | } | 155 | } |
160 | 156 | ||
161 | /* | 157 | /* |
162 | * Account system cpu time to a process and desired cpustat field | 158 | * Account system CPU time to a process and desired cpustat field |
163 | * @p: the process that the cpu time gets accounted to | 159 | * @p: the process that the CPU time gets accounted to |
164 | * @cputime: the cpu time spent in kernel space since the last update | 160 | * @cputime: the CPU time spent in kernel space since the last update |
165 | * @index: pointer to cpustat field that has to be updated | 161 | * @index: pointer to cpustat field that has to be updated |
166 | */ | 162 | */ |
167 | void account_system_index_time(struct task_struct *p, | 163 | void account_system_index_time(struct task_struct *p, |
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, | |||
179 | } | 175 | } |
180 | 176 | ||
181 | /* | 177 | /* |
182 | * Account system cpu time to a process. | 178 | * Account system CPU time to a process. |
183 | * @p: the process that the cpu time gets accounted to | 179 | * @p: the process that the CPU time gets accounted to |
184 | * @hardirq_offset: the offset to subtract from hardirq_count() | 180 | * @hardirq_offset: the offset to subtract from hardirq_count() |
185 | * @cputime: the cpu time spent in kernel space since the last update | 181 | * @cputime: the CPU time spent in kernel space since the last update |
186 | */ | 182 | */ |
187 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | 183 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
188 | { | 184 | { |
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | |||
205 | 201 | ||
206 | /* | 202 | /* |
207 | * Account for involuntary wait time. | 203 | * Account for involuntary wait time. |
208 | * @cputime: the cpu time spent in involuntary wait | 204 | * @cputime: the CPU time spent in involuntary wait |
209 | */ | 205 | */ |
210 | void account_steal_time(u64 cputime) | 206 | void account_steal_time(u64 cputime) |
211 | { | 207 | { |
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) | |||
216 | 212 | ||
217 | /* | 213 | /* |
218 | * Account for idle time. | 214 | * Account for idle time. |
219 | * @cputime: the cpu time spent in idle wait | 215 | * @cputime: the CPU time spent in idle wait |
220 | */ | 216 | */ |
221 | void account_idle_time(u64 cputime) | 217 | void account_idle_time(u64 cputime) |
222 | { | 218 | { |
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
338 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 334 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
339 | /* | 335 | /* |
340 | * Account a tick to a process and cpustat | 336 | * Account a tick to a process and cpustat |
341 | * @p: the process that the cpu time gets accounted to | 337 | * @p: the process that the CPU time gets accounted to |
342 | * @user_tick: is the tick from userspace | 338 | * @user_tick: is the tick from userspace |
343 | * @rq: the pointer to rq | 339 | * @rq: the pointer to rq |
344 | * | 340 | * |
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) | |||
400 | irqtime_account_process_tick(current, 0, rq, ticks); | 396 | irqtime_account_process_tick(current, 0, rq, ticks); |
401 | } | 397 | } |
402 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 398 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
403 | static inline void irqtime_account_idle_ticks(int ticks) {} | 399 | static inline void irqtime_account_idle_ticks(int ticks) { } |
404 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 400 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
405 | struct rq *rq, int nr_ticks) {} | 401 | struct rq *rq, int nr_ticks) { } |
406 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 402 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
407 | 403 | ||
408 | /* | 404 | /* |
409 | * Use precise platform statistics if available: | 405 | * Use precise platform statistics if available: |
410 | */ | 406 | */ |
411 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 407 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
412 | 408 | # ifndef __ARCH_HAS_VTIME_TASK_SWITCH | |
413 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
414 | void vtime_common_task_switch(struct task_struct *prev) | 409 | void vtime_common_task_switch(struct task_struct *prev) |
415 | { | 410 | { |
416 | if (is_idle_task(prev)) | 411 | if (is_idle_task(prev)) |
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
421 | vtime_flush(prev); | 416 | vtime_flush(prev); |
422 | arch_vtime_task_switch(prev); | 417 | arch_vtime_task_switch(prev); |
423 | } | 418 | } |
424 | #endif | 419 | # endif |
425 | |||
426 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 420 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
427 | 421 | ||
428 | 422 | ||
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) | |||
469 | *ut = cputime.utime; | 463 | *ut = cputime.utime; |
470 | *st = cputime.stime; | 464 | *st = cputime.stime; |
471 | } | 465 | } |
472 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 466 | |
467 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ | ||
468 | |||
473 | /* | 469 | /* |
474 | * Account a single tick of cpu time. | 470 | * Account a single tick of CPU time. |
475 | * @p: the process that the cpu time gets accounted to | 471 | * @p: the process that the CPU time gets accounted to |
476 | * @user_tick: indicates if the tick is a user or a system tick | 472 | * @user_tick: indicates if the tick is a user or a system tick |
477 | */ | 473 | */ |
478 | void account_process_tick(struct task_struct *p, int user_tick) | 474 | void account_process_tick(struct task_struct *p, int user_tick) |
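The cputime.c comments above all describe the same idea: each tick is charged to exactly one bucket, and account_process_tick() picks between the user, system and idle accounting paths. A rough sketch of that decision is below; the helper and parameter names (classify_tick, in_hardirq_beyond_tick) are hypothetical and only mirror the logic the comments describe.

	#include <stdio.h>
	#include <stdbool.h>

	/* One tick worth of time gets charged to exactly one bucket. */
	enum tick_bucket { BUCKET_USER, BUCKET_SYSTEM, BUCKET_IDLE };

	static enum tick_bucket classify_tick(bool user_tick, bool curr_is_idle_task,
					      bool in_hardirq_beyond_tick)
	{
		if (user_tick)
			return BUCKET_USER;	/* account_user_time() path */
		if (!curr_is_idle_task || in_hardirq_beyond_tick)
			return BUCKET_SYSTEM;	/* account_system_time() path */
		return BUCKET_IDLE;		/* account_idle_time() path */
	}

	int main(void)
	{
		printf("user=%d system=%d idle=%d\n",
		       classify_tick(true, false, false),
		       classify_tick(false, false, false),
		       classify_tick(false, true, false));
		return 0;
	}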
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df09782025c..d1c7bf7c7e5b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -17,9 +17,6 @@ | |||
17 | */ | 17 | */ |
18 | #include "sched.h" | 18 | #include "sched.h" |
19 | 19 | ||
20 | #include <linux/slab.h> | ||
21 | #include <uapi/linux/sched/types.h> | ||
22 | |||
23 | struct dl_bandwidth def_dl_bandwidth; | 20 | struct dl_bandwidth def_dl_bandwidth; |
24 | 21 | ||
25 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | 22 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) |
@@ -87,7 +84,7 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | |||
87 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ | 84 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ |
88 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); | 85 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); |
89 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ | 86 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
90 | cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); | 87 | cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); |
91 | } | 88 | } |
92 | 89 | ||
93 | static inline | 90 | static inline |
@@ -101,7 +98,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | |||
101 | if (dl_rq->running_bw > old) | 98 | if (dl_rq->running_bw > old) |
102 | dl_rq->running_bw = 0; | 99 | dl_rq->running_bw = 0; |
103 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ | 100 | /* kick cpufreq (see the comment in kernel/sched/sched.h). */ |
104 | cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); | 101 | cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); |
105 | } | 102 | } |
106 | 103 | ||
107 | static inline | 104 | static inline |
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); | |||
514 | static void push_dl_tasks(struct rq *); | 511 | static void push_dl_tasks(struct rq *); |
515 | static void pull_dl_task(struct rq *); | 512 | static void pull_dl_task(struct rq *); |
516 | 513 | ||
517 | static inline void queue_push_tasks(struct rq *rq) | 514 | static inline void deadline_queue_push_tasks(struct rq *rq) |
518 | { | 515 | { |
519 | if (!has_pushable_dl_tasks(rq)) | 516 | if (!has_pushable_dl_tasks(rq)) |
520 | return; | 517 | return; |
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
522 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); | 519 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); |
523 | } | 520 | } |
524 | 521 | ||
525 | static inline void queue_pull_task(struct rq *rq) | 522 | static inline void deadline_queue_pull_task(struct rq *rq) |
526 | { | 523 | { |
527 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); | 524 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); |
528 | } | 525 | } |
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
539 | 536 | ||
540 | /* | 537 | /* |
541 | * If we cannot preempt any rq, fall back to pick any | 538 | * If we cannot preempt any rq, fall back to pick any |
542 | * online cpu. | 539 | * online CPU: |
543 | */ | 540 | */ |
544 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | 541 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); |
545 | if (cpu >= nr_cpu_ids) { | 542 | if (cpu >= nr_cpu_ids) { |
546 | /* | 543 | /* |
547 | * Fail to find any suitable cpu. | 544 | * Failed to find any suitable CPU. |
548 | * The task will never come back! | 545 | * The task will never come back! |
549 | */ | 546 | */ |
550 | BUG_ON(dl_bandwidth_enabled()); | 547 | BUG_ON(dl_bandwidth_enabled()); |
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) | |||
597 | { | 594 | { |
598 | } | 595 | } |
599 | 596 | ||
600 | static inline void queue_push_tasks(struct rq *rq) | 597 | static inline void deadline_queue_push_tasks(struct rq *rq) |
601 | { | 598 | { |
602 | } | 599 | } |
603 | 600 | ||
604 | static inline void queue_pull_task(struct rq *rq) | 601 | static inline void deadline_queue_pull_task(struct rq *rq) |
605 | { | 602 | { |
606 | } | 603 | } |
607 | #endif /* CONFIG_SMP */ | 604 | #endif /* CONFIG_SMP */ |
608 | 605 | ||
609 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 606 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
610 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 607 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
611 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | 608 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); |
612 | int flags); | ||
613 | 609 | ||
614 | /* | 610 | /* |
615 | * We are being explicitly informed that a new instance is starting, | 611 | * We are being explicitly informed that a new instance is starting, |
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1763 | if (hrtick_enabled(rq)) | 1759 | if (hrtick_enabled(rq)) |
1764 | start_hrtick_dl(rq, p); | 1760 | start_hrtick_dl(rq, p); |
1765 | 1761 | ||
1766 | queue_push_tasks(rq); | 1762 | deadline_queue_push_tasks(rq); |
1767 | 1763 | ||
1768 | return p; | 1764 | return p; |
1769 | } | 1765 | } |
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | |||
1776 | enqueue_pushable_dl_task(rq, p); | 1772 | enqueue_pushable_dl_task(rq, p); |
1777 | } | 1773 | } |
1778 | 1774 | ||
1775 | /* | ||
1776 | * scheduler tick hitting a task of our scheduling class. | ||
1777 | * | ||
1778 | * NOTE: This function can be called remotely by the tick offload that | ||
1779 | * goes along full dynticks. Therefore no local assumption can be made | ||
1780 | * and everything must be accessed through the @rq and @curr passed in | ||
1781 | * parameters. | ||
1782 | */ | ||
1779 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | 1783 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) |
1780 | { | 1784 | { |
1781 | update_curr_dl(rq); | 1785 | update_curr_dl(rq); |
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) | |||
1865 | 1869 | ||
1866 | /* | 1870 | /* |
1867 | * We have to consider system topology and task affinity | 1871 | * We have to consider system topology and task affinity |
1868 | * first, then we can look for a suitable cpu. | 1872 | * first, then we can look for a suitable CPU. |
1869 | */ | 1873 | */ |
1870 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) | 1874 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) |
1871 | return -1; | 1875 | return -1; |
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) | |||
1879 | * Now we check how well this matches with task's | 1883 | * Now we check how well this matches with task's |
1880 | * affinity and system topology. | 1884 | * affinity and system topology. |
1881 | * | 1885 | * |
1882 | * The last cpu where the task ran is our first | 1886 | * The last CPU where the task ran is our first |
1883 | * guess, since it is most likely cache-hot there. | 1887 | * guess, since it is most likely cache-hot there. |
1884 | */ | 1888 | */ |
1885 | if (cpumask_test_cpu(cpu, later_mask)) | 1889 | if (cpumask_test_cpu(cpu, later_mask)) |
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) | |||
1909 | best_cpu = cpumask_first_and(later_mask, | 1913 | best_cpu = cpumask_first_and(later_mask, |
1910 | sched_domain_span(sd)); | 1914 | sched_domain_span(sd)); |
1911 | /* | 1915 | /* |
1912 | * Last chance: if a cpu that is in both later_mask | 1916 | * Last chance: if a CPU that is in both later_mask |
1913 | * and current sd span is valid, that becomes our | 1917 | * and current sd span is valid, that becomes our |
1914 | * choice. Of course, the latest possible cpu is | 1918 | * choice. Of course, the latest possible CPU is |
1915 | * already under consideration through later_mask. | 1919 | * already under consideration through later_mask. |
1916 | */ | 1920 | */ |
1917 | if (best_cpu < nr_cpu_ids) { | 1921 | if (best_cpu < nr_cpu_ids) { |
@@ -2067,7 +2071,7 @@ retry: | |||
2067 | if (task == next_task) { | 2071 | if (task == next_task) { |
2068 | /* | 2072 | /* |
2069 | * The task is still there. We don't try | 2073 | * The task is still there. We don't try |
2070 | * again, some other cpu will pull it when ready. | 2074 | * again, some other CPU will pull it when ready. |
2071 | */ | 2075 | */ |
2072 | goto out; | 2076 | goto out; |
2073 | } | 2077 | } |
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
2300 | /* | 2304 | /* |
2301 | * Since this might be the only -deadline task on the rq, | 2305 | * Since this might be the only -deadline task on the rq, |
2302 | * this is the right place to try to pull some other one | 2306 | * this is the right place to try to pull some other one |
2303 | * from an overloaded cpu, if any. | 2307 | * from an overloaded CPU, if any. |
2304 | */ | 2308 | */ |
2305 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) | 2309 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
2306 | return; | 2310 | return; |
2307 | 2311 | ||
2308 | queue_pull_task(rq); | 2312 | deadline_queue_pull_task(rq); |
2309 | } | 2313 | } |
2310 | 2314 | ||
2311 | /* | 2315 | /* |
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
2327 | if (rq->curr != p) { | 2331 | if (rq->curr != p) { |
2328 | #ifdef CONFIG_SMP | 2332 | #ifdef CONFIG_SMP |
2329 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) | 2333 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) |
2330 | queue_push_tasks(rq); | 2334 | deadline_queue_push_tasks(rq); |
2331 | #endif | 2335 | #endif |
2332 | if (dl_task(rq->curr)) | 2336 | if (dl_task(rq->curr)) |
2333 | check_preempt_curr_dl(rq, p, 0); | 2337 | check_preempt_curr_dl(rq, p, 0); |
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
2352 | * or lowering its prio, so... | 2356 | * or lowering its prio, so... |
2353 | */ | 2357 | */ |
2354 | if (!rq->dl.overloaded) | 2358 | if (!rq->dl.overloaded) |
2355 | queue_pull_task(rq); | 2359 | deadline_queue_pull_task(rq); |
2356 | 2360 | ||
2357 | /* | 2361 | /* |
2358 | * If we now have an earlier deadline task than p, | 2362 | * If we now have an earlier deadline task than p, |
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p) | |||
2626 | { | 2630 | { |
2627 | struct sched_dl_entity *dl_se = &p->dl; | 2631 | struct sched_dl_entity *dl_se = &p->dl; |
2628 | 2632 | ||
2629 | dl_se->dl_runtime = 0; | 2633 | dl_se->dl_runtime = 0; |
2630 | dl_se->dl_deadline = 0; | 2634 | dl_se->dl_deadline = 0; |
2631 | dl_se->dl_period = 0; | 2635 | dl_se->dl_period = 0; |
2632 | dl_se->flags = 0; | 2636 | dl_se->flags = 0; |
2633 | dl_se->dl_bw = 0; | 2637 | dl_se->dl_bw = 0; |
2634 | dl_se->dl_density = 0; | 2638 | dl_se->dl_density = 0; |
2635 | 2639 | ||
2636 | dl_se->dl_throttled = 0; | 2640 | dl_se->dl_throttled = 0; |
2637 | dl_se->dl_yielded = 0; | 2641 | dl_se->dl_yielded = 0; |
2638 | dl_se->dl_non_contending = 0; | 2642 | dl_se->dl_non_contending = 0; |
2639 | dl_se->dl_overrun = 0; | 2643 | dl_se->dl_overrun = 0; |
2640 | } | 2644 | } |
2641 | 2645 | ||
2642 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | 2646 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | |||
2655 | #ifdef CONFIG_SMP | 2659 | #ifdef CONFIG_SMP |
2656 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) | 2660 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) |
2657 | { | 2661 | { |
2658 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | 2662 | unsigned int dest_cpu; |
2659 | cs_cpus_allowed); | ||
2660 | struct dl_bw *dl_b; | 2663 | struct dl_bw *dl_b; |
2661 | bool overflow; | 2664 | bool overflow; |
2662 | int cpus, ret; | 2665 | int cpus, ret; |
2663 | unsigned long flags; | 2666 | unsigned long flags; |
2664 | 2667 | ||
2668 | dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); | ||
2669 | |||
2665 | rcu_read_lock_sched(); | 2670 | rcu_read_lock_sched(); |
2666 | dl_b = dl_bw_of(dest_cpu); | 2671 | dl_b = dl_bw_of(dest_cpu); |
2667 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 2672 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
2668 | cpus = dl_bw_cpus(dest_cpu); | 2673 | cpus = dl_bw_cpus(dest_cpu); |
2669 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | 2674 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); |
2670 | if (overflow) | 2675 | if (overflow) { |
2671 | ret = -EBUSY; | 2676 | ret = -EBUSY; |
2672 | else { | 2677 | } else { |
2673 | /* | 2678 | /* |
2674 | * We reserve space for this task in the destination | 2679 | * We reserve space for this task in the destination |
2675 | * root_domain, as we can't fail after this point. | 2680 | * root_domain, as we can't fail after this point. |
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo | |||
2681 | } | 2686 | } |
2682 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2687 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
2683 | rcu_read_unlock_sched(); | 2688 | rcu_read_unlock_sched(); |
2689 | |||
2684 | return ret; | 2690 | return ret; |
2685 | } | 2691 | } |
2686 | 2692 | ||
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | |||
2701 | ret = 0; | 2707 | ret = 0; |
2702 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | 2708 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); |
2703 | rcu_read_unlock_sched(); | 2709 | rcu_read_unlock_sched(); |
2710 | |||
2704 | return ret; | 2711 | return ret; |
2705 | } | 2712 | } |
2706 | 2713 | ||
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) | |||
2718 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 2725 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
2719 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2726 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
2720 | rcu_read_unlock_sched(); | 2727 | rcu_read_unlock_sched(); |
2728 | |||
2721 | return overflow; | 2729 | return overflow; |
2722 | } | 2730 | } |
2723 | #endif | 2731 | #endif |
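The dl_task_can_attach() change above keeps the same admission test and only restructures the if/else around 'overflow'. As a reminder of what that test computes, here is a hedged sketch of the bandwidth check: a reservation fits as long as the per-CPU cap times the number of CPUs in the root domain still covers the already admitted bandwidth plus the new request. The names (dl_bw_sketch, dl_overflow_sketch) and the fixed-point values in main() are assumptions for the example.

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct dl_bw_sketch {
		int64_t bw;		/* per-CPU cap, -1 means unlimited */
		int64_t total_bw;	/* bandwidth already admitted in the domain */
	};

	/* true: the request does not fit and should be rejected with -EBUSY */
	static bool dl_overflow_sketch(const struct dl_bw_sketch *b, int cpus,
				       int64_t old_bw, int64_t new_bw)
	{
		return b->bw != -1 && b->bw * cpus < b->total_bw - old_bw + new_bw;
	}

	int main(void)
	{
		/* 1.0 CPU cap per CPU, 1.5 CPUs worth already admitted. */
		struct dl_bw_sketch b = { .bw = 1 << 20, .total_bw = 3 << 19 };

		/* Adding another 0.5 CPUs worth on a 2-CPU domain still fits. */
		printf("overflow: %d\n", dl_overflow_sketch(&b, 2, 0, 1 << 19));
		return 0;
	}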
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 72c401b3b15c..15b10e210a6b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/debug.c | 2 | * kernel/sched/debug.c |
3 | * | 3 | * |
4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree and other debugging details |
5 | * | 5 | * |
6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar | 6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar |
7 | * | 7 | * |
@@ -9,16 +9,6 @@ | |||
9 | * it under the terms of the GNU General Public License version 2 as | 9 | * it under the terms of the GNU General Public License version 2 as |
10 | * published by the Free Software Foundation. | 10 | * published by the Free Software Foundation. |
11 | */ | 11 | */ |
12 | |||
13 | #include <linux/proc_fs.h> | ||
14 | #include <linux/sched/mm.h> | ||
15 | #include <linux/sched/task.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/kallsyms.h> | ||
18 | #include <linux/utsname.h> | ||
19 | #include <linux/mempolicy.h> | ||
20 | #include <linux/debugfs.h> | ||
21 | |||
22 | #include "sched.h" | 12 | #include "sched.h" |
23 | 13 | ||
24 | static DEFINE_SPINLOCK(sched_debug_lock); | 14 | static DEFINE_SPINLOCK(sched_debug_lock); |
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
274 | if (table == NULL) | 264 | if (table == NULL) |
275 | return NULL; | 265 | return NULL; |
276 | 266 | ||
277 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 267 | set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
278 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 268 | set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
279 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 269 | set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
280 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 270 | set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
281 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 271 | set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
282 | sizeof(int), 0644, proc_dointvec_minmax, true); | 272 | set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
283 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 273 | set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
284 | sizeof(int), 0644, proc_dointvec_minmax, true); | 274 | set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); |
285 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 275 | set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); |
286 | sizeof(int), 0644, proc_dointvec_minmax, true); | 276 | set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); |
287 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 277 | set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); |
288 | sizeof(int), 0644, proc_dointvec_minmax, true); | 278 | set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); |
289 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 279 | set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
290 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
291 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
292 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
293 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
294 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
295 | set_table_entry(&table[9], "cache_nice_tries", | ||
296 | &sd->cache_nice_tries, | ||
297 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
298 | set_table_entry(&table[10], "flags", &sd->flags, | ||
299 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
300 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
301 | &sd->max_newidle_lb_cost, | ||
302 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
303 | set_table_entry(&table[12], "name", sd->name, | ||
304 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
305 | /* &table[13] is terminator */ | 280 | /* &table[13] is terminator */ |
306 | 281 | ||
307 | return table; | 282 | return table; |
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
332 | return table; | 307 | return table; |
333 | } | 308 | } |
334 | 309 | ||
335 | static cpumask_var_t sd_sysctl_cpus; | 310 | static cpumask_var_t sd_sysctl_cpus; |
336 | static struct ctl_table_header *sd_sysctl_header; | 311 | static struct ctl_table_header *sd_sysctl_header; |
337 | 312 | ||
338 | void register_sched_domain_sysctl(void) | 313 | void register_sched_domain_sysctl(void) |
339 | { | 314 | { |
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
413 | { | 388 | { |
414 | struct sched_entity *se = tg->se[cpu]; | 389 | struct sched_entity *se = tg->se[cpu]; |
415 | 390 | ||
416 | #define P(F) \ | 391 | #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
417 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 392 | #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) |
418 | #define P_SCHEDSTAT(F) \ | 393 | #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
419 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | 394 | #define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) |
420 | #define PN(F) \ | ||
421 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
422 | #define PN_SCHEDSTAT(F) \ | ||
423 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
424 | 395 | ||
425 | if (!se) | 396 | if (!se) |
426 | return; | 397 | return; |
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
428 | PN(se->exec_start); | 399 | PN(se->exec_start); |
429 | PN(se->vruntime); | 400 | PN(se->vruntime); |
430 | PN(se->sum_exec_runtime); | 401 | PN(se->sum_exec_runtime); |
402 | |||
431 | if (schedstat_enabled()) { | 403 | if (schedstat_enabled()) { |
432 | PN_SCHEDSTAT(se->statistics.wait_start); | 404 | PN_SCHEDSTAT(se->statistics.wait_start); |
433 | PN_SCHEDSTAT(se->statistics.sleep_start); | 405 | PN_SCHEDSTAT(se->statistics.sleep_start); |
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
440 | PN_SCHEDSTAT(se->statistics.wait_sum); | 412 | PN_SCHEDSTAT(se->statistics.wait_sum); |
441 | P_SCHEDSTAT(se->statistics.wait_count); | 413 | P_SCHEDSTAT(se->statistics.wait_count); |
442 | } | 414 | } |
415 | |||
443 | P(se->load.weight); | 416 | P(se->load.weight); |
444 | P(se->runnable_weight); | 417 | P(se->runnable_weight); |
445 | #ifdef CONFIG_SMP | 418 | #ifdef CONFIG_SMP |
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) | |||
464 | return group_path; | 437 | return group_path; |
465 | 438 | ||
466 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 439 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
440 | |||
467 | return group_path; | 441 | return group_path; |
468 | } | 442 | } |
469 | #endif | 443 | #endif |
@@ -569,6 +543,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
569 | cfs_rq->avg.runnable_load_avg); | 543 | cfs_rq->avg.runnable_load_avg); |
570 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", | 544 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", |
571 | cfs_rq->avg.util_avg); | 545 | cfs_rq->avg.util_avg); |
546 | SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued", | ||
547 | cfs_rq->avg.util_est.enqueued); | ||
572 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", | 548 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", |
573 | cfs_rq->removed.load_avg); | 549 | cfs_rq->removed.load_avg); |
574 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", | 550 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", |
@@ -804,9 +780,9 @@ void sysrq_sched_debug_show(void) | |||
804 | /* | 780 | /* |
805 | * This iterator needs some explanation. | 781 | * This iterator needs some explanation. |
806 | * It returns 1 for the header position. | 782 | * It returns 1 for the header position. |
807 | * This means 2 is cpu 0. | 783 | * This means 2 is CPU 0. |
808 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 784 | * In a hotplugged system some CPUs, including CPU 0, may be missing so we have |
809 | * to use cpumask_* to iterate over the cpus. | 785 | * to use cpumask_* to iterate over the CPUs. |
810 | */ | 786 | */ |
811 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) | 787 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) |
812 | { | 788 | { |
@@ -826,6 +802,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) | |||
826 | 802 | ||
827 | if (n < nr_cpu_ids) | 803 | if (n < nr_cpu_ids) |
828 | return (void *)(unsigned long)(n + 2); | 804 | return (void *)(unsigned long)(n + 2); |
805 | |||
829 | return NULL; | 806 | return NULL; |
830 | } | 807 | } |
831 | 808 | ||
@@ -840,10 +817,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) | |||
840 | } | 817 | } |
841 | 818 | ||
842 | static const struct seq_operations sched_debug_sops = { | 819 | static const struct seq_operations sched_debug_sops = { |
843 | .start = sched_debug_start, | 820 | .start = sched_debug_start, |
844 | .next = sched_debug_next, | 821 | .next = sched_debug_next, |
845 | .stop = sched_debug_stop, | 822 | .stop = sched_debug_stop, |
846 | .show = sched_debug_show, | 823 | .show = sched_debug_show, |
847 | }; | 824 | }; |
848 | 825 | ||
849 | static int sched_debug_release(struct inode *inode, struct file *file) | 826 | static int sched_debug_release(struct inode *inode, struct file *file) |
@@ -881,14 +858,10 @@ static int __init init_sched_debug_procfs(void) | |||
881 | 858 | ||
882 | __initcall(init_sched_debug_procfs); | 859 | __initcall(init_sched_debug_procfs); |
883 | 860 | ||
884 | #define __P(F) \ | 861 | #define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
885 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 862 | #define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
886 | #define P(F) \ | 863 | #define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
887 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 864 | #define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
888 | #define __PN(F) \ | ||
889 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
890 | #define PN(F) \ | ||
891 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
892 | 865 | ||
893 | 866 | ||
894 | #ifdef CONFIG_NUMA_BALANCING | 867 | #ifdef CONFIG_NUMA_BALANCING |
@@ -1023,6 +996,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, | |||
1023 | P(se.avg.runnable_load_avg); | 996 | P(se.avg.runnable_load_avg); |
1024 | P(se.avg.util_avg); | 997 | P(se.avg.util_avg); |
1025 | P(se.avg.last_update_time); | 998 | P(se.avg.last_update_time); |
999 | P(se.avg.util_est.ewma); | ||
1000 | P(se.avg.util_est.enqueued); | ||
1026 | #endif | 1001 | #endif |
1027 | P(policy); | 1002 | P(policy); |
1028 | P(prio); | 1003 | P(prio); |
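The two debug fields added above, util_est.enqueued and util_est.ewma, expose the new utilization estimation signal. To make them concrete, below is a small sketch of an util_est-style update: 'enqueued' records the last utilization sample observed at enqueue time, while 'ewma' tracks a slow-moving average of those samples. The 1/4 weight and the names util_est_sketch/util_est_update_sketch are assumptions for the illustration, not the kernel's implementation.

	#include <stdio.h>

	struct util_est_sketch {
		unsigned int enqueued;	/* last utilization sample seen at enqueue */
		unsigned int ewma;	/* slow-moving average of those samples */
	};

	#define UTIL_EST_WEIGHT_SHIFT	2	/* new sample weighs 1/4 in this sketch */

	static void util_est_update_sketch(struct util_est_sketch *ue, unsigned int sample)
	{
		int delta = (int)sample - (int)ue->ewma;

		ue->enqueued = sample;
		/* ewma += (sample - ewma) / 4 */
		ue->ewma = (unsigned int)((int)ue->ewma + delta / (1 << UTIL_EST_WEIGHT_SHIFT));
	}

	int main(void)
	{
		struct util_est_sketch ue = { .enqueued = 0, .ewma = 400 };

		util_est_update_sketch(&ue, 800);
		printf("enqueued=%u ewma=%u\n", ue.enqueued, ue.ewma);	/* 800, 500 */
		return 0;
	}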
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eb3ffc9be84..0951d1c58d2f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -20,25 +20,10 @@ | |||
20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | 20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra |
21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | 21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
22 | */ | 22 | */ |
23 | 23 | #include "sched.h" | |
24 | #include <linux/sched/mm.h> | ||
25 | #include <linux/sched/topology.h> | ||
26 | |||
27 | #include <linux/latencytop.h> | ||
28 | #include <linux/cpumask.h> | ||
29 | #include <linux/cpuidle.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/profile.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/mempolicy.h> | ||
34 | #include <linux/migrate.h> | ||
35 | #include <linux/task_work.h> | ||
36 | #include <linux/sched/isolation.h> | ||
37 | 24 | ||
38 | #include <trace/events/sched.h> | 25 | #include <trace/events/sched.h> |
39 | 26 | ||
40 | #include "sched.h" | ||
41 | |||
42 | /* | 27 | /* |
43 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
44 | * | 29 | * |
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
103 | 88 | ||
104 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
105 | /* | 90 | /* |
106 | * For asym packing, by default the lower numbered cpu has higher priority. | 91 | * For asym packing, by default the lower numbered CPU has higher priority. |
107 | */ | 92 | */ |
108 | int __weak arch_asym_cpu_priority(int cpu) | 93 | int __weak arch_asym_cpu_priority(int cpu) |
109 | { | 94 | { |
@@ -787,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
787 | * For !fair tasks do: | 772 | * For !fair tasks do: |
788 | * | 773 | * |
789 | update_cfs_rq_load_avg(now, cfs_rq); | 774 | update_cfs_rq_load_avg(now, cfs_rq); |
790 | attach_entity_load_avg(cfs_rq, se); | 775 | attach_entity_load_avg(cfs_rq, se, 0); |
791 | switched_from_fair(rq, p); | 776 | switched_from_fair(rq, p); |
792 | * | 777 | * |
793 | * such that the next switched_to_fair() has the | 778 | * such that the next switched_to_fair() has the |
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
1181 | } | 1166 | } |
1182 | 1167 | ||
1183 | /* | 1168 | /* |
1184 | * The averaged statistics, shared & private, memory & cpu, | 1169 | * The averaged statistics, shared & private, memory & CPU, |
1185 | * occupy the first half of the array. The second half of the | 1170 | * occupy the first half of the array. The second half of the |
1186 | * array is for current counters, which are averaged into the | 1171 | * array is for current counters, which are averaged into the |
1187 | * first set by task_numa_placement. | 1172 | * first set by task_numa_placement. |
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1587 | * be incurred if the tasks were swapped. | 1572 | * be incurred if the tasks were swapped. |
1588 | */ | 1573 | */ |
1589 | if (cur) { | 1574 | if (cur) { |
1590 | /* Skip this swap candidate if cannot move to the source cpu */ | 1575 | /* Skip this swap candidate if cannot move to the source CPU: */ |
1591 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1576 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) |
1592 | goto unlock; | 1577 | goto unlock; |
1593 | 1578 | ||
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1631 | goto balance; | 1616 | goto balance; |
1632 | } | 1617 | } |
1633 | 1618 | ||
1634 | /* Balance doesn't matter much if we're running a task per cpu */ | 1619 | /* Balance doesn't matter much if we're running a task per CPU: */ |
1635 | if (imp > env->best_imp && src_rq->nr_running == 1 && | 1620 | if (imp > env->best_imp && src_rq->nr_running == 1 && |
1636 | dst_rq->nr_running == 1) | 1621 | dst_rq->nr_running == 1) |
1637 | goto assign; | 1622 | goto assign; |
@@ -1676,7 +1661,7 @@ balance: | |||
1676 | */ | 1661 | */ |
1677 | if (!cur) { | 1662 | if (!cur) { |
1678 | /* | 1663 | /* |
1679 | * select_idle_siblings() uses a per-cpu cpumask that | 1664 | * select_idle_siblings() uses a per-CPU cpumask that |
1680 | * can be used from IRQ context. | 1665 | * can be used from IRQ context. |
1681 | */ | 1666 | */ |
1682 | local_irq_disable(); | 1667 | local_irq_disable(); |
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1869 | static void numa_migrate_preferred(struct task_struct *p) | 1854 | static void numa_migrate_preferred(struct task_struct *p) |
1870 | { | 1855 | { |
1871 | unsigned long interval = HZ; | 1856 | unsigned long interval = HZ; |
1857 | unsigned long numa_migrate_retry; | ||
1872 | 1858 | ||
1873 | /* This task has no NUMA fault statistics yet */ | 1859 | /* This task has no NUMA fault statistics yet */ |
1874 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1860 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1876 | 1862 | ||
1877 | /* Periodically retry migrating the task to the preferred node */ | 1863 | /* Periodically retry migrating the task to the preferred node */ |
1878 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); | 1864 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); |
1879 | p->numa_migrate_retry = jiffies + interval; | 1865 | numa_migrate_retry = jiffies + interval; |
1866 | |||
1867 | /* | ||
1868 | * Check that the new retry threshold is after the current one. If | ||
1869 | * the retry is in the future, it implies that wake_affine has | ||
1870 | * temporarily asked NUMA balancing to backoff from placement. | ||
1871 | */ | ||
1872 | if (numa_migrate_retry > p->numa_migrate_retry) | ||
1873 | return; | ||
1874 | |||
1875 | /* Safe to try placing the task on the preferred node */ | ||
1876 | p->numa_migrate_retry = numa_migrate_retry; | ||
1880 | 1877 | ||
1881 | /* Success if task is already running on preferred CPU */ | 1878 | /* Success if task is already running on preferred CPU */ |
1882 | if (task_node(p) == p->numa_preferred_nid) | 1879 | if (task_node(p) == p->numa_preferred_nid) |
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio) | |||
2823 | } | 2820 | } |
2824 | 2821 | ||
2825 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2822 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2826 | # ifdef CONFIG_SMP | 2823 | #ifdef CONFIG_SMP |
2827 | /* | 2824 | /* |
2828 | * All this does is approximate the hierarchical proportion which includes that | 2825 | * All this does is approximate the hierarchical proportion which includes that |
2829 | * global sum we all love to hate. | 2826 | * global sum we all love to hate. |
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) | |||
2974 | 2971 | ||
2975 | return clamp_t(long, runnable, MIN_SHARES, shares); | 2972 | return clamp_t(long, runnable, MIN_SHARES, shares); |
2976 | } | 2973 | } |
2977 | # endif /* CONFIG_SMP */ | 2974 | #endif /* CONFIG_SMP */ |
2978 | 2975 | ||
2979 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2976 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
2980 | 2977 | ||
@@ -3012,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se) | |||
3012 | } | 3009 | } |
3013 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 3010 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
3014 | 3011 | ||
3015 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 3012 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) |
3016 | { | 3013 | { |
3017 | struct rq *rq = rq_of(cfs_rq); | 3014 | struct rq *rq = rq_of(cfs_rq); |
3018 | 3015 | ||
3019 | if (&rq->cfs == cfs_rq) { | 3016 | if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) { |
3020 | /* | 3017 | /* |
3021 | * There are a few boundary cases this might miss but it should | 3018 | * There are a few boundary cases this might miss but it should |
3022 | * get called often enough that that should (hopefully) not be | 3019 | * get called often enough that that should (hopefully) not be |
@@ -3031,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | |||
3031 | * | 3028 | * |
3032 | * See cpu_util(). | 3029 | * See cpu_util(). |
3033 | */ | 3030 | */ |
3034 | cpufreq_update_util(rq, 0); | 3031 | cpufreq_update_util(rq, flags); |
3035 | } | 3032 | } |
3036 | } | 3033 | } |
3037 | 3034 | ||
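The new 'flags' argument above lets callers pass a migration hint down to cpufreq: the governor is now poked either for the root cfs_rq or when a freshly migrated task is attached (see the DO_ATTACH hunk further below). A minimal user-space sketch of just that condition; the flag value here is an assumption for illustration, the real one lives in include/linux/sched/cpufreq.h:

	#include <stdbool.h>
	#include <stdio.h>

	/* Assumed value, for illustration only. */
	#define SCHED_CPUFREQ_MIGRATION  (1U << 1)

	/*
	 * Sketch of the condition in cfs_rq_util_change(): update cpufreq for
	 * the root cfs_rq, or when the caller passes the migration hint.
	 */
	static bool wants_cpufreq_update(bool is_root_cfs_rq, unsigned int flags)
	{
		return is_root_cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION);
	}

	int main(void)
	{
		printf("group cfs_rq, no hint : %d\n", wants_cpufreq_update(false, 0));
		printf("group cfs_rq, migrated: %d\n",
		       wants_cpufreq_update(false, SCHED_CPUFREQ_MIGRATION));
		printf("root cfs_rq           : %d\n", wants_cpufreq_update(true, 0));
		return 0;
	}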
@@ -3246,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna | |||
3246 | } | 3243 | } |
3247 | 3244 | ||
3248 | /* | 3245 | /* |
3246 | * When a task is dequeued, its estimated utilization should not be updated if | ||
3247 | * its util_avg has not been updated at least once. | ||
3248 | * This flag is used to synchronize util_avg updates with util_est updates. | ||
3249 | * We map this information into the LSB bit of the utilization saved at | ||
3250 | * dequeue time (i.e. util_est.dequeued). | ||
3251 | */ | ||
3252 | #define UTIL_AVG_UNCHANGED 0x1 | ||
3253 | |||
3254 | static inline void cfs_se_util_change(struct sched_avg *avg) | ||
3255 | { | ||
3256 | unsigned int enqueued; | ||
3257 | |||
3258 | if (!sched_feat(UTIL_EST)) | ||
3259 | return; | ||
3260 | |||
3261 | /* Avoid store if the flag has been already cleared */ | ||
3262 | enqueued = avg->util_est.enqueued; | ||
3263 | if (!(enqueued & UTIL_AVG_UNCHANGED)) | ||
3264 | return; | ||
3265 | |||
3266 | /* Reset flag to report util_avg has been updated */ | ||
3267 | enqueued &= ~UTIL_AVG_UNCHANGED; | ||
3268 | WRITE_ONCE(avg->util_est.enqueued, enqueued); | ||
3269 | } | ||
3270 | |||
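The UTIL_AVG_UNCHANGED bit rides in the LSB of util_est.enqueued: it is set when a (possibly stale) sample is stored and cleared by cfs_se_util_change() once PELT has actually updated util_avg, so util_est_dequeue() can tell whether a new sample is worth folding into the EWMA. A small stand-alone sketch of that flag handling, with illustrative values only:

	#include <assert.h>
	#include <stdio.h>

	#define UTIL_AVG_UNCHANGED 0x1   /* LSB of util_est.enqueued, as in the hunk */

	/* Tag a utilization sample as "util_avg not updated since dequeue". */
	static unsigned int util_est_tag(unsigned int util)
	{
		return util | UTIL_AVG_UNCHANGED;
	}

	/* Mirror of cfs_se_util_change(): clear the flag once PELT has moved. */
	static unsigned int util_est_clear_tag(unsigned int enqueued)
	{
		if (!(enqueued & UTIL_AVG_UNCHANGED))
			return enqueued;            /* flag already cleared: no store */
		return enqueued & ~UTIL_AVG_UNCHANGED;
	}

	int main(void)
	{
		unsigned int ue = util_est_tag(300);    /* 300 -> 301, flag set      */
		assert(ue & UTIL_AVG_UNCHANGED);

		ue = util_est_clear_tag(ue);            /* PELT updated: 301 -> 300  */
		assert(ue == 300);

		printf("enqueued=%u (the flag costs at most 1 unit of utilization)\n", ue);
		return 0;
	}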
3271 | /* | ||
3249 | * sched_entity: | 3272 | * sched_entity: |
3250 | * | 3273 | * |
3251 | * task: | 3274 | * task: |
@@ -3296,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit | |||
3296 | cfs_rq->curr == se)) { | 3319 | cfs_rq->curr == se)) { |
3297 | 3320 | ||
3298 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 3321 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
3322 | cfs_se_util_change(&se->avg); | ||
3299 | return 1; | 3323 | return 1; |
3300 | } | 3324 | } |
3301 | 3325 | ||
@@ -3350,7 +3374,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | |||
3350 | } | 3374 | } |
3351 | 3375 | ||
3352 | /* | 3376 | /* |
3353 | * Called within set_task_rq() right before setting a task's cpu. The | 3377 | * Called within set_task_rq() right before setting a task's CPU. The |
3354 | * caller only guarantees p->pi_lock is held; no other assumptions, | 3378 | * caller only guarantees p->pi_lock is held; no other assumptions, |
3355 | * including the state of rq->lock, should be made. | 3379 | * including the state of rq->lock, should be made. |
3356 | */ | 3380 | */ |
@@ -3529,7 +3553,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf | |||
3529 | 3553 | ||
3530 | /* | 3554 | /* |
3531 | * runnable_sum can't be lower than running_sum | 3555 | * runnable_sum can't be lower than running_sum |
3532 | * As the running sum is scaled with cpu capacity whereas the runnable sum | 3556 | * As the running sum is scaled with CPU capacity whereas the runnable sum |
3533 | * is not, we rescale running_sum 1st | 3557 | * is not, we rescale running_sum 1st |
3534 | */ | 3558 | */ |
3535 | running_sum = se->avg.util_sum / | 3559 | running_sum = se->avg.util_sum / |
@@ -3689,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3689 | #endif | 3713 | #endif |
3690 | 3714 | ||
3691 | if (decayed) | 3715 | if (decayed) |
3692 | cfs_rq_util_change(cfs_rq); | 3716 | cfs_rq_util_change(cfs_rq, 0); |
3693 | 3717 | ||
3694 | return decayed; | 3718 | return decayed; |
3695 | } | 3719 | } |
@@ -3702,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3702 | * Must call update_cfs_rq_load_avg() before this, since we rely on | 3726 | * Must call update_cfs_rq_load_avg() before this, since we rely on |
3703 | * cfs_rq->avg.last_update_time being current. | 3727 | * cfs_rq->avg.last_update_time being current. |
3704 | */ | 3728 | */ |
3705 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3729 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
3706 | { | 3730 | { |
3707 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; | 3731 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; |
3708 | 3732 | ||
@@ -3738,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3738 | 3762 | ||
3739 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); | 3763 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); |
3740 | 3764 | ||
3741 | cfs_rq_util_change(cfs_rq); | 3765 | cfs_rq_util_change(cfs_rq, flags); |
3742 | } | 3766 | } |
3743 | 3767 | ||
3744 | /** | 3768 | /** |
@@ -3757,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3757 | 3781 | ||
3758 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); | 3782 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); |
3759 | 3783 | ||
3760 | cfs_rq_util_change(cfs_rq); | 3784 | cfs_rq_util_change(cfs_rq, 0); |
3761 | } | 3785 | } |
3762 | 3786 | ||
3763 | /* | 3787 | /* |
@@ -3787,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3787 | 3811 | ||
3788 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { | 3812 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { |
3789 | 3813 | ||
3790 | attach_entity_load_avg(cfs_rq, se); | 3814 | /* |
3815 | * DO_ATTACH means we're here from enqueue_entity(). | ||
3816 | * !last_update_time means we've passed through | ||
3817 | * migrate_task_rq_fair() indicating we migrated. | ||
3818 | * | ||
3819 | * IOW we're enqueueing a task on a new CPU. | ||
3820 | */ | ||
3821 | attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); | ||
3791 | update_tg_load_avg(cfs_rq, 0); | 3822 | update_tg_load_avg(cfs_rq, 0); |
3792 | 3823 | ||
3793 | } else if (decayed && (flags & UPDATE_TG)) | 3824 | } else if (decayed && (flags & UPDATE_TG)) |
@@ -3869,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | |||
3869 | 3900 | ||
3870 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); | 3901 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); |
3871 | 3902 | ||
3903 | static inline unsigned long task_util(struct task_struct *p) | ||
3904 | { | ||
3905 | return READ_ONCE(p->se.avg.util_avg); | ||
3906 | } | ||
3907 | |||
3908 | static inline unsigned long _task_util_est(struct task_struct *p) | ||
3909 | { | ||
3910 | struct util_est ue = READ_ONCE(p->se.avg.util_est); | ||
3911 | |||
3912 | return max(ue.ewma, ue.enqueued); | ||
3913 | } | ||
3914 | |||
3915 | static inline unsigned long task_util_est(struct task_struct *p) | ||
3916 | { | ||
3917 | return max(task_util(p), _task_util_est(p)); | ||
3918 | } | ||
3919 | |||
3920 | static inline void util_est_enqueue(struct cfs_rq *cfs_rq, | ||
3921 | struct task_struct *p) | ||
3922 | { | ||
3923 | unsigned int enqueued; | ||
3924 | |||
3925 | if (!sched_feat(UTIL_EST)) | ||
3926 | return; | ||
3927 | |||
3928 | /* Update root cfs_rq's estimated utilization */ | ||
3929 | enqueued = cfs_rq->avg.util_est.enqueued; | ||
3930 | enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); | ||
3931 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); | ||
3932 | } | ||
3933 | |||
3934 | /* | ||
3935 | * Check if a (signed) value is within a specified (unsigned) margin, | ||
3936 | * based on the observation that: | ||
3937 | * | ||
3938 | * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) | ||
3939 | * | ||
3940 | * NOTE: this only works when value + margin < INT_MAX. | ||
3941 | */ | ||
3942 | static inline bool within_margin(int value, int margin) | ||
3943 | { | ||
3944 | return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); | ||
3945 | } | ||
3946 | |||
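The within_margin() trick turns abs(value) < margin into a single unsigned comparison, valid while value + margin stays below INT_MAX. A quick user-space check that the identity holds over a small range:

	#include <assert.h>
	#include <stdio.h>
	#include <stdlib.h>

	/*
	 * Branch-free margin check used by util_est_dequeue():
	 *    abs(x) < y  :=  (unsigned)(x + y - 1) < (2 * y - 1)
	 */
	static int within_margin(int value, int margin)
	{
		return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
	}

	int main(void)
	{
		int margin = 10;   /* ~1% of SCHED_CAPACITY_SCALE (1024), as in the patch */

		for (int value = -30; value <= 30; value++)
			assert(within_margin(value, margin) == (abs(value) < margin));

		printf("within_margin() matches abs(value) < margin for all samples\n");
		return 0;
	}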
3947 | static void | ||
3948 | util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | ||
3949 | { | ||
3950 | long last_ewma_diff; | ||
3951 | struct util_est ue; | ||
3952 | |||
3953 | if (!sched_feat(UTIL_EST)) | ||
3954 | return; | ||
3955 | |||
3956 | /* | ||
3957 | * Update root cfs_rq's estimated utilization | ||
3958 | * | ||
3959 | * If *p is the last task then the root cfs_rq's estimated utilization | ||
3960 | * of a CPU is 0 by definition. | ||
3961 | */ | ||
3962 | ue.enqueued = 0; | ||
3963 | if (cfs_rq->nr_running) { | ||
3964 | ue.enqueued = cfs_rq->avg.util_est.enqueued; | ||
3965 | ue.enqueued -= min_t(unsigned int, ue.enqueued, | ||
3966 | (_task_util_est(p) | UTIL_AVG_UNCHANGED)); | ||
3967 | } | ||
3968 | WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); | ||
3969 | |||
3970 | /* | ||
3971 | * Skip update of task's estimated utilization when the task has not | ||
3972 | * yet completed an activation, e.g. being migrated. | ||
3973 | */ | ||
3974 | if (!task_sleep) | ||
3975 | return; | ||
3976 | |||
3977 | /* | ||
3978 | * If the PELT values haven't changed since enqueue time, | ||
3979 | * skip the util_est update. | ||
3980 | */ | ||
3981 | ue = p->se.avg.util_est; | ||
3982 | if (ue.enqueued & UTIL_AVG_UNCHANGED) | ||
3983 | return; | ||
3984 | |||
3985 | /* | ||
3986 | * Skip update of task's estimated utilization when its EWMA is | ||
3987 | * already ~1% close to its last activation value. | ||
3988 | */ | ||
3989 | ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); | ||
3990 | last_ewma_diff = ue.enqueued - ue.ewma; | ||
3991 | if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) | ||
3992 | return; | ||
3993 | |||
3994 | /* | ||
3995 | * Update Task's estimated utilization | ||
3996 | * | ||
3997 | * When *p completes an activation we can consolidate another sample | ||
3998 | * of the task size. This is done by storing the current PELT value | ||
3999 | * as ue.enqueued and by using this value to update the Exponential | ||
4000 | * Weighted Moving Average (EWMA): | ||
4001 | * | ||
4002 | * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) | ||
4003 | * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) | ||
4004 | * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) | ||
4005 | * = w * ( last_ewma_diff ) + ewma(t-1) | ||
4006 | * = w * (last_ewma_diff + ewma(t-1) / w) | ||
4007 | * | ||
4008 | * Where 'w' is the weight of new samples, which is configured to be | ||
4009 | * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) | ||
4010 | */ | ||
4011 | ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; | ||
4012 | ue.ewma += last_ewma_diff; | ||
4013 | ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; | ||
4014 | WRITE_ONCE(p->se.avg.util_est, ue); | ||
4015 | } | ||
4016 | |||
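The EWMA update above keeps everything in integer shifts: with w = 1/4 the new estimate is ewma + (sample - ewma) / 4. A stand-alone sketch of exactly that arithmetic; UTIL_EST_WEIGHT_SHIFT = 2 is an assumption here, implied by the w = 1/4 stated in the comment:

	#include <stdio.h>

	#define UTIL_EST_WEIGHT_SHIFT 2   /* implied by w = 1/4; assumed value */

	/* ewma(t) = ewma(t-1) + (sample - ewma(t-1)) / 4, done with shifts. */
	static unsigned int util_est_ewma(unsigned int ewma, unsigned int sample)
	{
		long diff = (long)sample - (long)ewma;      /* last_ewma_diff */
		long acc  = ((long)ewma << UTIL_EST_WEIGHT_SHIFT) + diff;

		return (unsigned int)(acc >> UTIL_EST_WEIGHT_SHIFT);
	}

	int main(void)
	{
		unsigned int ewma = 100;

		/* A task that suddenly runs "big" converges toward the new size. */
		for (int i = 0; i < 6; i++) {
			ewma = util_est_ewma(ewma, 800);
			printf("activation %d: ewma = %u\n", i + 1, ewma);
		}
		return 0;
	}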
3872 | #else /* CONFIG_SMP */ | 4017 | #else /* CONFIG_SMP */ |
3873 | 4018 | ||
3874 | static inline int | 4019 | static inline int |
@@ -3883,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3883 | 4028 | ||
3884 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) | 4029 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) |
3885 | { | 4030 | { |
3886 | cfs_rq_util_change(cfs_rq); | 4031 | cfs_rq_util_change(cfs_rq, 0); |
3887 | } | 4032 | } |
3888 | 4033 | ||
3889 | static inline void remove_entity_load_avg(struct sched_entity *se) {} | 4034 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
3890 | 4035 | ||
3891 | static inline void | 4036 | static inline void |
3892 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 4037 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} |
3893 | static inline void | 4038 | static inline void |
3894 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 4039 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
3895 | 4040 | ||
@@ -3898,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf) | |||
3898 | return 0; | 4043 | return 0; |
3899 | } | 4044 | } |
3900 | 4045 | ||
4046 | static inline void | ||
4047 | util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} | ||
4048 | |||
4049 | static inline void | ||
4050 | util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, | ||
4051 | bool task_sleep) {} | ||
4052 | |||
3901 | #endif /* CONFIG_SMP */ | 4053 | #endif /* CONFIG_SMP */ |
3902 | 4054 | ||
3903 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) | 4055 | static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -4676,7 +4828,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
4676 | if (!se) | 4828 | if (!se) |
4677 | add_nr_running(rq, task_delta); | 4829 | add_nr_running(rq, task_delta); |
4678 | 4830 | ||
4679 | /* determine whether we need to wake up potentially idle cpu */ | 4831 | /* Determine whether we need to wake up potentially idle CPU: */ |
4680 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 4832 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
4681 | resched_curr(rq); | 4833 | resched_curr(rq); |
4682 | } | 4834 | } |
@@ -5041,7 +5193,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
5041 | } | 5193 | } |
5042 | 5194 | ||
5043 | /* | 5195 | /* |
5044 | * Both these cpu hotplug callbacks race against unregister_fair_sched_group() | 5196 | * Both these CPU hotplug callbacks race against unregister_fair_sched_group() |
5045 | * | 5197 | * |
5046 | * The race is harmless, since modifying bandwidth settings of unhooked group | 5198 | * The race is harmless, since modifying bandwidth settings of unhooked group |
5047 | * bits doesn't do much. | 5199 | * bits doesn't do much. |
@@ -5086,7 +5238,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
5086 | */ | 5238 | */ |
5087 | cfs_rq->runtime_remaining = 1; | 5239 | cfs_rq->runtime_remaining = 1; |
5088 | /* | 5240 | /* |
5089 | * Offline rq is schedulable till cpu is completely disabled | 5241 | * Offline rq is schedulable till CPU is completely disabled |
5090 | * in take_cpu_down(), so we prevent new cfs throttling here. | 5242 | * in take_cpu_down(), so we prevent new cfs throttling here. |
5091 | */ | 5243 | */ |
5092 | cfs_rq->runtime_enabled = 0; | 5244 | cfs_rq->runtime_enabled = 0; |
@@ -5245,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
5245 | if (!se) | 5397 | if (!se) |
5246 | add_nr_running(rq, 1); | 5398 | add_nr_running(rq, 1); |
5247 | 5399 | ||
5400 | util_est_enqueue(&rq->cfs, p); | ||
5248 | hrtick_update(rq); | 5401 | hrtick_update(rq); |
5249 | } | 5402 | } |
5250 | 5403 | ||
@@ -5304,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
5304 | if (!se) | 5457 | if (!se) |
5305 | sub_nr_running(rq, 1); | 5458 | sub_nr_running(rq, 1); |
5306 | 5459 | ||
5460 | util_est_dequeue(&rq->cfs, p, task_sleep); | ||
5307 | hrtick_update(rq); | 5461 | hrtick_update(rq); |
5308 | } | 5462 | } |
5309 | 5463 | ||
@@ -5323,8 +5477,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | |||
5323 | * | 5477 | * |
5324 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | 5478 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load |
5325 | * | 5479 | * |
5326 | * If a cpu misses updates for n ticks (as it was idle) and update gets | 5480 | * If a CPU misses updates for n ticks (as it was idle) and update gets |
5327 | * called on the n+1-th tick when cpu may be busy, then we have: | 5481 | * called on the n+1-th tick when CPU may be busy, then we have: |
5328 | * | 5482 | * |
5329 | * load_n = (1 - 1/2^i)^n * load_0 | 5483 | * load_n = (1 - 1/2^i)^n * load_0 |
5330 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | 5484 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load |
@@ -5379,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
5379 | } | 5533 | } |
5380 | return load; | 5534 | return load; |
5381 | } | 5535 | } |
5536 | |||
5537 | static struct { | ||
5538 | cpumask_var_t idle_cpus_mask; | ||
5539 | atomic_t nr_cpus; | ||
5540 | int has_blocked; /* Idle CPUs have blocked load */ | ||
5541 | unsigned long next_balance; /* in jiffy units */ | ||
5542 | unsigned long next_blocked; /* Next update of blocked load in jiffies */ | ||
5543 | } nohz ____cacheline_aligned; | ||
5544 | |||
5382 | #endif /* CONFIG_NO_HZ_COMMON */ | 5545 | #endif /* CONFIG_NO_HZ_COMMON */ |
5383 | 5546 | ||
5384 | /** | 5547 | /** |
@@ -5468,7 +5631,7 @@ static unsigned long weighted_cpuload(struct rq *rq) | |||
5468 | #ifdef CONFIG_NO_HZ_COMMON | 5631 | #ifdef CONFIG_NO_HZ_COMMON |
5469 | /* | 5632 | /* |
5470 | * There is no sane way to deal with nohz on smp when using jiffies because the | 5633 | * There is no sane way to deal with nohz on smp when using jiffies because the |
5471 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 5634 | * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading |
5472 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | 5635 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. |
5473 | * | 5636 | * |
5474 | * Therefore we need to avoid the delta approach from the regular tick when | 5637 | * Therefore we need to avoid the delta approach from the regular tick when |
@@ -5579,7 +5742,7 @@ void cpu_load_update_active(struct rq *this_rq) | |||
5579 | } | 5742 | } |
5580 | 5743 | ||
5581 | /* | 5744 | /* |
5582 | * Return a low guess at the load of a migration-source cpu weighted | 5745 | * Return a low guess at the load of a migration-source CPU weighted |
5583 | * according to the scheduling class and "nice" value. | 5746 | * according to the scheduling class and "nice" value. |
5584 | * | 5747 | * |
5585 | * We want to under-estimate the load of migration sources, to | 5748 | * We want to under-estimate the load of migration sources, to |
@@ -5597,7 +5760,7 @@ static unsigned long source_load(int cpu, int type) | |||
5597 | } | 5760 | } |
5598 | 5761 | ||
5599 | /* | 5762 | /* |
5600 | * Return a high guess at the load of a migration-target cpu weighted | 5763 | * Return a high guess at the load of a migration-target CPU weighted |
5601 | * according to the scheduling class and "nice" value. | 5764 | * according to the scheduling class and "nice" value. |
5602 | */ | 5765 | */ |
5603 | static unsigned long target_load(int cpu, int type) | 5766 | static unsigned long target_load(int cpu, int type) |
@@ -5724,7 +5887,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5724 | unsigned long task_load; | 5887 | unsigned long task_load; |
5725 | 5888 | ||
5726 | this_eff_load = target_load(this_cpu, sd->wake_idx); | 5889 | this_eff_load = target_load(this_cpu, sd->wake_idx); |
5727 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
5728 | 5890 | ||
5729 | if (sync) { | 5891 | if (sync) { |
5730 | unsigned long current_load = task_h_load(current); | 5892 | unsigned long current_load = task_h_load(current); |
@@ -5742,18 +5904,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5742 | this_eff_load *= 100; | 5904 | this_eff_load *= 100; |
5743 | this_eff_load *= capacity_of(prev_cpu); | 5905 | this_eff_load *= capacity_of(prev_cpu); |
5744 | 5906 | ||
5907 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
5745 | prev_eff_load -= task_load; | 5908 | prev_eff_load -= task_load; |
5746 | if (sched_feat(WA_BIAS)) | 5909 | if (sched_feat(WA_BIAS)) |
5747 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | 5910 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; |
5748 | prev_eff_load *= capacity_of(this_cpu); | 5911 | prev_eff_load *= capacity_of(this_cpu); |
5749 | 5912 | ||
5750 | return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; | 5913 | /* |
5914 | * If sync, adjust the weight of prev_eff_load such that if | ||
5915 | * prev_eff == this_eff that select_idle_sibling() will consider | ||
5916 | * stacking the wakee on top of the waker if no other CPU is | ||
5917 | * idle. | ||
5918 | */ | ||
5919 | if (sync) | ||
5920 | prev_eff_load += 1; | ||
5921 | |||
5922 | return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; | ||
5923 | } | ||
5924 | |||
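With the '+1' bias a sync wakeup wins an exact tie, so the wakee is stacked on the waker's CPU, while the switch from '<=' to '<' keeps non-sync ties on the previous CPU. A tiny sketch of just that final comparison:

	#include <stdbool.h>
	#include <stdio.h>

	#define NR_CPUMASK_BITS 64   /* stand-in for nr_cpumask_bits ("no target") */

	/* Mirror of the final comparison in wake_affine_weight() after this change. */
	static int wake_affine_pick(unsigned long this_eff_load,
				    unsigned long prev_eff_load,
				    bool sync, int this_cpu)
	{
		if (sync)
			prev_eff_load += 1;

		return this_eff_load < prev_eff_load ? this_cpu : NR_CPUMASK_BITS;
	}

	int main(void)
	{
		/* Equal effective loads: only the sync wakeup is pulled to this_cpu. */
		printf("tie, sync  -> %d\n", wake_affine_pick(100, 100, true, 3));
		printf("tie, !sync -> %d\n", wake_affine_pick(100, 100, false, 3));
		return 0;
	}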
5925 | #ifdef CONFIG_NUMA_BALANCING | ||
5926 | static void | ||
5927 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
5928 | { | ||
5929 | unsigned long interval; | ||
5930 | |||
5931 | if (!static_branch_likely(&sched_numa_balancing)) | ||
5932 | return; | ||
5933 | |||
5934 | /* If balancing has no preference then continue gathering data */ | ||
5935 | if (p->numa_preferred_nid == -1) | ||
5936 | return; | ||
5937 | |||
5938 | /* | ||
5939 | * If the wakeup is not affecting locality then it is neutral from | ||
5940 | * the perspective of NUMA balancing, so continue gathering data. | ||
5941 | */ | ||
5942 | if (cpu_to_node(prev_cpu) == cpu_to_node(target)) | ||
5943 | return; | ||
5944 | |||
5945 | /* | ||
5946 | * Temporarily prevent NUMA balancing trying to place waker/wakee after | ||
5947 | * wakee has been moved by wake_affine. This will potentially allow | ||
5948 | * related tasks to converge and update their data placement. The | ||
5949 | * 4 * numa_scan_period is to allow the two-pass filter to migrate | ||
5950 | * hot data to the waker's node. | ||
5951 | */ | ||
5952 | interval = max(sysctl_numa_balancing_scan_delay, | ||
5953 | p->numa_scan_period << 2); | ||
5954 | p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
5955 | |||
5956 | interval = max(sysctl_numa_balancing_scan_delay, | ||
5957 | current->numa_scan_period << 2); | ||
5958 | current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
5751 | } | 5959 | } |
5960 | #else | ||
5961 | static void | ||
5962 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
5963 | { | ||
5964 | } | ||
5965 | #endif | ||
5752 | 5966 | ||
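The backoff installed by update_wa_numa_placement() is simply the larger of the NUMA scan delay and four scan periods, converted to jiffies and applied to both waker and wakee. A small sketch of that interval computation; the 1000ms scan-delay default used below is an assumption for illustration:

	#include <stdio.h>

	/* max(sysctl_numa_balancing_scan_delay, numa_scan_period << 2), in ms. */
	static unsigned int wa_numa_backoff_ms(unsigned int scan_delay_ms,
					       unsigned int scan_period_ms)
	{
		unsigned int interval = scan_period_ms << 2;    /* 4 * scan period */

		return interval > scan_delay_ms ? interval : scan_delay_ms;
	}

	int main(void)
	{
		/* Short scan period: the scan delay dominates the backoff. */
		printf("%u ms\n", wa_numa_backoff_ms(1000, 250));
		/* Long scan period: 4 * 2000ms = 8000ms backoff. */
		printf("%u ms\n", wa_numa_backoff_ms(1000, 2000));
		return 0;
	}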
5753 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, | 5967 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
5754 | int prev_cpu, int sync) | 5968 | int this_cpu, int prev_cpu, int sync) |
5755 | { | 5969 | { |
5756 | int this_cpu = smp_processor_id(); | ||
5757 | int target = nr_cpumask_bits; | 5970 | int target = nr_cpumask_bits; |
5758 | 5971 | ||
5759 | if (sched_feat(WA_IDLE)) | 5972 | if (sched_feat(WA_IDLE)) |
@@ -5766,12 +5979,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, | |||
5766 | if (target == nr_cpumask_bits) | 5979 | if (target == nr_cpumask_bits) |
5767 | return prev_cpu; | 5980 | return prev_cpu; |
5768 | 5981 | ||
5982 | update_wa_numa_placement(p, prev_cpu, target); | ||
5769 | schedstat_inc(sd->ttwu_move_affine); | 5983 | schedstat_inc(sd->ttwu_move_affine); |
5770 | schedstat_inc(p->se.statistics.nr_wakeups_affine); | 5984 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
5771 | return target; | 5985 | return target; |
5772 | } | 5986 | } |
5773 | 5987 | ||
5774 | static inline unsigned long task_util(struct task_struct *p); | ||
5775 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p); | 5988 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p); |
5776 | 5989 | ||
5777 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) | 5990 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) |
@@ -5826,7 +6039,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5826 | max_spare_cap = 0; | 6039 | max_spare_cap = 0; |
5827 | 6040 | ||
5828 | for_each_cpu(i, sched_group_span(group)) { | 6041 | for_each_cpu(i, sched_group_span(group)) { |
5829 | /* Bias balancing toward cpus of our domain */ | 6042 | /* Bias balancing toward CPUs of our domain */ |
5830 | if (local_group) | 6043 | if (local_group) |
5831 | load = source_load(i, load_idx); | 6044 | load = source_load(i, load_idx); |
5832 | else | 6045 | else |
@@ -5856,7 +6069,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5856 | if (min_runnable_load > (runnable_load + imbalance)) { | 6069 | if (min_runnable_load > (runnable_load + imbalance)) { |
5857 | /* | 6070 | /* |
5858 | * The runnable load is significantly smaller | 6071 | * The runnable load is significantly smaller |
5859 | * so we can pick this new cpu | 6072 | * so we can pick this new CPU: |
5860 | */ | 6073 | */ |
5861 | min_runnable_load = runnable_load; | 6074 | min_runnable_load = runnable_load; |
5862 | min_avg_load = avg_load; | 6075 | min_avg_load = avg_load; |
@@ -5865,7 +6078,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5865 | (100*min_avg_load > imbalance_scale*avg_load)) { | 6078 | (100*min_avg_load > imbalance_scale*avg_load)) { |
5866 | /* | 6079 | /* |
5867 | * The runnable loads are close so take the | 6080 | * The runnable loads are close so take the |
5868 | * blocked load into account through avg_load. | 6081 | * blocked load into account through avg_load: |
5869 | */ | 6082 | */ |
5870 | min_avg_load = avg_load; | 6083 | min_avg_load = avg_load; |
5871 | idlest = group; | 6084 | idlest = group; |
@@ -5903,6 +6116,18 @@ skip_spare: | |||
5903 | if (!idlest) | 6116 | if (!idlest) |
5904 | return NULL; | 6117 | return NULL; |
5905 | 6118 | ||
6119 | /* | ||
6120 | * When comparing groups across NUMA domains, it's possible for the | ||
6121 | * local domain to be very lightly loaded relative to the remote | ||
6122 | * domains but "imbalance" skews the comparison making remote CPUs | ||
6123 | * look much more favourable. When considering cross-domain, add | ||
6124 | * imbalance to the runnable load on the remote node and consider | ||
6125 | * staying local. | ||
6126 | */ | ||
6127 | if ((sd->flags & SD_NUMA) && | ||
6128 | min_runnable_load + imbalance >= this_runnable_load) | ||
6129 | return NULL; | ||
6130 | |||
5906 | if (min_runnable_load > (this_runnable_load + imbalance)) | 6131 | if (min_runnable_load > (this_runnable_load + imbalance)) |
5907 | return NULL; | 6132 | return NULL; |
5908 | 6133 | ||
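For NUMA domains the new check charges the best remote group the usual 'imbalance' allowance before comparing it with the local runnable load, so a lightly loaded local node keeps the task. A small numeric sketch of that comparison (returning NULL in the kernel corresponds to "stay local" here):

	#include <stdbool.h>
	#include <stdio.h>

	/* Mirror of: min_runnable_load + imbalance >= this_runnable_load */
	static bool stay_local(unsigned long min_remote_runnable_load,
			       unsigned long this_runnable_load,
			       unsigned long imbalance)
	{
		return min_remote_runnable_load + imbalance >= this_runnable_load;
	}

	int main(void)
	{
		/* Remote 80 + 25 >= local 90: keep the task on the local node. */
		printf("%d\n", stay_local(80, 90, 25));
		/* Remote clearly wins even with the penalty: 40 + 25 < 90. */
		printf("%d\n", stay_local(40, 90, 25));
		return 0;
	}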
@@ -5914,7 +6139,7 @@ skip_spare: | |||
5914 | } | 6139 | } |
5915 | 6140 | ||
5916 | /* | 6141 | /* |
5917 | * find_idlest_group_cpu - find the idlest cpu among the cpus in group. | 6142 | * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. |
5918 | */ | 6143 | */ |
5919 | static int | 6144 | static int |
5920 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 6145 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
@@ -5992,12 +6217,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
5992 | 6217 | ||
5993 | new_cpu = find_idlest_group_cpu(group, p, cpu); | 6218 | new_cpu = find_idlest_group_cpu(group, p, cpu); |
5994 | if (new_cpu == cpu) { | 6219 | if (new_cpu == cpu) { |
5995 | /* Now try balancing at a lower domain level of cpu */ | 6220 | /* Now try balancing at a lower domain level of 'cpu': */ |
5996 | sd = sd->child; | 6221 | sd = sd->child; |
5997 | continue; | 6222 | continue; |
5998 | } | 6223 | } |
5999 | 6224 | ||
6000 | /* Now try balancing at a lower domain level of new_cpu */ | 6225 | /* Now try balancing at a lower domain level of 'new_cpu': */ |
6001 | cpu = new_cpu; | 6226 | cpu = new_cpu; |
6002 | weight = sd->span_weight; | 6227 | weight = sd->span_weight; |
6003 | sd = NULL; | 6228 | sd = NULL; |
@@ -6007,7 +6232,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
6007 | if (tmp->flags & sd_flag) | 6232 | if (tmp->flags & sd_flag) |
6008 | sd = tmp; | 6233 | sd = tmp; |
6009 | } | 6234 | } |
6010 | /* while loop will break here if sd == NULL */ | ||
6011 | } | 6235 | } |
6012 | 6236 | ||
6013 | return new_cpu; | 6237 | return new_cpu; |
@@ -6203,12 +6427,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6203 | return target; | 6427 | return target; |
6204 | 6428 | ||
6205 | /* | 6429 | /* |
6206 | * If the previous cpu is cache affine and idle, don't be stupid. | 6430 | * If the previous CPU is cache affine and idle, don't be stupid: |
6207 | */ | 6431 | */ |
6208 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) | 6432 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
6209 | return prev; | 6433 | return prev; |
6210 | 6434 | ||
6211 | /* Check a recently used CPU as a potential idle candidate */ | 6435 | /* Check a recently used CPU as a potential idle candidate: */ |
6212 | recent_used_cpu = p->recent_used_cpu; | 6436 | recent_used_cpu = p->recent_used_cpu; |
6213 | if (recent_used_cpu != prev && | 6437 | if (recent_used_cpu != prev && |
6214 | recent_used_cpu != target && | 6438 | recent_used_cpu != target && |
@@ -6217,7 +6441,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6217 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { | 6441 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { |
6218 | /* | 6442 | /* |
6219 | * Replace recent_used_cpu with prev as it is a potential | 6443 | * Replace recent_used_cpu with prev as it is a potential |
6220 | * candidate for the next wake. | 6444 | * candidate for the next wake: |
6221 | */ | 6445 | */ |
6222 | p->recent_used_cpu = prev; | 6446 | p->recent_used_cpu = prev; |
6223 | return recent_used_cpu; | 6447 | return recent_used_cpu; |
@@ -6242,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6242 | return target; | 6466 | return target; |
6243 | } | 6467 | } |
6244 | 6468 | ||
6245 | /* | 6469 | /** |
6246 | * cpu_util returns the amount of capacity of a CPU that is used by CFS | 6470 | * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks |
6247 | * tasks. The unit of the return value must be the one of capacity so we can | 6471 | * @cpu: the CPU to get the utilization of |
6248 | * compare the utilization with the capacity of the CPU that is available for | 6472 | * |
6249 | * CFS task (ie cpu_capacity). | 6473 | * The unit of the return value must be the one of capacity so we can compare |
6474 | * the utilization with the capacity of the CPU that is available for CFS task | ||
6475 | * (ie cpu_capacity). | ||
6250 | * | 6476 | * |
6251 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the | 6477 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the |
6252 | * recent utilization of currently non-runnable tasks on a CPU. It represents | 6478 | * recent utilization of currently non-runnable tasks on a CPU. It represents |
@@ -6257,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6257 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is | 6483 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is |
6258 | * the running time on this CPU scaled by capacity_curr. | 6484 | * the running time on this CPU scaled by capacity_curr. |
6259 | * | 6485 | * |
6486 | * The estimated utilization of a CPU is defined to be the maximum between its | ||
6487 | * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks | ||
6488 | * currently RUNNABLE on that CPU. | ||
6489 | * This allows us to properly represent the expected utilization of a CPU which | ||
6490 | * has just got a big task running after a long sleep period. At the same time | ||
6491 | * however it preserves the benefits of the "blocked utilization" in | ||
6492 | * describing the potential for other tasks waking up on the same CPU. | ||
6493 | * | ||
6260 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even | 6494 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even |
6261 | * higher than capacity_orig because of unfortunate rounding in | 6495 | * higher than capacity_orig because of unfortunate rounding in |
6262 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until | 6496 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until |
@@ -6267,36 +6501,77 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6267 | * available capacity. We allow utilization to overshoot capacity_curr (but not | 6501 | * available capacity. We allow utilization to overshoot capacity_curr (but not |
6268 | * capacity_orig) as it is useful for predicting the capacity required after task | 6502 | * capacity_orig) as it is useful for predicting the capacity required after task |
6269 | * migrations (scheduler-driven DVFS). | 6503 | * migrations (scheduler-driven DVFS). |
6504 | * | ||
6505 | * Return: the (estimated) utilization for the specified CPU | ||
6270 | */ | 6506 | */ |
6271 | static unsigned long cpu_util(int cpu) | 6507 | static inline unsigned long cpu_util(int cpu) |
6272 | { | 6508 | { |
6273 | unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; | 6509 | struct cfs_rq *cfs_rq; |
6274 | unsigned long capacity = capacity_orig_of(cpu); | 6510 | unsigned int util; |
6275 | 6511 | ||
6276 | return (util >= capacity) ? capacity : util; | 6512 | cfs_rq = &cpu_rq(cpu)->cfs; |
6277 | } | 6513 | util = READ_ONCE(cfs_rq->avg.util_avg); |
6278 | 6514 | ||
6279 | static inline unsigned long task_util(struct task_struct *p) | 6515 | if (sched_feat(UTIL_EST)) |
6280 | { | 6516 | util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); |
6281 | return p->se.avg.util_avg; | 6517 | |
6518 | return min_t(unsigned long, util, capacity_orig_of(cpu)); | ||
6282 | } | 6519 | } |
6283 | 6520 | ||
6284 | /* | 6521 | /* |
6285 | * cpu_util_wake: Compute cpu utilization with any contributions from | 6522 | * cpu_util_wake: Compute CPU utilization with any contributions from |
6286 | * the waking task p removed. | 6523 | * the waking task p removed. |
6287 | */ | 6524 | */ |
6288 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) | 6525 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) |
6289 | { | 6526 | { |
6290 | unsigned long util, capacity; | 6527 | struct cfs_rq *cfs_rq; |
6528 | unsigned int util; | ||
6291 | 6529 | ||
6292 | /* Task has no contribution or is new */ | 6530 | /* Task has no contribution or is new */ |
6293 | if (cpu != task_cpu(p) || !p->se.avg.last_update_time) | 6531 | if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) |
6294 | return cpu_util(cpu); | 6532 | return cpu_util(cpu); |
6295 | 6533 | ||
6296 | capacity = capacity_orig_of(cpu); | 6534 | cfs_rq = &cpu_rq(cpu)->cfs; |
6297 | util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); | 6535 | util = READ_ONCE(cfs_rq->avg.util_avg); |
6298 | 6536 | ||
6299 | return (util >= capacity) ? capacity : util; | 6537 | /* Discount task's blocked util from CPU's util */ |
6538 | util -= min_t(unsigned int, util, task_util(p)); | ||
6539 | |||
6540 | /* | ||
6541 | * Covered cases: | ||
6542 | * | ||
6543 | * a) if *p is the only task sleeping on this CPU, then: | ||
6544 | * cpu_util (== task_util) > util_est (== 0) | ||
6545 | * and thus we return: | ||
6546 | * cpu_util_wake = (cpu_util - task_util) = 0 | ||
6547 | * | ||
6548 | * b) if other tasks are SLEEPING on this CPU, which is now exiting | ||
6549 | * IDLE, then: | ||
6550 | * cpu_util >= task_util | ||
6551 | * cpu_util > util_est (== 0) | ||
6552 | * and thus we discount *p's blocked utilization to return: | ||
6553 | * cpu_util_wake = (cpu_util - task_util) >= 0 | ||
6554 | * | ||
6555 | * c) if other tasks are RUNNABLE on that CPU and | ||
6556 | * util_est > cpu_util | ||
6557 | * then we use util_est since it returns a more restrictive | ||
6558 | * estimation of the spare capacity on that CPU, by just | ||
6559 | * considering the expected utilization of tasks already | ||
6560 | * runnable on that CPU. | ||
6561 | * | ||
6562 | * Cases a) and b) are covered by the above code, while case c) is | ||
6563 | * covered by the following code when estimated utilization is | ||
6564 | * enabled. | ||
6565 | */ | ||
6566 | if (sched_feat(UTIL_EST)) | ||
6567 | util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); | ||
6568 | |||
6569 | /* | ||
6570 | * Utilization (estimated) can exceed the CPU capacity, thus let's | ||
6571 | * clamp to the maximum CPU capacity to ensure consistency with | ||
6572 | * the cpu_util call. | ||
6573 | */ | ||
6574 | return min_t(unsigned long, util, capacity_orig_of(cpu)); | ||
6300 | } | 6575 | } |
6301 | 6576 | ||
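Put together, the UTIL_EST variants of cpu_util() and cpu_util_wake() reduce to a max of the PELT average and the enqueued estimate, clamped to capacity_orig, with cpu_util_wake() first discounting the waking task's blocked contribution. A user-space sketch covering cases a) and c) from the comment above; all values are illustrative:

	#include <stdio.h>

	static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
	static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

	/* cpu_util(): max(util_avg, util_est.enqueued), clamped to capacity_orig. */
	static unsigned long cpu_util_sketch(unsigned long util_avg,
					     unsigned long util_est_enqueued,
					     unsigned long capacity_orig)
	{
		return min_ul(max_ul(util_avg, util_est_enqueued), capacity_orig);
	}

	/* cpu_util_wake(): same, after discounting the waking task's blocked util. */
	static unsigned long cpu_util_wake_sketch(unsigned long util_avg,
						  unsigned long util_est_enqueued,
						  unsigned long task_util,
						  unsigned long capacity_orig)
	{
		unsigned long util = util_avg - min_ul(util_avg, task_util);

		return min_ul(max_ul(util, util_est_enqueued), capacity_orig);
	}

	int main(void)
	{
		/* Case a): p is the only (sleeping) task -> no residual charge. */
		printf("a) %lu\n", cpu_util_wake_sketch(300, 0, 300, 1024));
		/* Case c): runnable tasks keep util_est high -> the estimate wins. */
		printf("c) %lu\n", cpu_util_wake_sketch(200, 600, 150, 1024));
		/* Plain cpu_util(): a big task just woke after a long sleep. */
		printf("   %lu\n", cpu_util_sketch(100, 700, 1024));
		return 0;
	}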
6302 | /* | 6577 | /* |
@@ -6328,10 +6603,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | |||
6328 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 6603 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
6329 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. | 6604 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |
6330 | * | 6605 | * |
6331 | * Balances load by selecting the idlest cpu in the idlest group, or under | 6606 | * Balances load by selecting the idlest CPU in the idlest group, or under |
6332 | * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. | 6607 | * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. |
6333 | * | 6608 | * |
6334 | * Returns the target cpu number. | 6609 | * Returns the target CPU number. |
6335 | * | 6610 | * |
6336 | * preempt must be disabled. | 6611 | * preempt must be disabled. |
6337 | */ | 6612 | */ |
@@ -6342,7 +6617,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6342 | int cpu = smp_processor_id(); | 6617 | int cpu = smp_processor_id(); |
6343 | int new_cpu = prev_cpu; | 6618 | int new_cpu = prev_cpu; |
6344 | int want_affine = 0; | 6619 | int want_affine = 0; |
6345 | int sync = wake_flags & WF_SYNC; | 6620 | int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); |
6346 | 6621 | ||
6347 | if (sd_flag & SD_BALANCE_WAKE) { | 6622 | if (sd_flag & SD_BALANCE_WAKE) { |
6348 | record_wakee(p); | 6623 | record_wakee(p); |
@@ -6356,7 +6631,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6356 | break; | 6631 | break; |
6357 | 6632 | ||
6358 | /* | 6633 | /* |
6359 | * If both cpu and prev_cpu are part of this domain, | 6634 | * If both 'cpu' and 'prev_cpu' are part of this domain, |
6360 | * cpu is a valid SD_WAKE_AFFINE target. | 6635 | * cpu is a valid SD_WAKE_AFFINE target. |
6361 | */ | 6636 | */ |
6362 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 6637 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
@@ -6376,7 +6651,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6376 | if (cpu == prev_cpu) | 6651 | if (cpu == prev_cpu) |
6377 | goto pick_cpu; | 6652 | goto pick_cpu; |
6378 | 6653 | ||
6379 | new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); | 6654 | new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); |
6380 | } | 6655 | } |
6381 | 6656 | ||
6382 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { | 6657 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { |
@@ -6407,9 +6682,9 @@ pick_cpu: | |||
6407 | static void detach_entity_cfs_rq(struct sched_entity *se); | 6682 | static void detach_entity_cfs_rq(struct sched_entity *se); |
6408 | 6683 | ||
6409 | /* | 6684 | /* |
6410 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 6685 | * Called immediately before a task is migrated to a new CPU; task_cpu(p) and |
6411 | * cfs_rq_of(p) references at time of call are still valid and identify the | 6686 | * cfs_rq_of(p) references at time of call are still valid and identify the |
6412 | * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. | 6687 | * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. |
6413 | */ | 6688 | */ |
6414 | static void migrate_task_rq_fair(struct task_struct *p) | 6689 | static void migrate_task_rq_fair(struct task_struct *p) |
6415 | { | 6690 | { |
@@ -6738,7 +7013,7 @@ simple: | |||
6738 | 7013 | ||
6739 | p = task_of(se); | 7014 | p = task_of(se); |
6740 | 7015 | ||
6741 | done: __maybe_unused | 7016 | done: __maybe_unused; |
6742 | #ifdef CONFIG_SMP | 7017 | #ifdef CONFIG_SMP |
6743 | /* | 7018 | /* |
6744 | * Move the next running task to the front of | 7019 | * Move the next running task to the front of |
@@ -6843,17 +7118,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6843 | * BASICS | 7118 | * BASICS |
6844 | * | 7119 | * |
6845 | * The purpose of load-balancing is to achieve the same basic fairness the | 7120 | * The purpose of load-balancing is to achieve the same basic fairness the |
6846 | * per-cpu scheduler provides, namely provide a proportional amount of compute | 7121 | * per-CPU scheduler provides, namely provide a proportional amount of compute |
6847 | * time to each task. This is expressed in the following equation: | 7122 | * time to each task. This is expressed in the following equation: |
6848 | * | 7123 | * |
6849 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | 7124 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) |
6850 | * | 7125 | * |
6851 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | 7126 | * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight |
6852 | * W_i,0 is defined as: | 7127 | * W_i,0 is defined as: |
6853 | * | 7128 | * |
6854 | * W_i,0 = \Sum_j w_i,j (2) | 7129 | * W_i,0 = \Sum_j w_i,j (2) |
6855 | * | 7130 | * |
6856 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | 7131 | * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight |
6857 | * is derived from the nice value as per sched_prio_to_weight[]. | 7132 | * is derived from the nice value as per sched_prio_to_weight[]. |
6858 | * | 7133 | * |
6859 | * The weight average is an exponential decay average of the instantaneous | 7134 | * The weight average is an exponential decay average of the instantaneous |
@@ -6861,7 +7136,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6861 | * | 7136 | * |
6862 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | 7137 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) |
6863 | * | 7138 | * |
6864 | * C_i is the compute capacity of cpu i, typically it is the | 7139 | * C_i is the compute capacity of CPU i, typically it is the |
6865 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 7140 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it |
6866 | * can also include other factors [XXX]. | 7141 | * can also include other factors [XXX]. |
6867 | * | 7142 | * |
@@ -6882,11 +7157,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6882 | * SCHED DOMAINS | 7157 | * SCHED DOMAINS |
6883 | * | 7158 | * |
6884 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | 7159 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) |
6885 | * for all i,j solution, we create a tree of cpus that follows the hardware | 7160 | * for all i,j solution, we create a tree of CPUs that follows the hardware |
6886 | * topology where each level pairs two lower groups (or better). This results | 7161 | * topology where each level pairs two lower groups (or better). This results |
6887 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | 7162 | * in O(log n) layers. Furthermore we reduce the number of CPUs going up the |
6888 | * tree to only the first of the previous level and we decrease the frequency | 7163 | * tree to only the first of the previous level and we decrease the frequency |
6889 | * of load-balance at each level inv. proportional to the number of cpus in | 7164 | * of load-balance at each level inv. proportional to the number of CPUs in |
6890 | * the groups. | 7165 | * the groups. |
6891 | * | 7166 | * |
6892 | * This yields: | 7167 | * This yields: |
@@ -6895,7 +7170,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6895 | * \Sum { --- * --- * 2^i } = O(n) (5) | 7170 | * \Sum { --- * --- * 2^i } = O(n) (5) |
6896 | * i = 0 2^i 2^i | 7171 | * i = 0 2^i 2^i |
6897 | * `- size of each group | 7172 | * `- size of each group |
6898 | * | | `- number of cpus doing load-balance | 7173 | * | | `- number of CPUs doing load-balance |
6899 | * | `- freq | 7174 | * | `- freq |
6900 | * `- sum over all levels | 7175 | * `- sum over all levels |
6901 | * | 7176 | * |
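Reading equation (5) term by term (1/2^i balance frequency, n/2^i CPUs doing the balancing, groups of size 2^i), the O(n) bound follows from a geometric series; spelled out in LaTeX:

	\sum_{i=0}^{\log_2 n} \frac{1}{2^i} \cdot \frac{n}{2^i} \cdot 2^i
	    = n \sum_{i=0}^{\log_2 n} \frac{1}{2^i}
	    < 2n = O(n)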
@@ -6903,7 +7178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6903 | * this makes (5) the runtime complexity of the balancer. | 7178 | * this makes (5) the runtime complexity of the balancer. |
6904 | * | 7179 | * |
6905 | * An important property here is that each CPU is still (indirectly) connected | 7180 | * An important property here is that each CPU is still (indirectly) connected |
6906 | * to every other cpu in at most O(log n) steps: | 7181 | * to every other CPU in at most O(log n) steps: |
6907 | * | 7182 | * |
6908 | * The adjacency matrix of the resulting graph is given by: | 7183 | * The adjacency matrix of the resulting graph is given by: |
6909 | * | 7184 | * |
@@ -6915,7 +7190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6915 | * | 7190 | * |
6916 | * A^(log_2 n)_i,j != 0 for all i,j (7) | 7191 | * A^(log_2 n)_i,j != 0 for all i,j (7) |
6917 | * | 7192 | * |
6918 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | 7193 | * Showing there's indeed a path between every CPU in at most O(log n) steps. |
6919 | * The task movement gives a factor of O(m), giving a convergence complexity | 7194 | * The task movement gives a factor of O(m), giving a convergence complexity |
6920 | * of: | 7195 | * of: |
6921 | * | 7196 | * |
@@ -6925,7 +7200,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6925 | * WORK CONSERVING | 7200 | * WORK CONSERVING |
6926 | * | 7201 | * |
6927 | * In order to avoid CPUs going idle while there's still work to do, new idle | 7202 | * In order to avoid CPUs going idle while there's still work to do, new idle |
6928 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | 7203 | * balancing is more aggressive and has the newly idle CPU iterate up the domain |
6929 | * tree itself instead of relying on other CPUs to bring it work. | 7204 | * tree itself instead of relying on other CPUs to bring it work. |
6930 | * | 7205 | * |
6931 | * This adds some complexity to both (5) and (8) but it reduces the total idle | 7206 | * This adds some complexity to both (5) and (8) but it reduces the total idle |
@@ -6946,7 +7221,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6946 | * | 7221 | * |
6947 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | 7222 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) |
6948 | * | 7223 | * |
6949 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | 7224 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. |
6950 | * | 7225 | * |
6951 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) | 7226 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) |
6952 | * property. | 7227 | * property. |
@@ -6963,6 +7238,8 @@ enum fbq_type { regular, remote, all }; | |||
6963 | #define LBF_NEED_BREAK 0x02 | 7238 | #define LBF_NEED_BREAK 0x02 |
6964 | #define LBF_DST_PINNED 0x04 | 7239 | #define LBF_DST_PINNED 0x04 |
6965 | #define LBF_SOME_PINNED 0x08 | 7240 | #define LBF_SOME_PINNED 0x08 |
7241 | #define LBF_NOHZ_STATS 0x10 | ||
7242 | #define LBF_NOHZ_AGAIN 0x20 | ||
6966 | 7243 | ||
6967 | struct lb_env { | 7244 | struct lb_env { |
6968 | struct sched_domain *sd; | 7245 | struct sched_domain *sd; |
@@ -7110,7 +7387,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7110 | env->flags |= LBF_SOME_PINNED; | 7387 | env->flags |= LBF_SOME_PINNED; |
7111 | 7388 | ||
7112 | /* | 7389 | /* |
7113 | * Remember if this task can be migrated to any other cpu in | 7390 | * Remember if this task can be migrated to any other CPU in |
7114 | * our sched_group. We may want to revisit it if we couldn't | 7391 | * our sched_group. We may want to revisit it if we couldn't |
7115 | * meet load balance goals by pulling other tasks on src_cpu. | 7392 | * meet load balance goals by pulling other tasks on src_cpu. |
7116 | * | 7393 | * |
@@ -7120,7 +7397,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7120 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) | 7397 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) |
7121 | return 0; | 7398 | return 0; |
7122 | 7399 | ||
7123 | /* Prevent to re-select dst_cpu via env's cpus */ | 7400 | /* Avoid re-selecting dst_cpu via env's CPUs: */ |
7124 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 7401 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
7125 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | 7402 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { |
7126 | env->flags |= LBF_DST_PINNED; | 7403 | env->flags |= LBF_DST_PINNED; |
@@ -7347,6 +7624,17 @@ static void attach_tasks(struct lb_env *env) | |||
7347 | rq_unlock(env->dst_rq, &rf); | 7624 | rq_unlock(env->dst_rq, &rf); |
7348 | } | 7625 | } |
7349 | 7626 | ||
7627 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) | ||
7628 | { | ||
7629 | if (cfs_rq->avg.load_avg) | ||
7630 | return true; | ||
7631 | |||
7632 | if (cfs_rq->avg.util_avg) | ||
7633 | return true; | ||
7634 | |||
7635 | return false; | ||
7636 | } | ||
7637 | |||
7350 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7638 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7351 | 7639 | ||
7352 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | 7640 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) |
@@ -7371,6 +7659,7 @@ static void update_blocked_averages(int cpu) | |||
7371 | struct rq *rq = cpu_rq(cpu); | 7659 | struct rq *rq = cpu_rq(cpu); |
7372 | struct cfs_rq *cfs_rq, *pos; | 7660 | struct cfs_rq *cfs_rq, *pos; |
7373 | struct rq_flags rf; | 7661 | struct rq_flags rf; |
7662 | bool done = true; | ||
7374 | 7663 | ||
7375 | rq_lock_irqsave(rq, &rf); | 7664 | rq_lock_irqsave(rq, &rf); |
7376 | update_rq_clock(rq); | 7665 | update_rq_clock(rq); |
@@ -7400,7 +7689,17 @@ static void update_blocked_averages(int cpu) | |||
7400 | */ | 7689 | */ |
7401 | if (cfs_rq_is_decayed(cfs_rq)) | 7690 | if (cfs_rq_is_decayed(cfs_rq)) |
7402 | list_del_leaf_cfs_rq(cfs_rq); | 7691 | list_del_leaf_cfs_rq(cfs_rq); |
7692 | |||
7693 | /* Don't need periodic decay once load/util_avg are null */ | ||
7694 | if (cfs_rq_has_blocked(cfs_rq)) | ||
7695 | done = false; | ||
7403 | } | 7696 | } |
7697 | |||
7698 | #ifdef CONFIG_NO_HZ_COMMON | ||
7699 | rq->last_blocked_load_update_tick = jiffies; | ||
7700 | if (done) | ||
7701 | rq->has_blocked_load = 0; | ||
7702 | #endif | ||
7404 | rq_unlock_irqrestore(rq, &rf); | 7703 | rq_unlock_irqrestore(rq, &rf); |
7405 | } | 7704 | } |
7406 | 7705 | ||
@@ -7460,6 +7759,11 @@ static inline void update_blocked_averages(int cpu) | |||
7460 | rq_lock_irqsave(rq, &rf); | 7759 | rq_lock_irqsave(rq, &rf); |
7461 | update_rq_clock(rq); | 7760 | update_rq_clock(rq); |
7462 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); | 7761 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); |
7762 | #ifdef CONFIG_NO_HZ_COMMON | ||
7763 | rq->last_blocked_load_update_tick = jiffies; | ||
7764 | if (!cfs_rq_has_blocked(cfs_rq)) | ||
7765 | rq->has_blocked_load = 0; | ||
7766 | #endif | ||
7463 | rq_unlock_irqrestore(rq, &rf); | 7767 | rq_unlock_irqrestore(rq, &rf); |
7464 | } | 7768 | } |
7465 | 7769 | ||
@@ -7694,8 +7998,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
7694 | * Group imbalance indicates (and tries to solve) the problem where balancing | 7998 | * Group imbalance indicates (and tries to solve) the problem where balancing |
7695 | * groups is inadequate due to ->cpus_allowed constraints. | 7999 | * groups is inadequate due to ->cpus_allowed constraints. |
7696 | * | 8000 | * |
7697 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | 8001 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a |
7698 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | 8002 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. |
7699 | * Something like: | 8003 | * Something like: |
7700 | * | 8004 | * |
7701 | * { 0 1 2 3 } { 4 5 6 7 } | 8005 | * { 0 1 2 3 } { 4 5 6 7 } |
@@ -7703,7 +8007,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
7703 | * | 8007 | * |
7704 | * If we were to balance group-wise we'd place two tasks in the first group and | 8008 | * If we were to balance group-wise we'd place two tasks in the first group and |
7705 | * two tasks in the second group. Clearly this is undesired as it will overload | 8009 | * two tasks in the second group. Clearly this is undesired as it will overload |
7706 | * cpu 3 and leave one of the cpus in the second group unused. | 8010 | * cpu 3 and leave one of the CPUs in the second group unused. |
7707 | * | 8011 | * |
7708 | * The current solution to this issue is detecting the skew in the first group | 8012 | * The current solution to this issue is detecting the skew in the first group |
7709 | * by noticing the lower domain failed to reach balance and had difficulty | 8013 | * by noticing the lower domain failed to reach balance and had difficulty |
@@ -7794,6 +8098,28 @@ group_type group_classify(struct sched_group *group, | |||
7794 | return group_other; | 8098 | return group_other; |
7795 | } | 8099 | } |
7796 | 8100 | ||
8101 | static bool update_nohz_stats(struct rq *rq, bool force) | ||
8102 | { | ||
8103 | #ifdef CONFIG_NO_HZ_COMMON | ||
8104 | unsigned int cpu = rq->cpu; | ||
8105 | |||
8106 | if (!rq->has_blocked_load) | ||
8107 | return false; | ||
8108 | |||
8109 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
8110 | return false; | ||
8111 | |||
8112 | if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick)) | ||
8113 | return true; | ||
8114 | |||
8115 | update_blocked_averages(cpu); | ||
8116 | |||
8117 | return rq->has_blocked_load; | ||
8118 | #else | ||
8119 | return false; | ||
8120 | #endif | ||
8121 | } | ||
8122 | |||
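update_nohz_stats() throttles how often a remote idle CPU's blocked load is refreshed: nothing happens if the rq has no blocked load, and non-forced callers are rate-limited by the last update tick. A compilable sketch of that throttle, with a wrap-safe comparison written out the way the kernel's time_after() macro behaves; the names are stand-ins, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

/* Wrap-safe "is a after b?", matching how the kernel's time_after() behaves. */
static bool time_after_model(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

struct toy_rq {
	bool has_blocked_load;
	unsigned long last_blocked_load_update_tick;
};

static bool update_nohz_stats_model(struct toy_rq *rq, unsigned long now, bool force)
{
	if (!rq->has_blocked_load)
		return false;				/* nothing left to decay */

	if (!force && !time_after_model(now, rq->last_blocked_load_update_tick))
		return true;				/* updated recently: skip, still dirty */

	/* pretend one pass fully decays the blocked load */
	rq->last_blocked_load_update_tick = now;
	rq->has_blocked_load = false;
	return rq->has_blocked_load;
}

int main(void)
{
	struct toy_rq rq = { .has_blocked_load = true, .last_blocked_load_update_tick = 100 };

	printf("%d\n", update_nohz_stats_model(&rq, 100, false));	/* 1: throttled */
	printf("%d\n", update_nohz_stats_model(&rq, 101, false));	/* 0: refreshed */
	return 0;
}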
7797 | /** | 8123 | /** |
7798 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 8124 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
7799 | * @env: The load balancing environment. | 8125 | * @env: The load balancing environment. |
@@ -7816,7 +8142,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
7816 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { | 8142 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
7817 | struct rq *rq = cpu_rq(i); | 8143 | struct rq *rq = cpu_rq(i); |
7818 | 8144 | ||
7819 | /* Bias balancing toward cpus of our domain */ | 8145 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) |
8146 | env->flags |= LBF_NOHZ_AGAIN; | ||
8147 | |||
8148 | /* Bias balancing toward CPUs of our domain: */ | ||
7820 | if (local_group) | 8149 | if (local_group) |
7821 | load = target_load(i, load_idx); | 8150 | load = target_load(i, load_idx); |
7822 | else | 8151 | else |
@@ -7902,7 +8231,7 @@ asym_packing: | |||
7902 | if (!(env->sd->flags & SD_ASYM_PACKING)) | 8231 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
7903 | return true; | 8232 | return true; |
7904 | 8233 | ||
7905 | /* No ASYM_PACKING if target cpu is already busy */ | 8234 | /* No ASYM_PACKING if target CPU is already busy */ |
7906 | if (env->idle == CPU_NOT_IDLE) | 8235 | if (env->idle == CPU_NOT_IDLE) |
7907 | return true; | 8236 | return true; |
7908 | /* | 8237 | /* |
@@ -7915,7 +8244,7 @@ asym_packing: | |||
7915 | if (!sds->busiest) | 8244 | if (!sds->busiest) |
7916 | return true; | 8245 | return true; |
7917 | 8246 | ||
7918 | /* Prefer to move from lowest priority cpu's work */ | 8247 | /* Prefer to move from lowest priority CPU's work */ |
7919 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, | 8248 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, |
7920 | sg->asym_prefer_cpu)) | 8249 | sg->asym_prefer_cpu)) |
7921 | return true; | 8250 | return true; |
@@ -7971,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
7971 | if (child && child->flags & SD_PREFER_SIBLING) | 8300 | if (child && child->flags & SD_PREFER_SIBLING) |
7972 | prefer_sibling = 1; | 8301 | prefer_sibling = 1; |
7973 | 8302 | ||
8303 | #ifdef CONFIG_NO_HZ_COMMON | ||
8304 | if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) | ||
8305 | env->flags |= LBF_NOHZ_STATS; | ||
8306 | #endif | ||
8307 | |||
7974 | load_idx = get_sd_load_idx(env->sd, env->idle); | 8308 | load_idx = get_sd_load_idx(env->sd, env->idle); |
7975 | 8309 | ||
7976 | do { | 8310 | do { |
@@ -8024,6 +8358,15 @@ next_group: | |||
8024 | sg = sg->next; | 8358 | sg = sg->next; |
8025 | } while (sg != env->sd->groups); | 8359 | } while (sg != env->sd->groups); |
8026 | 8360 | ||
8361 | #ifdef CONFIG_NO_HZ_COMMON | ||
8362 | if ((env->flags & LBF_NOHZ_AGAIN) && | ||
8363 | cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { | ||
8364 | |||
8365 | WRITE_ONCE(nohz.next_blocked, | ||
8366 | jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); | ||
8367 | } | ||
8368 | #endif | ||
8369 | |||
8027 | if (env->sd->flags & SD_NUMA) | 8370 | if (env->sd->flags & SD_NUMA) |
8028 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | 8371 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); |
8029 | 8372 | ||
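Taken together, the NOHZ hunks above let a newly idle balancer piggyback blocked-load updates on its walk of the group: LBF_NOHZ_STATS turns the refresh on, LBF_NOHZ_AGAIN records that some rq still had load, and update_sd_lb_stats() then pushes nohz.next_blocked out by roughly LOAD_AVG_PERIOD. A reduced model of that flow; the helper and the "+ 32" re-arm value are illustrative assumptions, not kernel code.

#define LBF_NOHZ_STATS	0x1
#define LBF_NOHZ_AGAIN	0x2

struct lb_env_model { unsigned int flags; };

/* Stand-in for update_nohz_stats(): pretend CPU 2 still has blocked load. */
static int cpu_still_blocked(int cpu)
{
	return cpu == 2;
}

unsigned long walk_group_model(struct lb_env_model *env, unsigned long now,
			       unsigned long next_blocked)
{
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		if ((env->flags & LBF_NOHZ_STATS) && cpu_still_blocked(cpu))
			env->flags |= LBF_NOHZ_AGAIN;	/* someone is still dirty */
	}

	if (env->flags & LBF_NOHZ_AGAIN)
		next_blocked = now + 32;	/* re-arm; 32 is an illustrative stand-in for LOAD_AVG_PERIOD */

	return next_blocked;
}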
@@ -8168,7 +8511,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8168 | if (busiest->group_type == group_imbalanced) { | 8511 | if (busiest->group_type == group_imbalanced) { |
8169 | /* | 8512 | /* |
8170 | * In the group_imb case we cannot rely on group-wide averages | 8513 | * In the group_imb case we cannot rely on group-wide averages |
8171 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 8514 | * to ensure CPU-load equilibrium, look at wider averages. XXX |
8172 | */ | 8515 | */ |
8173 | busiest->load_per_task = | 8516 | busiest->load_per_task = |
8174 | min(busiest->load_per_task, sds->avg_load); | 8517 | min(busiest->load_per_task, sds->avg_load); |
@@ -8187,7 +8530,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8187 | } | 8530 | } |
8188 | 8531 | ||
8189 | /* | 8532 | /* |
8190 | * If there aren't any idle cpus, avoid creating some. | 8533 | * If there aren't any idle CPUs, avoid creating some. |
8191 | */ | 8534 | */ |
8192 | if (busiest->group_type == group_overloaded && | 8535 | if (busiest->group_type == group_overloaded && |
8193 | local->group_type == group_overloaded) { | 8536 | local->group_type == group_overloaded) { |
@@ -8201,9 +8544,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8201 | } | 8544 | } |
8202 | 8545 | ||
8203 | /* | 8546 | /* |
8204 | * We're trying to get all the cpus to the average_load, so we don't | 8547 | * We're trying to get all the CPUs to the average_load, so we don't |
8205 | * want to push ourselves above the average load, nor do we wish to | 8548 | * want to push ourselves above the average load, nor do we wish to |
8206 | * reduce the max loaded cpu below the average load. At the same time, | 8549 | * reduce the max loaded CPU below the average load. At the same time, |
8207 | * we also don't want to reduce the group load below the group | 8550 | * we also don't want to reduce the group load below the group |
8208 | * capacity. Thus we look for the minimum possible imbalance. | 8551 | * capacity. Thus we look for the minimum possible imbalance. |
8209 | */ | 8552 | */ |
@@ -8297,9 +8640,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
8297 | 8640 | ||
8298 | if (env->idle == CPU_IDLE) { | 8641 | if (env->idle == CPU_IDLE) { |
8299 | /* | 8642 | /* |
8300 | * This cpu is idle. If the busiest group is not overloaded | 8643 | * This CPU is idle. If the busiest group is not overloaded |
8301 | * and there is no imbalance between this and busiest group | 8644 | * and there is no imbalance between this and busiest group |
8302 | * wrt idle cpus, it is balanced. The imbalance becomes | 8645 | * wrt idle CPUs, it is balanced. The imbalance becomes |
8303 | * significant if the diff is greater than 1 otherwise we | 8646 | * significant if the diff is greater than 1 otherwise we |
8304 | * might end up to just move the imbalance on another group | 8647 | * might end up to just move the imbalance on another group |
8305 | */ | 8648 | */ |
@@ -8327,7 +8670,7 @@ out_balanced: | |||
8327 | } | 8670 | } |
8328 | 8671 | ||
8329 | /* | 8672 | /* |
8330 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 8673 | * find_busiest_queue - find the busiest runqueue among the CPUs in the group. |
8331 | */ | 8674 | */ |
8332 | static struct rq *find_busiest_queue(struct lb_env *env, | 8675 | static struct rq *find_busiest_queue(struct lb_env *env, |
8333 | struct sched_group *group) | 8676 | struct sched_group *group) |
@@ -8371,7 +8714,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8371 | 8714 | ||
8372 | /* | 8715 | /* |
8373 | * When comparing with imbalance, use weighted_cpuload() | 8716 | * When comparing with imbalance, use weighted_cpuload() |
8374 | * which is not scaled with the cpu capacity. | 8717 | * which is not scaled with the CPU capacity. |
8375 | */ | 8718 | */ |
8376 | 8719 | ||
8377 | if (rq->nr_running == 1 && wl > env->imbalance && | 8720 | if (rq->nr_running == 1 && wl > env->imbalance && |
@@ -8379,9 +8722,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8379 | continue; | 8722 | continue; |
8380 | 8723 | ||
8381 | /* | 8724 | /* |
8382 | * For the load comparisons with the other cpu's, consider | 8725 | * For the load comparisons with the other CPU's, consider |
8383 | * the weighted_cpuload() scaled with the cpu capacity, so | 8726 | * the weighted_cpuload() scaled with the CPU capacity, so |
8384 | * that the load can be moved away from the cpu that is | 8727 | * that the load can be moved away from the CPU that is |
8385 | * potentially running at a lower capacity. | 8728 | * potentially running at a lower capacity. |
8386 | * | 8729 | * |
8387 | * Thus we're looking for max(wl_i / capacity_i), crosswise | 8730 | * Thus we're looking for max(wl_i / capacity_i), crosswise |
@@ -8452,13 +8795,13 @@ static int should_we_balance(struct lb_env *env) | |||
8452 | return 0; | 8795 | return 0; |
8453 | 8796 | ||
8454 | /* | 8797 | /* |
8455 | * In the newly idle case, we will allow all the cpu's | 8798 | * In the newly idle case, we will allow all the CPUs |
8456 | * to do the newly idle load balance. | 8799 | * to do the newly idle load balance. |
8457 | */ | 8800 | */ |
8458 | if (env->idle == CPU_NEWLY_IDLE) | 8801 | if (env->idle == CPU_NEWLY_IDLE) |
8459 | return 1; | 8802 | return 1; |
8460 | 8803 | ||
8461 | /* Try to find first idle cpu */ | 8804 | /* Try to find first idle CPU */ |
8462 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { | 8805 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { |
8463 | if (!idle_cpu(cpu)) | 8806 | if (!idle_cpu(cpu)) |
8464 | continue; | 8807 | continue; |
@@ -8471,7 +8814,7 @@ static int should_we_balance(struct lb_env *env) | |||
8471 | balance_cpu = group_balance_cpu(sg); | 8814 | balance_cpu = group_balance_cpu(sg); |
8472 | 8815 | ||
8473 | /* | 8816 | /* |
8474 | * First idle cpu or the first cpu(busiest) in this sched group | 8817 | * First idle CPU or the first CPU(busiest) in this sched group |
8475 | * is eligible for doing load balancing at this and above domains. | 8818 | * is eligible for doing load balancing at this and above domains. |
8476 | */ | 8819 | */ |
8477 | return balance_cpu == env->dst_cpu; | 8820 | return balance_cpu == env->dst_cpu; |
@@ -8580,7 +8923,7 @@ more_balance: | |||
8580 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 8923 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
8581 | * us and move them to an alternate dst_cpu in our sched_group | 8924 | * us and move them to an alternate dst_cpu in our sched_group |
8582 | * where they can run. The upper limit on how many times we | 8925 | * where they can run. The upper limit on how many times we |
8583 | * iterate on same src_cpu is dependent on number of cpus in our | 8926 | * iterate on same src_cpu is dependent on number of CPUs in our |
8584 | * sched_group. | 8927 | * sched_group. |
8585 | * | 8928 | * |
8586 | * This changes load balance semantics a bit on who can move | 8929 | * This changes load balance semantics a bit on who can move |
@@ -8597,7 +8940,7 @@ more_balance: | |||
8597 | */ | 8940 | */ |
8598 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { | 8941 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
8599 | 8942 | ||
8600 | /* Prevent to re-select dst_cpu via env's cpus */ | 8943 | /* Prevent to re-select dst_cpu via env's CPUs */ |
8601 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | 8944 | cpumask_clear_cpu(env.dst_cpu, env.cpus); |
8602 | 8945 | ||
8603 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 8946 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
@@ -8659,9 +9002,10 @@ more_balance: | |||
8659 | 9002 | ||
8660 | raw_spin_lock_irqsave(&busiest->lock, flags); | 9003 | raw_spin_lock_irqsave(&busiest->lock, flags); |
8661 | 9004 | ||
8662 | /* don't kick the active_load_balance_cpu_stop, | 9005 | /* |
8663 | * if the curr task on busiest cpu can't be | 9006 | * Don't kick the active_load_balance_cpu_stop, |
8664 | * moved to this_cpu | 9007 | * if the curr task on busiest CPU can't be |
9008 | * moved to this_cpu: | ||
8665 | */ | 9009 | */ |
8666 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | 9010 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { |
8667 | raw_spin_unlock_irqrestore(&busiest->lock, | 9011 | raw_spin_unlock_irqrestore(&busiest->lock, |
@@ -8773,121 +9117,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) | |||
8773 | } | 9117 | } |
8774 | 9118 | ||
8775 | /* | 9119 | /* |
8776 | * idle_balance is called by schedule() if this_cpu is about to become | 9120 | * active_load_balance_cpu_stop is run by the CPU stopper. It pushes |
8777 | * idle. Attempts to pull tasks from other CPUs. | ||
8778 | */ | ||
8779 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) | ||
8780 | { | ||
8781 | unsigned long next_balance = jiffies + HZ; | ||
8782 | int this_cpu = this_rq->cpu; | ||
8783 | struct sched_domain *sd; | ||
8784 | int pulled_task = 0; | ||
8785 | u64 curr_cost = 0; | ||
8786 | |||
8787 | /* | ||
8788 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
8789 | * measure the duration of idle_balance() as idle time. | ||
8790 | */ | ||
8791 | this_rq->idle_stamp = rq_clock(this_rq); | ||
8792 | |||
8793 | /* | ||
8794 | * Do not pull tasks towards !active CPUs... | ||
8795 | */ | ||
8796 | if (!cpu_active(this_cpu)) | ||
8797 | return 0; | ||
8798 | |||
8799 | /* | ||
8800 | * This is OK, because current is on_cpu, which avoids it being picked | ||
8801 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
8802 | * further scheduler activity on it and we're being very careful to | ||
8803 | * re-start the picking loop. | ||
8804 | */ | ||
8805 | rq_unpin_lock(this_rq, rf); | ||
8806 | |||
8807 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | ||
8808 | !this_rq->rd->overload) { | ||
8809 | rcu_read_lock(); | ||
8810 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
8811 | if (sd) | ||
8812 | update_next_balance(sd, &next_balance); | ||
8813 | rcu_read_unlock(); | ||
8814 | |||
8815 | goto out; | ||
8816 | } | ||
8817 | |||
8818 | raw_spin_unlock(&this_rq->lock); | ||
8819 | |||
8820 | update_blocked_averages(this_cpu); | ||
8821 | rcu_read_lock(); | ||
8822 | for_each_domain(this_cpu, sd) { | ||
8823 | int continue_balancing = 1; | ||
8824 | u64 t0, domain_cost; | ||
8825 | |||
8826 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
8827 | continue; | ||
8828 | |||
8829 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | ||
8830 | update_next_balance(sd, &next_balance); | ||
8831 | break; | ||
8832 | } | ||
8833 | |||
8834 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
8835 | t0 = sched_clock_cpu(this_cpu); | ||
8836 | |||
8837 | pulled_task = load_balance(this_cpu, this_rq, | ||
8838 | sd, CPU_NEWLY_IDLE, | ||
8839 | &continue_balancing); | ||
8840 | |||
8841 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
8842 | if (domain_cost > sd->max_newidle_lb_cost) | ||
8843 | sd->max_newidle_lb_cost = domain_cost; | ||
8844 | |||
8845 | curr_cost += domain_cost; | ||
8846 | } | ||
8847 | |||
8848 | update_next_balance(sd, &next_balance); | ||
8849 | |||
8850 | /* | ||
8851 | * Stop searching for tasks to pull if there are | ||
8852 | * now runnable tasks on this rq. | ||
8853 | */ | ||
8854 | if (pulled_task || this_rq->nr_running > 0) | ||
8855 | break; | ||
8856 | } | ||
8857 | rcu_read_unlock(); | ||
8858 | |||
8859 | raw_spin_lock(&this_rq->lock); | ||
8860 | |||
8861 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
8862 | this_rq->max_idle_balance_cost = curr_cost; | ||
8863 | |||
8864 | /* | ||
8865 | * While browsing the domains, we released the rq lock, a task could | ||
8866 | * have been enqueued in the meantime. Since we're not going idle, | ||
8867 | * pretend we pulled a task. | ||
8868 | */ | ||
8869 | if (this_rq->cfs.h_nr_running && !pulled_task) | ||
8870 | pulled_task = 1; | ||
8871 | |||
8872 | out: | ||
8873 | /* Move the next balance forward */ | ||
8874 | if (time_after(this_rq->next_balance, next_balance)) | ||
8875 | this_rq->next_balance = next_balance; | ||
8876 | |||
8877 | /* Is there a task of a high priority class? */ | ||
8878 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | ||
8879 | pulled_task = -1; | ||
8880 | |||
8881 | if (pulled_task) | ||
8882 | this_rq->idle_stamp = 0; | ||
8883 | |||
8884 | rq_repin_lock(this_rq, rf); | ||
8885 | |||
8886 | return pulled_task; | ||
8887 | } | ||
8888 | |||
8889 | /* | ||
8890 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes | ||
8891 | * running tasks off the busiest CPU onto idle CPUs. It requires at | 9121 | * running tasks off the busiest CPU onto idle CPUs. It requires at |
8892 | * least 1 task to be running on each physical CPU where possible, and | 9122 | * least 1 task to be running on each physical CPU where possible, and |
8893 | * avoids physical / logical imbalances. | 9123 | * avoids physical / logical imbalances. |
@@ -8911,7 +9141,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8911 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) | 9141 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) |
8912 | goto out_unlock; | 9142 | goto out_unlock; |
8913 | 9143 | ||
8914 | /* make sure the requested cpu hasn't gone down in the meantime */ | 9144 | /* Make sure the requested CPU hasn't gone down in the meantime: */ |
8915 | if (unlikely(busiest_cpu != smp_processor_id() || | 9145 | if (unlikely(busiest_cpu != smp_processor_id() || |
8916 | !busiest_rq->active_balance)) | 9146 | !busiest_rq->active_balance)) |
8917 | goto out_unlock; | 9147 | goto out_unlock; |
@@ -8923,7 +9153,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8923 | /* | 9153 | /* |
8924 | * This condition is "impossible", if it occurs | 9154 | * This condition is "impossible", if it occurs |
8925 | * we need to fix it. Originally reported by | 9155 | * we need to fix it. Originally reported by |
8926 | * Bjorn Helgaas on a 128-cpu setup. | 9156 | * Bjorn Helgaas on a 128-CPU setup. |
8927 | */ | 9157 | */ |
8928 | BUG_ON(busiest_rq == target_rq); | 9158 | BUG_ON(busiest_rq == target_rq); |
8929 | 9159 | ||
@@ -8977,141 +9207,6 @@ out_unlock: | |||
8977 | return 0; | 9207 | return 0; |
8978 | } | 9208 | } |
8979 | 9209 | ||
8980 | static inline int on_null_domain(struct rq *rq) | ||
8981 | { | ||
8982 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
8983 | } | ||
8984 | |||
8985 | #ifdef CONFIG_NO_HZ_COMMON | ||
8986 | /* | ||
8987 | * idle load balancing details | ||
8988 | * - When one of the busy CPUs notice that there may be an idle rebalancing | ||
8989 | * needed, they will kick the idle load balancer, which then does idle | ||
8990 | * load balancing for all the idle CPUs. | ||
8991 | */ | ||
8992 | static struct { | ||
8993 | cpumask_var_t idle_cpus_mask; | ||
8994 | atomic_t nr_cpus; | ||
8995 | unsigned long next_balance; /* in jiffy units */ | ||
8996 | } nohz ____cacheline_aligned; | ||
8997 | |||
8998 | static inline int find_new_ilb(void) | ||
8999 | { | ||
9000 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
9001 | |||
9002 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) | ||
9003 | return ilb; | ||
9004 | |||
9005 | return nr_cpu_ids; | ||
9006 | } | ||
9007 | |||
9008 | /* | ||
9009 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
9010 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
9011 | * CPU (if there is one). | ||
9012 | */ | ||
9013 | static void nohz_balancer_kick(void) | ||
9014 | { | ||
9015 | int ilb_cpu; | ||
9016 | |||
9017 | nohz.next_balance++; | ||
9018 | |||
9019 | ilb_cpu = find_new_ilb(); | ||
9020 | |||
9021 | if (ilb_cpu >= nr_cpu_ids) | ||
9022 | return; | ||
9023 | |||
9024 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) | ||
9025 | return; | ||
9026 | /* | ||
9027 | * Use smp_send_reschedule() instead of resched_cpu(). | ||
9028 | * This way we generate a sched IPI on the target cpu which | ||
9029 | * is idle. And the softirq performing nohz idle load balance | ||
9030 | * will be run before returning from the IPI. | ||
9031 | */ | ||
9032 | smp_send_reschedule(ilb_cpu); | ||
9033 | return; | ||
9034 | } | ||
9035 | |||
9036 | void nohz_balance_exit_idle(unsigned int cpu) | ||
9037 | { | ||
9038 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | ||
9039 | /* | ||
9040 | * Completely isolated CPUs don't ever set, so we must test. | ||
9041 | */ | ||
9042 | if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { | ||
9043 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
9044 | atomic_dec(&nohz.nr_cpus); | ||
9045 | } | ||
9046 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
9047 | } | ||
9048 | } | ||
9049 | |||
9050 | static inline void set_cpu_sd_state_busy(void) | ||
9051 | { | ||
9052 | struct sched_domain *sd; | ||
9053 | int cpu = smp_processor_id(); | ||
9054 | |||
9055 | rcu_read_lock(); | ||
9056 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9057 | |||
9058 | if (!sd || !sd->nohz_idle) | ||
9059 | goto unlock; | ||
9060 | sd->nohz_idle = 0; | ||
9061 | |||
9062 | atomic_inc(&sd->shared->nr_busy_cpus); | ||
9063 | unlock: | ||
9064 | rcu_read_unlock(); | ||
9065 | } | ||
9066 | |||
9067 | void set_cpu_sd_state_idle(void) | ||
9068 | { | ||
9069 | struct sched_domain *sd; | ||
9070 | int cpu = smp_processor_id(); | ||
9071 | |||
9072 | rcu_read_lock(); | ||
9073 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9074 | |||
9075 | if (!sd || sd->nohz_idle) | ||
9076 | goto unlock; | ||
9077 | sd->nohz_idle = 1; | ||
9078 | |||
9079 | atomic_dec(&sd->shared->nr_busy_cpus); | ||
9080 | unlock: | ||
9081 | rcu_read_unlock(); | ||
9082 | } | ||
9083 | |||
9084 | /* | ||
9085 | * This routine will record that the cpu is going idle with tick stopped. | ||
9086 | * This info will be used in performing idle load balancing in the future. | ||
9087 | */ | ||
9088 | void nohz_balance_enter_idle(int cpu) | ||
9089 | { | ||
9090 | /* | ||
9091 | * If this cpu is going down, then nothing needs to be done. | ||
9092 | */ | ||
9093 | if (!cpu_active(cpu)) | ||
9094 | return; | ||
9095 | |||
9096 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
9097 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) | ||
9098 | return; | ||
9099 | |||
9100 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | ||
9101 | return; | ||
9102 | |||
9103 | /* | ||
9104 | * If we're a completely isolated CPU, we don't play. | ||
9105 | */ | ||
9106 | if (on_null_domain(cpu_rq(cpu))) | ||
9107 | return; | ||
9108 | |||
9109 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | ||
9110 | atomic_inc(&nohz.nr_cpus); | ||
9111 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
9112 | } | ||
9113 | #endif | ||
9114 | |||
9115 | static DEFINE_SPINLOCK(balancing); | 9210 | static DEFINE_SPINLOCK(balancing); |
9116 | 9211 | ||
9117 | /* | 9212 | /* |
@@ -9141,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
9141 | int need_serialize, need_decay = 0; | 9236 | int need_serialize, need_decay = 0; |
9142 | u64 max_cost = 0; | 9237 | u64 max_cost = 0; |
9143 | 9238 | ||
9144 | update_blocked_averages(cpu); | ||
9145 | |||
9146 | rcu_read_lock(); | 9239 | rcu_read_lock(); |
9147 | for_each_domain(cpu, sd) { | 9240 | for_each_domain(cpu, sd) { |
9148 | /* | 9241 | /* |
@@ -9232,68 +9325,56 @@ out: | |||
9232 | } | 9325 | } |
9233 | } | 9326 | } |
9234 | 9327 | ||
9328 | static inline int on_null_domain(struct rq *rq) | ||
9329 | { | ||
9330 | return unlikely(!rcu_dereference_sched(rq->sd)); | ||
9331 | } | ||
9332 | |||
9235 | #ifdef CONFIG_NO_HZ_COMMON | 9333 | #ifdef CONFIG_NO_HZ_COMMON |
9236 | /* | 9334 | /* |
9237 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 9335 | * idle load balancing details |
9238 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 9336 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
9337 | * needed, they will kick the idle load balancer, which then does idle | ||
9338 | * load balancing for all the idle CPUs. | ||
9239 | */ | 9339 | */ |
9240 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
9241 | { | ||
9242 | int this_cpu = this_rq->cpu; | ||
9243 | struct rq *rq; | ||
9244 | int balance_cpu; | ||
9245 | /* Earliest time when we have to do rebalance again */ | ||
9246 | unsigned long next_balance = jiffies + 60*HZ; | ||
9247 | int update_next_balance = 0; | ||
9248 | 9340 | ||
9249 | if (idle != CPU_IDLE || | 9341 | static inline int find_new_ilb(void) |
9250 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 9342 | { |
9251 | goto end; | 9343 | int ilb = cpumask_first(nohz.idle_cpus_mask); |
9252 | 9344 | ||
9253 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 9345 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
9254 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | 9346 | return ilb; |
9255 | continue; | ||
9256 | 9347 | ||
9257 | /* | 9348 | return nr_cpu_ids; |
9258 | * If this cpu gets work to do, stop the load balancing | 9349 | } |
9259 | * work being done for other cpus. Next load | ||
9260 | * balancing owner will pick it up. | ||
9261 | */ | ||
9262 | if (need_resched()) | ||
9263 | break; | ||
9264 | 9350 | ||
9265 | rq = cpu_rq(balance_cpu); | 9351 | /* |
9352 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
9353 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
9354 | * CPU (if there is one). | ||
9355 | */ | ||
9356 | static void kick_ilb(unsigned int flags) | ||
9357 | { | ||
9358 | int ilb_cpu; | ||
9266 | 9359 | ||
9267 | /* | 9360 | nohz.next_balance++; |
9268 | * If time for next balance is due, | ||
9269 | * do the balance. | ||
9270 | */ | ||
9271 | if (time_after_eq(jiffies, rq->next_balance)) { | ||
9272 | struct rq_flags rf; | ||
9273 | 9361 | ||
9274 | rq_lock_irq(rq, &rf); | 9362 | ilb_cpu = find_new_ilb(); |
9275 | update_rq_clock(rq); | ||
9276 | cpu_load_update_idle(rq); | ||
9277 | rq_unlock_irq(rq, &rf); | ||
9278 | 9363 | ||
9279 | rebalance_domains(rq, CPU_IDLE); | 9364 | if (ilb_cpu >= nr_cpu_ids) |
9280 | } | 9365 | return; |
9281 | 9366 | ||
9282 | if (time_after(next_balance, rq->next_balance)) { | 9367 | flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); |
9283 | next_balance = rq->next_balance; | 9368 | if (flags & NOHZ_KICK_MASK) |
9284 | update_next_balance = 1; | 9369 | return; |
9285 | } | ||
9286 | } | ||
9287 | 9370 | ||
9288 | /* | 9371 | /* |
9289 | * next_balance will be updated only when there is a need. | 9372 | * Use smp_send_reschedule() instead of resched_cpu(). |
9290 | * When the CPU is attached to null domain for ex, it will not be | 9373 | * This way we generate a sched IPI on the target CPU which |
9291 | * updated. | 9374 | * is idle. And the softirq performing nohz idle load balance |
9375 | * will be run before returning from the IPI. | ||
9292 | */ | 9376 | */ |
9293 | if (likely(update_next_balance)) | 9377 | smp_send_reschedule(ilb_cpu); |
9294 | nohz.next_balance = next_balance; | ||
9295 | end: | ||
9296 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
9297 | } | 9378 | } |
9298 | 9379 | ||
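kick_ilb() relies on the fetch-or returning the previous flag word: only the caller that first sets a kick bit sends the rescheduling IPI, later callers merely merge their request into the pending one. A userspace model using C11 atomics in place of the kernel's atomic_t, for illustration only.

#include <stdatomic.h>
#include <stdio.h>

#define NOHZ_BALANCE_KICK	0x1
#define NOHZ_STATS_KICK		0x2
#define NOHZ_KICK_MASK		(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

static atomic_uint ilb_cpu_flags;	/* the chosen ilb CPU's nohz flag word */
static int ipis_sent;

static void kick_ilb_model(unsigned int flags)
{
	unsigned int old = atomic_fetch_or(&ilb_cpu_flags, flags);

	if (old & NOHZ_KICK_MASK)
		return;			/* a kick is already pending: no second IPI */

	ipis_sent++;			/* stands in for smp_send_reschedule(ilb_cpu) */
}

int main(void)
{
	kick_ilb_model(NOHZ_STATS_KICK);
	kick_ilb_model(NOHZ_BALANCE_KICK);	/* merged into the already-pending kick */
	printf("IPIs sent: %d\n", ipis_sent);	/* prints 1 */
	return 0;
}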
9299 | /* | 9380 | /* |
@@ -9307,36 +9388,41 @@ end: | |||
9307 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 9388 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
9308 | * domain span are idle. | 9389 | * domain span are idle. |
9309 | */ | 9390 | */ |
9310 | static inline bool nohz_kick_needed(struct rq *rq) | 9391 | static void nohz_balancer_kick(struct rq *rq) |
9311 | { | 9392 | { |
9312 | unsigned long now = jiffies; | 9393 | unsigned long now = jiffies; |
9313 | struct sched_domain_shared *sds; | 9394 | struct sched_domain_shared *sds; |
9314 | struct sched_domain *sd; | 9395 | struct sched_domain *sd; |
9315 | int nr_busy, i, cpu = rq->cpu; | 9396 | int nr_busy, i, cpu = rq->cpu; |
9316 | bool kick = false; | 9397 | unsigned int flags = 0; |
9317 | 9398 | ||
9318 | if (unlikely(rq->idle_balance)) | 9399 | if (unlikely(rq->idle_balance)) |
9319 | return false; | 9400 | return; |
9320 | 9401 | ||
9321 | /* | 9402 | /* |
9322 | * We may be recently in ticked or tickless idle mode. At the first | 9403 | * We may be recently in ticked or tickless idle mode. At the first |
9323 | * busy tick after returning from idle, we will update the busy stats. | 9404 | * busy tick after returning from idle, we will update the busy stats. |
9324 | */ | 9405 | */ |
9325 | set_cpu_sd_state_busy(); | 9406 | nohz_balance_exit_idle(rq); |
9326 | nohz_balance_exit_idle(cpu); | ||
9327 | 9407 | ||
9328 | /* | 9408 | /* |
9329 | * None are in tickless mode and hence no need for NOHZ idle load | 9409 | * None are in tickless mode and hence no need for NOHZ idle load |
9330 | * balancing. | 9410 | * balancing. |
9331 | */ | 9411 | */ |
9332 | if (likely(!atomic_read(&nohz.nr_cpus))) | 9412 | if (likely(!atomic_read(&nohz.nr_cpus))) |
9333 | return false; | 9413 | return; |
9414 | |||
9415 | if (READ_ONCE(nohz.has_blocked) && | ||
9416 | time_after(now, READ_ONCE(nohz.next_blocked))) | ||
9417 | flags = NOHZ_STATS_KICK; | ||
9334 | 9418 | ||
9335 | if (time_before(now, nohz.next_balance)) | 9419 | if (time_before(now, nohz.next_balance)) |
9336 | return false; | 9420 | goto out; |
9337 | 9421 | ||
9338 | if (rq->nr_running >= 2) | 9422 | if (rq->nr_running >= 2) { |
9339 | return true; | 9423 | flags = NOHZ_KICK_MASK; |
9424 | goto out; | ||
9425 | } | ||
9340 | 9426 | ||
9341 | rcu_read_lock(); | 9427 | rcu_read_lock(); |
9342 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | 9428 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
@@ -9347,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
9347 | */ | 9433 | */ |
9348 | nr_busy = atomic_read(&sds->nr_busy_cpus); | 9434 | nr_busy = atomic_read(&sds->nr_busy_cpus); |
9349 | if (nr_busy > 1) { | 9435 | if (nr_busy > 1) { |
9350 | kick = true; | 9436 | flags = NOHZ_KICK_MASK; |
9351 | goto unlock; | 9437 | goto unlock; |
9352 | } | 9438 | } |
9353 | 9439 | ||
@@ -9357,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
9357 | if (sd) { | 9443 | if (sd) { |
9358 | if ((rq->cfs.h_nr_running >= 1) && | 9444 | if ((rq->cfs.h_nr_running >= 1) && |
9359 | check_cpu_capacity(rq, sd)) { | 9445 | check_cpu_capacity(rq, sd)) { |
9360 | kick = true; | 9446 | flags = NOHZ_KICK_MASK; |
9361 | goto unlock; | 9447 | goto unlock; |
9362 | } | 9448 | } |
9363 | } | 9449 | } |
@@ -9370,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
9370 | continue; | 9456 | continue; |
9371 | 9457 | ||
9372 | if (sched_asym_prefer(i, cpu)) { | 9458 | if (sched_asym_prefer(i, cpu)) { |
9373 | kick = true; | 9459 | flags = NOHZ_KICK_MASK; |
9374 | goto unlock; | 9460 | goto unlock; |
9375 | } | 9461 | } |
9376 | } | 9462 | } |
9377 | } | 9463 | } |
9378 | unlock: | 9464 | unlock: |
9379 | rcu_read_unlock(); | 9465 | rcu_read_unlock(); |
9380 | return kick; | 9466 | out: |
9467 | if (flags) | ||
9468 | kick_ilb(flags); | ||
9469 | } | ||
9470 | |||
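The rewritten nohz_balancer_kick() no longer returns a boolean; it accumulates a flag word and hands it to kick_ilb() at the end. Condensing the ladder above into one pure function makes the precedence easier to see; the field names are stand-ins for the rq/nohz state, and the LLC-capacity and ASYM_PACKING branches are collapsed into a single input.

#define NOHZ_BALANCE_KICK	0x1
#define NOHZ_STATS_KICK		0x2
#define NOHZ_KICK_MASK		(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

struct kick_input {
	int nr_idle_cpus;	/* atomic_read(&nohz.nr_cpus) */
	int has_blocked;	/* READ_ONCE(nohz.has_blocked) */
	int blocked_stale;	/* time_after(now, nohz.next_blocked) */
	int balance_due;	/* !time_before(now, nohz.next_balance) */
	int nr_running;		/* rq->nr_running */
	int llc_or_asym_kick;	/* the sd_llc / sd_asym conditions, collapsed */
};

unsigned int nohz_kick_flags(const struct kick_input *in)
{
	unsigned int flags = 0;

	if (!in->nr_idle_cpus)
		return 0;			/* nobody is tickless idle */

	if (in->has_blocked && in->blocked_stale)
		flags = NOHZ_STATS_KICK;	/* blocked load needs a refresh */

	if (!in->balance_due)
		return flags;			/* possibly a stats-only kick */

	if (in->nr_running >= 2 || in->llc_or_asym_kick)
		flags = NOHZ_KICK_MASK;		/* full idle load balance */

	return flags;
}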
9471 | static void set_cpu_sd_state_busy(int cpu) | ||
9472 | { | ||
9473 | struct sched_domain *sd; | ||
9474 | |||
9475 | rcu_read_lock(); | ||
9476 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9477 | |||
9478 | if (!sd || !sd->nohz_idle) | ||
9479 | goto unlock; | ||
9480 | sd->nohz_idle = 0; | ||
9481 | |||
9482 | atomic_inc(&sd->shared->nr_busy_cpus); | ||
9483 | unlock: | ||
9484 | rcu_read_unlock(); | ||
9485 | } | ||
9486 | |||
9487 | void nohz_balance_exit_idle(struct rq *rq) | ||
9488 | { | ||
9489 | SCHED_WARN_ON(rq != this_rq()); | ||
9490 | |||
9491 | if (likely(!rq->nohz_tick_stopped)) | ||
9492 | return; | ||
9493 | |||
9494 | rq->nohz_tick_stopped = 0; | ||
9495 | cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); | ||
9496 | atomic_dec(&nohz.nr_cpus); | ||
9497 | |||
9498 | set_cpu_sd_state_busy(rq->cpu); | ||
9499 | } | ||
9500 | |||
9501 | static void set_cpu_sd_state_idle(int cpu) | ||
9502 | { | ||
9503 | struct sched_domain *sd; | ||
9504 | |||
9505 | rcu_read_lock(); | ||
9506 | sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||
9507 | |||
9508 | if (!sd || sd->nohz_idle) | ||
9509 | goto unlock; | ||
9510 | sd->nohz_idle = 1; | ||
9511 | |||
9512 | atomic_dec(&sd->shared->nr_busy_cpus); | ||
9513 | unlock: | ||
9514 | rcu_read_unlock(); | ||
9515 | } | ||
9516 | |||
9517 | /* | ||
9518 | * This routine will record that the CPU is going idle with tick stopped. | ||
9519 | * This info will be used in performing idle load balancing in the future. | ||
9520 | */ | ||
9521 | void nohz_balance_enter_idle(int cpu) | ||
9522 | { | ||
9523 | struct rq *rq = cpu_rq(cpu); | ||
9524 | |||
9525 | SCHED_WARN_ON(cpu != smp_processor_id()); | ||
9526 | |||
9527 | /* If this CPU is going down, then nothing needs to be done: */ | ||
9528 | if (!cpu_active(cpu)) | ||
9529 | return; | ||
9530 | |||
9531 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
9532 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) | ||
9533 | return; | ||
9534 | |||
9535 | /* | ||
9536 | * Can be set safely without rq->lock held. | ||
9537 | * If a clear happens, it will have observed the latest additions, because | ||
9538 | * rq->lock is held during both the check and the clear. | ||
9539 | */ | ||
9540 | rq->has_blocked_load = 1; | ||
9541 | |||
9542 | /* | ||
9543 | * The tick is still stopped but load could have been added in the | ||
9544 | * meantime. We set the nohz.has_blocked flag to trigger a check of the | ||
9545 | * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear | ||
9546 | * of nohz.has_blocked can only happen after checking the new load. | ||
9547 | */ | ||
9548 | if (rq->nohz_tick_stopped) | ||
9549 | goto out; | ||
9550 | |||
9551 | /* If we're a completely isolated CPU, we don't play: */ | ||
9552 | if (on_null_domain(rq)) | ||
9553 | return; | ||
9554 | |||
9555 | rq->nohz_tick_stopped = 1; | ||
9556 | |||
9557 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | ||
9558 | atomic_inc(&nohz.nr_cpus); | ||
9559 | |||
9560 | /* | ||
9561 | * Ensures that if nohz_idle_balance() fails to observe our | ||
9562 | * @idle_cpus_mask store, it must observe the @has_blocked | ||
9563 | * store. | ||
9564 | */ | ||
9565 | smp_mb__after_atomic(); | ||
9566 | |||
9567 | set_cpu_sd_state_idle(cpu); | ||
9568 | |||
9569 | out: | ||
9570 | /* | ||
9571 | * Each time a CPU enters idle, we assume that it has blocked load and | ||
9572 | * enable the periodic update of the load of idle CPUs. | ||
9573 | */ | ||
9574 | WRITE_ONCE(nohz.has_blocked, 1); | ||
9575 | } | ||
9576 | |||
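The ordering in nohz_balance_enter_idle() is the subtle part: rq->has_blocked_load and the idle_cpus_mask bit are published before nohz.has_blocked, with smp_mb__after_atomic() in between, while _nohz_idle_balance() issues the pairing smp_mb() after clearing nohz.has_blocked. A userspace rendering with C11 fences standing in for the kernel barriers; this is an illustration of the argument, not kernel code.

#include <stdatomic.h>

static atomic_int rq_has_blocked_load;	/* this rq's blocked-load flag */
static atomic_int idle_cpus_mask_bit;	/* this CPU's bit in nohz.idle_cpus_mask */
static atomic_int nohz_has_blocked;	/* the global nohz.has_blocked */

void enter_idle_model(void)
{
	atomic_store(&rq_has_blocked_load, 1);		/* new blocked load to look at */
	atomic_store(&idle_cpus_mask_bit, 1);		/* publish: this CPU is nohz idle */
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__after_atomic() */
	atomic_store(&nohz_has_blocked, 1);		/* "there is blocked load somewhere" */
}

void balancer_model(void)
{
	atomic_store(&nohz_has_blocked, 0);		/* optimistically claim "all clean" */
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb(), pairs with the fence above */

	if (atomic_load(&idle_cpus_mask_bit) && atomic_load(&rq_has_blocked_load))
		atomic_store(&nohz_has_blocked, 1);	/* still dirty: re-arm */

	/*
	 * Store-buffering argument: with both fences in place, either the
	 * balancer sees the mask bit (and so reads that rq's blocked load),
	 * or the idle CPU's later nohz_has_blocked = 1 store lands after the
	 * clear above, so blocked load is never silently forgotten.
	 */
}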
9577 | /* | ||
9578 | * Internal function that runs load balance for all idle CPUs. The load balance | ||
9579 | * can be a simple update of blocked load or a complete load balance with | ||
9580 | * task movement, depending on the flags. | ||
9581 | * The function returns false if the loop has stopped before running | ||
9582 | * through all idle CPUs. | ||
9583 | */ | ||
9584 | static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, | ||
9585 | enum cpu_idle_type idle) | ||
9586 | { | ||
9587 | /* Earliest time when we have to do rebalance again */ | ||
9588 | unsigned long now = jiffies; | ||
9589 | unsigned long next_balance = now + 60*HZ; | ||
9590 | bool has_blocked_load = false; | ||
9591 | int update_next_balance = 0; | ||
9592 | int this_cpu = this_rq->cpu; | ||
9593 | int balance_cpu; | ||
9594 | int ret = false; | ||
9595 | struct rq *rq; | ||
9596 | |||
9597 | SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); | ||
9598 | |||
9599 | /* | ||
9600 | * We assume there will be no idle load after this update and clear | ||
9601 | * the has_blocked flag. If a CPU enters idle in the meantime, it will | ||
9602 | * set the has_blocked flag and trigger another update of idle load. | ||
9603 | * Because a CPU that becomes idle is added to idle_cpus_mask before | ||
9604 | * setting the flag, we are sure not to clear the state and miss | ||
9605 | * checking the load of a newly idle CPU. | ||
9606 | */ | ||
9607 | WRITE_ONCE(nohz.has_blocked, 0); | ||
9608 | |||
9609 | /* | ||
9610 | * Ensures that if we miss the CPU, we must see the has_blocked | ||
9611 | * store from nohz_balance_enter_idle(). | ||
9612 | */ | ||
9613 | smp_mb(); | ||
9614 | |||
9615 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
9616 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) | ||
9617 | continue; | ||
9618 | |||
9619 | /* | ||
9620 | * If this CPU gets work to do, stop the load balancing | ||
9621 | * work being done for other CPUs. Next load | ||
9622 | * balancing owner will pick it up. | ||
9623 | */ | ||
9624 | if (need_resched()) { | ||
9625 | has_blocked_load = true; | ||
9626 | goto abort; | ||
9627 | } | ||
9628 | |||
9629 | rq = cpu_rq(balance_cpu); | ||
9630 | |||
9631 | has_blocked_load |= update_nohz_stats(rq, true); | ||
9632 | |||
9633 | /* | ||
9634 | * If time for next balance is due, | ||
9635 | * do the balance. | ||
9636 | */ | ||
9637 | if (time_after_eq(jiffies, rq->next_balance)) { | ||
9638 | struct rq_flags rf; | ||
9639 | |||
9640 | rq_lock_irqsave(rq, &rf); | ||
9641 | update_rq_clock(rq); | ||
9642 | cpu_load_update_idle(rq); | ||
9643 | rq_unlock_irqrestore(rq, &rf); | ||
9644 | |||
9645 | if (flags & NOHZ_BALANCE_KICK) | ||
9646 | rebalance_domains(rq, CPU_IDLE); | ||
9647 | } | ||
9648 | |||
9649 | if (time_after(next_balance, rq->next_balance)) { | ||
9650 | next_balance = rq->next_balance; | ||
9651 | update_next_balance = 1; | ||
9652 | } | ||
9653 | } | ||
9654 | |||
9655 | /* Newly idle CPU doesn't need an update */ | ||
9656 | if (idle != CPU_NEWLY_IDLE) { | ||
9657 | update_blocked_averages(this_cpu); | ||
9658 | has_blocked_load |= this_rq->has_blocked_load; | ||
9659 | } | ||
9660 | |||
9661 | if (flags & NOHZ_BALANCE_KICK) | ||
9662 | rebalance_domains(this_rq, CPU_IDLE); | ||
9663 | |||
9664 | WRITE_ONCE(nohz.next_blocked, | ||
9665 | now + msecs_to_jiffies(LOAD_AVG_PERIOD)); | ||
9666 | |||
9667 | /* The full idle balance loop has been done */ | ||
9668 | ret = true; | ||
9669 | |||
9670 | abort: | ||
9671 | /* There is still blocked load, enable periodic update */ | ||
9672 | if (has_blocked_load) | ||
9673 | WRITE_ONCE(nohz.has_blocked, 1); | ||
9674 | |||
9675 | /* | ||
9676 | * next_balance will be updated only when there is a need. | ||
9677 | * When the CPU is attached to null domain for ex, it will not be | ||
9678 | * updated. | ||
9679 | */ | ||
9680 | if (likely(update_next_balance)) | ||
9681 | nohz.next_balance = next_balance; | ||
9682 | |||
9683 | return ret; | ||
9684 | } | ||
9685 | |||
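Beyond the barriers, _nohz_idle_balance() follows a simple re-arm discipline: clear nohz.has_blocked optimistically, scan the idle CPUs, and set it back if the scan aborted on need_resched() or any CPU still had blocked load. A stripped-down sketch of that control flow with stub helpers in place of the kernel's.

#include <stdbool.h>

static bool need_resched_model(void)		{ return false; }	/* stub */
static bool cpu_still_has_blocked(int cpu)	{ return cpu == 1; }	/* stub */

bool scan_idle_cpus_model(int nr_cpus, bool *nohz_has_blocked)
{
	bool has_blocked = false;
	bool completed = false;
	int cpu;

	*nohz_has_blocked = false;		/* WRITE_ONCE(nohz.has_blocked, 0) */

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (need_resched_model()) {
			has_blocked = true;	/* be conservative about unvisited CPUs */
			goto abort;
		}
		has_blocked |= cpu_still_has_blocked(cpu);
		/* the real loop also runs rebalance_domains() for CPUs whose balance is due */
	}
	completed = true;			/* the full idle balance loop ran */
abort:
	if (has_blocked)
		*nohz_has_blocked = true;	/* re-arm the periodic blocked-load update */

	return completed;
}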
9686 | /* | ||
9687 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | ||
9688 | * rebalancing for all the CPUs whose scheduler ticks are stopped. | ||
9689 | */ | ||
9690 | static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
9691 | { | ||
9692 | int this_cpu = this_rq->cpu; | ||
9693 | unsigned int flags; | ||
9694 | |||
9695 | if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) | ||
9696 | return false; | ||
9697 | |||
9698 | if (idle != CPU_IDLE) { | ||
9699 | atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); | ||
9700 | return false; | ||
9701 | } | ||
9702 | |||
9703 | /* | ||
9704 | * barrier, pairs with nohz_balance_enter_idle(), ensures ... | ||
9705 | */ | ||
9706 | flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); | ||
9707 | if (!(flags & NOHZ_KICK_MASK)) | ||
9708 | return false; | ||
9709 | |||
9710 | _nohz_idle_balance(this_rq, flags, idle); | ||
9711 | |||
9712 | return true; | ||
9713 | } | ||
9714 | |||
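On the kickee side, nohz_idle_balance() consumes its pending kick with a fetch-and-clear, so the work runs at most once per kick even if further kicks race in. A rough userspace equivalent, using C11 atomic_fetch_and() where the kernel uses atomic_fetch_andnot(); again an illustration, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>

#define NOHZ_KICK_MASK	0x3

static atomic_uint this_cpu_nohz_flags;

bool consume_kick_model(bool is_idle, unsigned int *flags_out)
{
	unsigned int flags;

	if (!(atomic_load(&this_cpu_nohz_flags) & NOHZ_KICK_MASK))
		return false;			/* cheap check: nothing pending */

	/* stands in for atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)) */
	flags = atomic_fetch_and(&this_cpu_nohz_flags, ~(unsigned int)NOHZ_KICK_MASK);
	flags &= NOHZ_KICK_MASK;

	if (!is_idle || !flags)
		return false;			/* went busy, or someone else consumed the kick */

	*flags_out = flags;			/* caller runs _nohz_idle_balance(flags) */
	return true;
}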
9715 | static void nohz_newidle_balance(struct rq *this_rq) | ||
9716 | { | ||
9717 | int this_cpu = this_rq->cpu; | ||
9718 | |||
9719 | /* | ||
9720 | * This CPU doesn't want to be disturbed by scheduler | ||
9721 | * housekeeping | ||
9722 | */ | ||
9723 | if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) | ||
9724 | return; | ||
9725 | |||
9726 | /* Will wake up very soon. No time for doing anything else */ | ||
9727 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
9728 | return; | ||
9729 | |||
9730 | /* Don't need to update blocked load of idle CPUs */ | ||
9731 | if (!READ_ONCE(nohz.has_blocked) || | ||
9732 | time_before(jiffies, READ_ONCE(nohz.next_blocked))) | ||
9733 | return; | ||
9734 | |||
9735 | raw_spin_unlock(&this_rq->lock); | ||
9736 | /* | ||
9737 | * This CPU is going to be idle and blocked load of idle CPUs | ||
9738 | * need to be updated. Run the ilb locally as it is a good | ||
9739 | * candidate for ilb instead of waking up another idle CPU. | ||
9740 | * Kick a normal ilb if we failed to do the update. | ||
9741 | */ | ||
9742 | if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) | ||
9743 | kick_ilb(NOHZ_STATS_KICK); | ||
9744 | raw_spin_lock(&this_rq->lock); | ||
9745 | } | ||
9746 | |||
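nohz_newidle_balance() is deliberately cheap: it only runs the stats-only pass locally when this CPU does scheduler housekeeping, expects to stay idle longer than the migration cost, and the blocked-load timestamp is actually stale; otherwise it does nothing or delegates via kick_ilb(NOHZ_STATS_KICK). The gating, folded into one predicate with stand-in field names.

#include <stdbool.h>

struct newidle_input {
	bool housekeeping;		/* housekeeping_cpu(cpu, HK_FLAG_SCHED) */
	unsigned long avg_idle;		/* this_rq->avg_idle */
	unsigned long migration_cost;	/* sysctl_sched_migration_cost */
	bool has_blocked;		/* READ_ONCE(nohz.has_blocked) */
	bool blocked_stale;		/* !time_before(jiffies, nohz.next_blocked) */
};

bool should_update_blocked_locally(const struct newidle_input *in)
{
	if (!in->housekeeping)
		return false;		/* this CPU opted out of scheduler housekeeping */
	if (in->avg_idle < in->migration_cost)
		return false;		/* will wake again before the update pays off */
	if (!in->has_blocked || !in->blocked_stale)
		return false;		/* nothing stale to refresh yet */
	return true;			/* do the NOHZ_STATS_KICK pass here, and
					 * kick_ilb(NOHZ_STATS_KICK) if it cannot finish */
}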
9747 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
9748 | static inline void nohz_balancer_kick(struct rq *rq) { } | ||
9749 | |||
9750 | static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | ||
9751 | { | ||
9752 | return false; | ||
9753 | } | ||
9754 | |||
9755 | static inline void nohz_newidle_balance(struct rq *this_rq) { } | ||
9756 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
9757 | |||
9758 | /* | ||
9759 | * idle_balance is called by schedule() if this_cpu is about to become | ||
9760 | * idle. Attempts to pull tasks from other CPUs. | ||
9761 | */ | ||
9762 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) | ||
9763 | { | ||
9764 | unsigned long next_balance = jiffies + HZ; | ||
9765 | int this_cpu = this_rq->cpu; | ||
9766 | struct sched_domain *sd; | ||
9767 | int pulled_task = 0; | ||
9768 | u64 curr_cost = 0; | ||
9769 | |||
9770 | /* | ||
9771 | * We must set idle_stamp _before_ calling idle_balance(), such that we | ||
9772 | * measure the duration of idle_balance() as idle time. | ||
9773 | */ | ||
9774 | this_rq->idle_stamp = rq_clock(this_rq); | ||
9775 | |||
9776 | /* | ||
9777 | * Do not pull tasks towards !active CPUs... | ||
9778 | */ | ||
9779 | if (!cpu_active(this_cpu)) | ||
9780 | return 0; | ||
9781 | |||
9782 | /* | ||
9783 | * This is OK, because current is on_cpu, which avoids it being picked | ||
9784 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
9785 | * further scheduler activity on it and we're being very careful to | ||
9786 | * re-start the picking loop. | ||
9787 | */ | ||
9788 | rq_unpin_lock(this_rq, rf); | ||
9789 | |||
9790 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | ||
9791 | !this_rq->rd->overload) { | ||
9792 | |||
9793 | rcu_read_lock(); | ||
9794 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
9795 | if (sd) | ||
9796 | update_next_balance(sd, &next_balance); | ||
9797 | rcu_read_unlock(); | ||
9798 | |||
9799 | nohz_newidle_balance(this_rq); | ||
9800 | |||
9801 | goto out; | ||
9802 | } | ||
9803 | |||
9804 | raw_spin_unlock(&this_rq->lock); | ||
9805 | |||
9806 | update_blocked_averages(this_cpu); | ||
9807 | rcu_read_lock(); | ||
9808 | for_each_domain(this_cpu, sd) { | ||
9809 | int continue_balancing = 1; | ||
9810 | u64 t0, domain_cost; | ||
9811 | |||
9812 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
9813 | continue; | ||
9814 | |||
9815 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { | ||
9816 | update_next_balance(sd, &next_balance); | ||
9817 | break; | ||
9818 | } | ||
9819 | |||
9820 | if (sd->flags & SD_BALANCE_NEWIDLE) { | ||
9821 | t0 = sched_clock_cpu(this_cpu); | ||
9822 | |||
9823 | pulled_task = load_balance(this_cpu, this_rq, | ||
9824 | sd, CPU_NEWLY_IDLE, | ||
9825 | &continue_balancing); | ||
9826 | |||
9827 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
9828 | if (domain_cost > sd->max_newidle_lb_cost) | ||
9829 | sd->max_newidle_lb_cost = domain_cost; | ||
9830 | |||
9831 | curr_cost += domain_cost; | ||
9832 | } | ||
9833 | |||
9834 | update_next_balance(sd, &next_balance); | ||
9835 | |||
9836 | /* | ||
9837 | * Stop searching for tasks to pull if there are | ||
9838 | * now runnable tasks on this rq. | ||
9839 | */ | ||
9840 | if (pulled_task || this_rq->nr_running > 0) | ||
9841 | break; | ||
9842 | } | ||
9843 | rcu_read_unlock(); | ||
9844 | |||
9845 | raw_spin_lock(&this_rq->lock); | ||
9846 | |||
9847 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
9848 | this_rq->max_idle_balance_cost = curr_cost; | ||
9849 | |||
9850 | /* | ||
9851 | * While browsing the domains, we released the rq lock, a task could | ||
9852 | * have been enqueued in the meantime. Since we're not going idle, | ||
9853 | * pretend we pulled a task. | ||
9854 | */ | ||
9855 | if (this_rq->cfs.h_nr_running && !pulled_task) | ||
9856 | pulled_task = 1; | ||
9857 | |||
9858 | out: | ||
9859 | /* Move the next balance forward */ | ||
9860 | if (time_after(this_rq->next_balance, next_balance)) | ||
9861 | this_rq->next_balance = next_balance; | ||
9862 | |||
9863 | /* Is there a task of a high priority class? */ | ||
9864 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) | ||
9865 | pulled_task = -1; | ||
9866 | |||
9867 | if (pulled_task) | ||
9868 | this_rq->idle_stamp = 0; | ||
9869 | |||
9870 | rq_repin_lock(this_rq, rf); | ||
9871 | |||
9872 | return pulled_task; | ||
9381 | } | 9873 | } |
9382 | #else | ||
9383 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | ||
9384 | #endif | ||
9385 | 9874 | ||
9386 | /* | 9875 | /* |
9387 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 9876 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
@@ -9394,14 +9883,18 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) | |||
9394 | CPU_IDLE : CPU_NOT_IDLE; | 9883 | CPU_IDLE : CPU_NOT_IDLE; |
9395 | 9884 | ||
9396 | /* | 9885 | /* |
9397 | * If this cpu has a pending nohz_balance_kick, then do the | 9886 | * If this CPU has a pending nohz_balance_kick, then do the |
9398 | * balancing on behalf of the other idle cpus whose ticks are | 9887 | * balancing on behalf of the other idle CPUs whose ticks are |
9399 | * stopped. Do nohz_idle_balance *before* rebalance_domains to | 9888 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
9400 | * give the idle cpus a chance to load balance. Else we may | 9889 | * give the idle CPUs a chance to load balance. Else we may |
9401 | * load balance only within the local sched_domain hierarchy | 9890 | * load balance only within the local sched_domain hierarchy |
9402 | * and abort nohz_idle_balance altogether if we pull some load. | 9891 | * and abort nohz_idle_balance altogether if we pull some load. |
9403 | */ | 9892 | */ |
9404 | nohz_idle_balance(this_rq, idle); | 9893 | if (nohz_idle_balance(this_rq, idle)) |
9894 | return; | ||
9895 | |||
9896 | /* normal load balance */ | ||
9897 | update_blocked_averages(this_rq->cpu); | ||
9405 | rebalance_domains(this_rq, idle); | 9898 | rebalance_domains(this_rq, idle); |
9406 | } | 9899 | } |
9407 | 9900 | ||
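The softirq handler now short-circuits when the nohz pass already balanced on behalf of this CPU, and the update_blocked_averages() call that used to live inside rebalance_domains() happens here instead. A stub-level sketch of the resulting flow, with names echoing the diff and bodies reduced to no-ops.

#include <stdbool.h>

enum cpu_idle_type_model { MODEL_CPU_IDLE, MODEL_CPU_NOT_IDLE };

static bool nohz_idle_balance_stub(int cpu, enum cpu_idle_type_model idle) { return false; }
static void update_blocked_averages_stub(int cpu) { }
static void rebalance_domains_stub(int cpu, enum cpu_idle_type_model idle) { }

void run_rebalance_domains_model(int this_cpu, bool idle_at_tick)
{
	enum cpu_idle_type_model idle = idle_at_tick ? MODEL_CPU_IDLE : MODEL_CPU_NOT_IDLE;

	if (nohz_idle_balance_stub(this_cpu, idle))
		return;				/* already balanced on behalf of the idle CPUs */

	update_blocked_averages_stub(this_cpu);	/* moved out of rebalance_domains() */
	rebalance_domains_stub(this_cpu, idle);
}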
@@ -9416,10 +9909,8 @@ void trigger_load_balance(struct rq *rq) | |||
9416 | 9909 | ||
9417 | if (time_after_eq(jiffies, rq->next_balance)) | 9910 | if (time_after_eq(jiffies, rq->next_balance)) |
9418 | raise_softirq(SCHED_SOFTIRQ); | 9911 | raise_softirq(SCHED_SOFTIRQ); |
9419 | #ifdef CONFIG_NO_HZ_COMMON | 9912 | |
9420 | if (nohz_kick_needed(rq)) | 9913 | nohz_balancer_kick(rq); |
9421 | nohz_balancer_kick(); | ||
9422 | #endif | ||
9423 | } | 9914 | } |
9424 | 9915 | ||
9425 | static void rq_online_fair(struct rq *rq) | 9916 | static void rq_online_fair(struct rq *rq) |
@@ -9440,7 +9931,12 @@ static void rq_offline_fair(struct rq *rq) | |||
9440 | #endif /* CONFIG_SMP */ | 9931 | #endif /* CONFIG_SMP */ |
9441 | 9932 | ||
9442 | /* | 9933 | /* |
9443 | * scheduler tick hitting a task of our scheduling class: | 9934 | * scheduler tick hitting a task of our scheduling class. |
9935 | * | ||
9936 | * NOTE: This function can be called remotely by the tick offload that | ||
9937 | * goes along full dynticks. Therefore no local assumption can be made | ||
9938 | * and everything must be accessed through the @rq and @curr passed in | ||
9939 | * parameters. | ||
9444 | */ | 9940 | */ |
9445 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | 9941 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
9446 | { | 9942 | { |
@@ -9591,7 +10087,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) | |||
9591 | 10087 | ||
9592 | /* Synchronize entity with its cfs_rq */ | 10088 | /* Synchronize entity with its cfs_rq */ |
9593 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); | 10089 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); |
9594 | attach_entity_load_avg(cfs_rq, se); | 10090 | attach_entity_load_avg(cfs_rq, se, 0); |
9595 | update_tg_load_avg(cfs_rq, false); | 10091 | update_tg_load_avg(cfs_rq, false); |
9596 | propagate_entity_cfs_rq(se); | 10092 | propagate_entity_cfs_rq(se); |
9597 | } | 10093 | } |
@@ -9993,6 +10489,7 @@ __init void init_sched_fair_class(void) | |||
9993 | 10489 | ||
9994 | #ifdef CONFIG_NO_HZ_COMMON | 10490 | #ifdef CONFIG_NO_HZ_COMMON |
9995 | nohz.next_balance = jiffies; | 10491 | nohz.next_balance = jiffies; |
10492 | nohz.next_blocked = jiffies; | ||
9996 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 10493 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
9997 | #endif | 10494 | #endif |
9998 | #endif /* SMP */ | 10495 | #endif /* SMP */ |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..85ae8488039c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -85,3 +85,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) | |||
85 | SCHED_FEAT(WA_IDLE, true) | 85 | SCHED_FEAT(WA_IDLE, true) |
86 | SCHED_FEAT(WA_WEIGHT, true) | 86 | SCHED_FEAT(WA_WEIGHT, true) |
87 | SCHED_FEAT(WA_BIAS, true) | 87 | SCHED_FEAT(WA_BIAS, true) |
88 | |||
89 | /* | ||
90 | * UtilEstimation. Use estimated CPU utilization. | ||
91 | */ | ||
92 | SCHED_FEAT(UTIL_EST, true) | ||
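UTIL_EST gates the estimated-utilization signal added elsewhere in this series. The sketch below is a simplified userspace rendering of how such a feature bit is typically consumed; that the kernel builds an enum and a sched_feat() test from this header, and that the series uses util_est as a floor on top of util_avg, are assumptions of the sketch rather than something visible in this hunk.

#include <stdbool.h>

enum { MODEL_FEAT_UTIL_EST = 1u << 0 };

static unsigned int model_sched_features = MODEL_FEAT_UTIL_EST;	/* default: true */

static inline bool sched_feat_util_est(void)
{
	return model_sched_features & MODEL_FEAT_UTIL_EST;
}

unsigned long cpu_util_model(unsigned long util_avg, unsigned long util_est)
{
	if (sched_feat_util_est())
		return util_est > util_avg ? util_est : util_avg;	/* estimate acts as a floor */
	return util_avg;
}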
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb8c042..2975f195e1c4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -1,23 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * Generic entry point for the idle threads | 2 | * Generic entry points for the idle threads and |
3 | * implementation of the idle task scheduling class. | ||
4 | * | ||
5 | * (NOTE: these are not related to SCHED_IDLE batch scheduled | ||
6 | * tasks which are handled in sched/fair.c ) | ||
3 | */ | 7 | */ |
4 | #include <linux/sched.h> | 8 | #include "sched.h" |
5 | #include <linux/sched/idle.h> | ||
6 | #include <linux/cpu.h> | ||
7 | #include <linux/cpuidle.h> | ||
8 | #include <linux/cpuhotplug.h> | ||
9 | #include <linux/tick.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/stackprotector.h> | ||
12 | #include <linux/suspend.h> | ||
13 | #include <linux/livepatch.h> | ||
14 | |||
15 | #include <asm/tlb.h> | ||
16 | 9 | ||
17 | #include <trace/events/power.h> | 10 | #include <trace/events/power.h> |
18 | 11 | ||
19 | #include "sched.h" | ||
20 | |||
21 | /* Linker adds these: start and end of __cpuidle functions */ | 12 | /* Linker adds these: start and end of __cpuidle functions */ |
22 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | 13 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; |
23 | 14 | ||
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) | |||
46 | static int __init cpu_idle_poll_setup(char *__unused) | 37 | static int __init cpu_idle_poll_setup(char *__unused) |
47 | { | 38 | { |
48 | cpu_idle_force_poll = 1; | 39 | cpu_idle_force_poll = 1; |
40 | |||
49 | return 1; | 41 | return 1; |
50 | } | 42 | } |
51 | __setup("nohlt", cpu_idle_poll_setup); | 43 | __setup("nohlt", cpu_idle_poll_setup); |
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); | |||
53 | static int __init cpu_idle_nopoll_setup(char *__unused) | 45 | static int __init cpu_idle_nopoll_setup(char *__unused) |
54 | { | 46 | { |
55 | cpu_idle_force_poll = 0; | 47 | cpu_idle_force_poll = 0; |
48 | |||
56 | return 1; | 49 | return 1; |
57 | } | 50 | } |
58 | __setup("hlt", cpu_idle_nopoll_setup); | 51 | __setup("hlt", cpu_idle_nopoll_setup); |
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) | |||
64 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 57 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
65 | local_irq_enable(); | 58 | local_irq_enable(); |
66 | stop_critical_timings(); | 59 | stop_critical_timings(); |
60 | |||
67 | while (!tif_need_resched() && | 61 | while (!tif_need_resched() && |
68 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | 62 | (cpu_idle_force_poll || tick_check_broadcast_expired())) |
69 | cpu_relax(); | 63 | cpu_relax(); |
70 | start_critical_timings(); | 64 | start_critical_timings(); |
71 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 65 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
72 | rcu_idle_exit(); | 66 | rcu_idle_exit(); |
67 | |||
73 | return 1; | 68 | return 1; |
74 | } | 69 | } |
75 | 70 | ||
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
332 | { | 327 | { |
333 | /* | 328 | /* |
334 | * This #ifdef needs to die, but it's too late in the cycle to | 329 | * This #ifdef needs to die, but it's too late in the cycle to |
335 | * make this generic (arm and sh have never invoked the canary | 330 | * make this generic (ARM and SH have never invoked the canary |
336 | * init for the non boot cpus!). Will be fixed in 3.11 | 331 | * init for the non boot CPUs!). Will be fixed in 3.11 |
337 | */ | 332 | */ |
338 | #ifdef CONFIG_X86 | 333 | #ifdef CONFIG_X86 |
339 | /* | 334 | /* |
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
350 | while (1) | 345 | while (1) |
351 | do_idle(); | 346 | do_idle(); |
352 | } | 347 | } |
348 | |||
349 | /* | ||
350 | * idle-task scheduling class. | ||
351 | */ | ||
352 | |||
353 | #ifdef CONFIG_SMP | ||
354 | static int | ||
355 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
356 | { | ||
357 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
358 | } | ||
359 | #endif | ||
360 | |||
361 | /* | ||
362 | * Idle tasks are unconditionally rescheduled: | ||
363 | */ | ||
364 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
365 | { | ||
366 | resched_curr(rq); | ||
367 | } | ||
368 | |||
369 | static struct task_struct * | ||
370 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
371 | { | ||
372 | put_prev_task(rq, prev); | ||
373 | update_idle_core(rq); | ||
374 | schedstat_inc(rq->sched_goidle); | ||
375 | |||
376 | return rq->idle; | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * It is not legal to sleep in the idle task - print a warning | ||
381 | * message if some code attempts to do it: | ||
382 | */ | ||
383 | static void | ||
384 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
385 | { | ||
386 | raw_spin_unlock_irq(&rq->lock); | ||
387 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
388 | dump_stack(); | ||
389 | raw_spin_lock_irq(&rq->lock); | ||
390 | } | ||
391 | |||
392 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
393 | { | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * scheduler tick hitting a task of our scheduling class. | ||
398 | * | ||
399 | * NOTE: This function can be called remotely by the tick offload that | ||
400 | * goes along full dynticks. Therefore no local assumption can be made | ||
401 | * and everything must be accessed through the @rq and @curr passed in | ||
402 | * parameters. | ||
403 | */ | ||
404 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
405 | { | ||
406 | } | ||
407 | |||
408 | static void set_curr_task_idle(struct rq *rq) | ||
409 | { | ||
410 | } | ||
411 | |||
412 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
413 | { | ||
414 | BUG(); | ||
415 | } | ||
416 | |||
417 | static void | ||
418 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
419 | { | ||
420 | BUG(); | ||
421 | } | ||
422 | |||
423 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
424 | { | ||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | static void update_curr_idle(struct rq *rq) | ||
429 | { | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
434 | */ | ||
435 | const struct sched_class idle_sched_class = { | ||
436 | /* .next is NULL */ | ||
437 | /* no enqueue/yield_task for idle tasks */ | ||
438 | |||
439 | /* dequeue is not valid, we print a debug message there: */ | ||
440 | .dequeue_task = dequeue_task_idle, | ||
441 | |||
442 | .check_preempt_curr = check_preempt_curr_idle, | ||
443 | |||
444 | .pick_next_task = pick_next_task_idle, | ||
445 | .put_prev_task = put_prev_task_idle, | ||
446 | |||
447 | #ifdef CONFIG_SMP | ||
448 | .select_task_rq = select_task_rq_idle, | ||
449 | .set_cpus_allowed = set_cpus_allowed_common, | ||
450 | #endif | ||
451 | |||
452 | .set_curr_task = set_curr_task_idle, | ||
453 | .task_tick = task_tick_idle, | ||
454 | |||
455 | .get_rr_interval = get_rr_interval_idle, | ||
456 | |||
457 | .prio_changed = prio_changed_idle, | ||
458 | .switched_to = switched_to_idle, | ||
459 | .update_curr = update_curr_idle, | ||
460 | }; | ||
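
The new idle_sched_class above is a table of function pointers that the core scheduler only ever calls through, never directly. The following standalone C sketch models that dispatch pattern under assumptions: every name (toy_class, toy_rq, toy_pick_next_idle, "swapper/0") is hypothetical and only illustrates how a class whose tick handler is empty and whose pick_next always returns the per-CPU idle task plugs into such a vtable; it is not kernel code.

    /*
     * Hedged sketch: a userspace model of the vtable pattern used by
     * idle_sched_class. All names here are invented for illustration.
     */
    #include <stdio.h>

    struct toy_rq;                          /* stands in for struct rq */

    struct toy_class {
            /* pick the task this class wants to run next */
            const char *(*pick_next_task)(struct toy_rq *rq);
            /* periodic tick; the idle class leaves this empty */
            void (*task_tick)(struct toy_rq *rq);
    };

    struct toy_rq {
            const struct toy_class *curr_class;
            const char *idle_task_name;
    };

    static const char *toy_pick_next_idle(struct toy_rq *rq)
    {
            /* mirrors pick_next_task_idle(): always hand back rq->idle */
            return rq->idle_task_name;
    }

    static void toy_task_tick_idle(struct toy_rq *rq)
    {
            /* mirrors task_tick_idle(): nothing to account for while idle */
            (void)rq;
    }

    static const struct toy_class toy_idle_class = {
            .pick_next_task = toy_pick_next_idle,
            .task_tick      = toy_task_tick_idle,
    };

    int main(void)
    {
            struct toy_rq rq = { .curr_class = &toy_idle_class,
                                 .idle_task_name = "swapper/0" };

            /* the core "scheduler" only ever calls through the vtable */
            printf("next: %s\n", rq.curr_class->pick_next_task(&rq));
            rq.curr_class->task_tick(&rq);
            return 0;
    }

The empty task_tick_idle() also illustrates the remote-tick note in the hunk: since the handler touches nothing but its arguments, it is safe to invoke from another CPU.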
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index d518664cce4f..000000000000 --- a/kernel/sched/idle_task.c +++ /dev/null | |||
@@ -1,110 +0,0 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include "sched.h" | ||
3 | |||
4 | /* | ||
5 | * idle-task scheduling class. | ||
6 | * | ||
7 | * (NOTE: these are not related to SCHED_IDLE tasks which are | ||
8 | * handled in sched/fair.c) | ||
9 | */ | ||
10 | |||
11 | #ifdef CONFIG_SMP | ||
12 | static int | ||
13 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
14 | { | ||
15 | return task_cpu(p); /* IDLE tasks as never migrated */ | ||
16 | } | ||
17 | #endif /* CONFIG_SMP */ | ||
18 | |||
19 | /* | ||
20 | * Idle tasks are unconditionally rescheduled: | ||
21 | */ | ||
22 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
23 | { | ||
24 | resched_curr(rq); | ||
25 | } | ||
26 | |||
27 | static struct task_struct * | ||
28 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
29 | { | ||
30 | put_prev_task(rq, prev); | ||
31 | update_idle_core(rq); | ||
32 | schedstat_inc(rq->sched_goidle); | ||
33 | return rq->idle; | ||
34 | } | ||
35 | |||
36 | /* | ||
37 | * It is not legal to sleep in the idle task - print a warning | ||
38 | * message if some code attempts to do it: | ||
39 | */ | ||
40 | static void | ||
41 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
42 | { | ||
43 | raw_spin_unlock_irq(&rq->lock); | ||
44 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
45 | dump_stack(); | ||
46 | raw_spin_lock_irq(&rq->lock); | ||
47 | } | ||
48 | |||
49 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
50 | { | ||
51 | rq_last_tick_reset(rq); | ||
52 | } | ||
53 | |||
54 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
55 | { | ||
56 | } | ||
57 | |||
58 | static void set_curr_task_idle(struct rq *rq) | ||
59 | { | ||
60 | } | ||
61 | |||
62 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
63 | { | ||
64 | BUG(); | ||
65 | } | ||
66 | |||
67 | static void | ||
68 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
69 | { | ||
70 | BUG(); | ||
71 | } | ||
72 | |||
73 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
74 | { | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | static void update_curr_idle(struct rq *rq) | ||
79 | { | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
84 | */ | ||
85 | const struct sched_class idle_sched_class = { | ||
86 | /* .next is NULL */ | ||
87 | /* no enqueue/yield_task for idle tasks */ | ||
88 | |||
89 | /* dequeue is not valid, we print a debug message there: */ | ||
90 | .dequeue_task = dequeue_task_idle, | ||
91 | |||
92 | .check_preempt_curr = check_preempt_curr_idle, | ||
93 | |||
94 | .pick_next_task = pick_next_task_idle, | ||
95 | .put_prev_task = put_prev_task_idle, | ||
96 | |||
97 | #ifdef CONFIG_SMP | ||
98 | .select_task_rq = select_task_rq_idle, | ||
99 | .set_cpus_allowed = set_cpus_allowed_common, | ||
100 | #endif | ||
101 | |||
102 | .set_curr_task = set_curr_task_idle, | ||
103 | .task_tick = task_tick_idle, | ||
104 | |||
105 | .get_rr_interval = get_rr_interval_idle, | ||
106 | |||
107 | .prio_changed = prio_changed_idle, | ||
108 | .switched_to = switched_to_idle, | ||
109 | .update_curr = update_curr_idle, | ||
110 | }; | ||
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
@@ -3,15 +3,10 @@ | |||
3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. | 3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. |
4 | * | 4 | * |
5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker | 5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker |
6 | * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker | ||
6 | * | 7 | * |
7 | */ | 8 | */ |
8 | 9 | #include "sched.h" | |
9 | #include <linux/sched/isolation.h> | ||
10 | #include <linux/tick.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/static_key.h> | ||
14 | #include <linux/ctype.h> | ||
15 | 10 | ||
16 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); | 11 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); |
17 | EXPORT_SYMBOL_GPL(housekeeping_overriden); | 12 | EXPORT_SYMBOL_GPL(housekeeping_overriden); |
@@ -60,6 +55,9 @@ void __init housekeeping_init(void) | |||
60 | 55 | ||
61 | static_branch_enable(&housekeeping_overriden); | 56 | static_branch_enable(&housekeeping_overriden); |
62 | 57 | ||
58 | if (housekeeping_flags & HK_FLAG_TICK) | ||
59 | sched_tick_offload_init(); | ||
60 | |||
63 | /* We need at least one CPU to handle housekeeping work */ | 61 | /* We need at least one CPU to handle housekeeping work */ |
64 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); | 62 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); |
65 | } | 63 | } |
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) | |||
119 | { | 117 | { |
120 | unsigned int flags; | 118 | unsigned int flags; |
121 | 119 | ||
122 | flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; | 120 | flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; |
123 | 121 | ||
124 | return housekeeping_setup(str, flags); | 122 | return housekeeping_setup(str, flags); |
125 | } | 123 | } |
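
The isolation.c hunks gate the new 1Hz tick offload on a housekeeping flag word and OR an extra HK_FLAG_WQ bit into the nohz_full setup path. Below is a minimal sketch of that flag-test pattern; the bit values and all toy_* names are made up for illustration (the real flags live in <linux/sched/isolation.h>), so treat it as a model rather than the kernel's implementation.

    /* Hedged sketch of the flag check added in housekeeping_init(). */
    #include <stdio.h>

    enum {
            TOY_HK_TICK  = 1 << 0,
            TOY_HK_WQ    = 1 << 1,
            TOY_HK_TIMER = 1 << 2,
            TOY_HK_RCU   = 1 << 3,
            TOY_HK_MISC  = 1 << 4,
    };

    static unsigned int toy_housekeeping_flags;

    static void toy_tick_offload_init(void)
    {
            puts("1Hz tick offload initialized");
    }

    static void toy_housekeeping_init(unsigned int flags)
    {
            toy_housekeeping_flags = flags;

            /* mirrors: if (housekeeping_flags & HK_FLAG_TICK)
             *                  sched_tick_offload_init();          */
            if (toy_housekeeping_flags & TOY_HK_TICK)
                    toy_tick_offload_init();
    }

    int main(void)
    {
            /* the nohz_full= path now ORs TICK|WQ|TIMER|RCU|MISC together */
            toy_housekeeping_init(TOY_HK_TICK | TOY_HK_WQ | TOY_HK_TIMER |
                                  TOY_HK_RCU  | TOY_HK_MISC);
            return 0;
    }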
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e4d758..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
@@ -6,10 +6,6 @@ | |||
6 | * figure. It's a silly number but people think it's important. We go through | 6 | * figure. It's a silly number but people think it's important. We go through |
7 | * great pains to make it work on big machines and tickless kernels. | 7 | * great pains to make it work on big machines and tickless kernels. |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/export.h> | ||
11 | #include <linux/sched/loadavg.h> | ||
12 | |||
13 | #include "sched.h" | 9 | #include "sched.h" |
14 | 10 | ||
15 | /* | 11 | /* |
@@ -32,29 +28,29 @@ | |||
32 | * Due to a number of reasons the above turns into the mess below: | 28 | * Due to a number of reasons the above turns into the mess below: |
33 | * | 29 | * |
34 | * - for_each_possible_cpu() is prohibitively expensive on machines with | 30 | * - for_each_possible_cpu() is prohibitively expensive on machines with |
35 | * serious number of cpus, therefore we need to take a distributed approach | 31 | * serious number of CPUs, therefore we need to take a distributed approach |
36 | * to calculating nr_active. | 32 | * to calculating nr_active. |
37 | * | 33 | * |
38 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | 34 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 |
39 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | 35 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } |
40 | * | 36 | * |
41 | * So assuming nr_active := 0 when we start out -- true per definition, we | 37 | * So assuming nr_active := 0 when we start out -- true per definition, we |
42 | * can simply take per-cpu deltas and fold those into a global accumulate | 38 | * can simply take per-CPU deltas and fold those into a global accumulate |
43 | * to obtain the same result. See calc_load_fold_active(). | 39 | * to obtain the same result. See calc_load_fold_active(). |
44 | * | 40 | * |
45 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | 41 | * Furthermore, in order to avoid synchronizing all per-CPU delta folding |
46 | * across the machine, we assume 10 ticks is sufficient time for every | 42 | * across the machine, we assume 10 ticks is sufficient time for every |
47 | * cpu to have completed this task. | 43 | * CPU to have completed this task. |
48 | * | 44 | * |
49 | * This places an upper-bound on the IRQ-off latency of the machine. Then | 45 | * This places an upper-bound on the IRQ-off latency of the machine. Then |
50 | * again, being late doesn't lose the delta, just wrecks the sample. | 46 | * again, being late doesn't lose the delta, just wrecks the sample. |
51 | * | 47 | * |
52 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | 48 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because |
53 | * this would add another cross-cpu cacheline miss and atomic operation | 49 | * this would add another cross-CPU cacheline miss and atomic operation |
54 | * to the wakeup path. Instead we increment on whatever cpu the task ran | 50 | * to the wakeup path. Instead we increment on whatever CPU the task ran |
55 | * when it went into uninterruptible state and decrement on whatever cpu | 51 | * when it went into uninterruptible state and decrement on whatever CPU |
56 | * did the wakeup. This means that only the sum of nr_uninterruptible over | 52 | * did the wakeup. This means that only the sum of nr_uninterruptible over |
57 | * all cpus yields the correct result. | 53 | * all CPUs yields the correct result. |
58 | * | 54 | * |
59 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | 55 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. |
60 | */ | 56 | */ |
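
The comment block above describes the distributed nr_active scheme: each CPU remembers what it last reported and folds only the delta into one global counter, so readers never have to walk for_each_possible_cpu(). Here is a short single-threaded C model of that folding, assuming invented names (toy_fold_active, toy_calc_load_tasks) and a fixed CPU count; it is an illustration of the idea, not the kernel's calc_load_fold_active().

    /* Hedged sketch of per-CPU delta folding into a global accumulate. */
    #include <stdio.h>

    #define TOY_NR_CPUS 4

    static long toy_last_reported[TOY_NR_CPUS];   /* x_i(t_{j-1}) per CPU */
    static long toy_calc_load_tasks;              /* global accumulate    */

    /* analogue of calc_load_fold_active(): return this CPU's delta */
    static long toy_fold_active(int cpu, long nr_active_now)
    {
            long delta = nr_active_now - toy_last_reported[cpu];

            toy_last_reported[cpu] = nr_active_now;
            return delta;
    }

    int main(void)
    {
            /* pretend each CPU samples its own nr_active on its own tick */
            long samples[TOY_NR_CPUS] = { 3, 1, 0, 2 };

            for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
                    toy_calc_load_tasks += toy_fold_active(cpu, samples[cpu]);

            /* \Sum_i x_i(t) recovered without scanning every CPU at once */
            printf("global nr_active = %ld\n", toy_calc_load_tasks);
            return 0;
    }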
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
115 | * Handle NO_HZ for the global load-average. | 111 | * Handle NO_HZ for the global load-average. |
116 | * | 112 | * |
117 | * Since the above described distributed algorithm to compute the global | 113 | * Since the above described distributed algorithm to compute the global |
118 | * load-average relies on per-cpu sampling from the tick, it is affected by | 114 | * load-average relies on per-CPU sampling from the tick, it is affected by |
119 | * NO_HZ. | 115 | * NO_HZ. |
120 | * | 116 | * |
121 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon | 117 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon |
122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | 118 | * entering NO_HZ state such that we can include this as an 'extra' CPU delta |
123 | * when we read the global state. | 119 | * when we read the global state. |
124 | * | 120 | * |
125 | * Obviously reality has to ruin such a delightfully simple scheme: | 121 | * Obviously reality has to ruin such a delightfully simple scheme: |
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
146 | * busy state. | 142 | * busy state. |
147 | * | 143 | * |
148 | * This is solved by pushing the window forward, and thus skipping the | 144 | * This is solved by pushing the window forward, and thus skipping the |
149 | * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which | 145 | * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which |
150 | * was in effect at the time the window opened). This also solves the issue | 146 | * was in effect at the time the window opened). This also solves the issue |
151 | * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ | 147 | * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ |
152 | * intervals. | 148 | * intervals. |
153 | * | 149 | * |
154 | * When making the ILB scale, we should try to pull this in as well. | 150 | * When making the ILB scale, we should try to pull this in as well. |
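
The NO_HZ discussion above adds one wrinkle to the folding scheme: a CPU that stops its tick parks its pending delta in a separate accumulator, and the global reader picks that accumulator up exactly once. The sketch below models that hand-off with invented names (toy_calc_load_nohz_start, toy_calc_global_nr_active) and no locking, purely to show the data flow the comment describes.

    /* Hedged sketch of the NO_HZ-delta fold described above. */
    #include <stdio.h>

    static long toy_calc_load_tasks;   /* folded by ticking CPUs             */
    static long toy_calc_load_nohz;    /* folded by CPUs entering NO_HZ idle */

    /* CPU side: called right before the tick is stopped */
    static void toy_calc_load_nohz_start(long pending_delta)
    {
            toy_calc_load_nohz += pending_delta;
    }

    /* reader side: include the 'extra' CPU delta exactly once */
    static long toy_calc_global_nr_active(void)
    {
            long delta = toy_calc_load_nohz;

            toy_calc_load_nohz = 0;
            toy_calc_load_tasks += delta;
            return toy_calc_load_tasks;
    }

    int main(void)
    {
            toy_calc_load_tasks = 5;     /* deltas already folded from busy CPUs */
            toy_calc_load_nohz_start(2); /* a CPU stops its tick with 2 runnable */

            printf("global nr_active = %ld\n", toy_calc_global_nr_active());
            return 0;
    }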
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
299 | } | 295 | } |
300 | 296 | ||
301 | /* | 297 | /* |
302 | * NO_HZ can leave us missing all per-cpu ticks calling | 298 | * NO_HZ can leave us missing all per-CPU ticks calling |
303 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into | 299 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into |
304 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold | 300 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold |
305 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. | 301 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. |
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) | |||
363 | return; | 359 | return; |
364 | 360 | ||
365 | /* | 361 | /* |
366 | * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. | 362 | * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. |
367 | */ | 363 | */ |
368 | delta = calc_load_nohz_fold(); | 364 | delta = calc_load_nohz_fold(); |
369 | if (delta) | 365 | if (delta) |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5d0762633639..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
@@ -13,32 +13,25 @@ | |||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
15 | */ | 15 | */ |
16 | 16 | #include "sched.h" | |
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/membarrier.h> | ||
19 | #include <linux/tick.h> | ||
20 | #include <linux/cpumask.h> | ||
21 | #include <linux/atomic.h> | ||
22 | |||
23 | #include "sched.h" /* for cpu_rq(). */ | ||
24 | 17 | ||
25 | /* | 18 | /* |
26 | * Bitmask made from an "or" of all commands within enum membarrier_cmd, | 19 | * Bitmask made from an "or" of all commands within enum membarrier_cmd, |
27 | * except MEMBARRIER_CMD_QUERY. | 20 | * except MEMBARRIER_CMD_QUERY. |
28 | */ | 21 | */ |
29 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE | 22 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE |
30 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ | 23 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ |
31 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ | 24 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ |
32 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) | 25 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) |
33 | #else | 26 | #else |
34 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 | 27 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 |
35 | #endif | 28 | #endif |
36 | 29 | ||
37 | #define MEMBARRIER_CMD_BITMASK \ | 30 | #define MEMBARRIER_CMD_BITMASK \ |
38 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | 31 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ |
39 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | 32 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ |
40 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | 33 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ |
41 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | 34 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ |
42 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) | 35 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) |
43 | 36 | ||
44 | static void ipi_mb(void *info) | 37 | static void ipi_mb(void *info) |
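
The MEMBARRIER_CMD_BITMASK built in this hunk packs every supported command (except QUERY) into one word, so support can be advertised and membership tested with a single AND. A small sketch of that usage follows; the command values and the toy_membarrier() behaviour are assumptions for illustration only, the real definitions come from <uapi/linux/membarrier.h> and sys_membarrier().

    /* Hedged sketch of what a command bitmask is good for. */
    #include <stdio.h>

    enum {
            TOY_CMD_QUERY             = 0,
            TOY_CMD_GLOBAL            = 1 << 0,
            TOY_CMD_GLOBAL_EXPEDITED  = 1 << 1,
            TOY_CMD_PRIVATE_EXPEDITED = 1 << 2,
    };

    /* everything except QUERY, mirroring how the bitmask above is built */
    #define TOY_CMD_BITMASK \
            (TOY_CMD_GLOBAL | TOY_CMD_GLOBAL_EXPEDITED | TOY_CMD_PRIVATE_EXPEDITED)

    static int toy_membarrier(int cmd)
    {
            if (cmd == TOY_CMD_QUERY)
                    return TOY_CMD_BITMASK;  /* report what is supported */
            if (!(cmd & TOY_CMD_BITMASK))
                    return -1;               /* unknown command */
            return 0;                        /* pretend the barrier was issued */
    }

    int main(void)
    {
            printf("supported mask: %#x\n", (unsigned)toy_membarrier(TOY_CMD_QUERY));
            printf("private expedited -> %d\n",
                   toy_membarrier(TOY_CMD_PRIVATE_EXPEDITED));
            return 0;
    }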
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void) | |||
85 | */ | 78 | */ |
86 | if (cpu == raw_smp_processor_id()) | 79 | if (cpu == raw_smp_processor_id()) |
87 | continue; | 80 | continue; |
81 | |||
88 | rcu_read_lock(); | 82 | rcu_read_lock(); |
89 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | 83 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); |
90 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & | 84 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & |
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags) | |||
188 | * rq->curr modification in scheduler. | 182 | * rq->curr modification in scheduler. |
189 | */ | 183 | */ |
190 | smp_mb(); /* exit from system call is not a mb */ | 184 | smp_mb(); /* exit from system call is not a mb */ |
185 | |||
191 | return 0; | 186 | return 0; |
192 | } | 187 | } |
193 | 188 | ||
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void) | |||
219 | } | 214 | } |
220 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, | 215 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, |
221 | &mm->membarrier_state); | 216 | &mm->membarrier_state); |
217 | |||
222 | return 0; | 218 | return 0; |
223 | } | 219 | } |
224 | 220 | ||
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags) | |||
253 | synchronize_sched(); | 249 | synchronize_sched(); |
254 | } | 250 | } |
255 | atomic_or(state, &mm->membarrier_state); | 251 | atomic_or(state, &mm->membarrier_state); |
252 | |||
256 | return 0; | 253 | return 0; |
257 | } | 254 | } |
258 | 255 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451584e..86b77987435e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -3,12 +3,8 @@ | |||
3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR | 3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR |
4 | * policies) | 4 | * policies) |
5 | */ | 5 | */ |
6 | |||
7 | #include "sched.h" | 6 | #include "sched.h" |
8 | 7 | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/irq_work.h> | ||
11 | |||
12 | int sched_rr_timeslice = RR_TIMESLICE; | 8 | int sched_rr_timeslice = RR_TIMESLICE; |
13 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | 9 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; |
14 | 10 | ||
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); | |||
359 | static void push_rt_tasks(struct rq *); | 355 | static void push_rt_tasks(struct rq *); |
360 | static void pull_rt_task(struct rq *); | 356 | static void pull_rt_task(struct rq *); |
361 | 357 | ||
362 | static inline void queue_push_tasks(struct rq *rq) | 358 | static inline void rt_queue_push_tasks(struct rq *rq) |
363 | { | 359 | { |
364 | if (!has_pushable_tasks(rq)) | 360 | if (!has_pushable_tasks(rq)) |
365 | return; | 361 | return; |
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
367 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); | 363 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); |
368 | } | 364 | } |
369 | 365 | ||
370 | static inline void queue_pull_task(struct rq *rq) | 366 | static inline void rt_queue_pull_task(struct rq *rq) |
371 | { | 367 | { |
372 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); | 368 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); |
373 | } | 369 | } |
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) | |||
425 | { | 421 | { |
426 | } | 422 | } |
427 | 423 | ||
428 | static inline void queue_push_tasks(struct rq *rq) | 424 | static inline void rt_queue_push_tasks(struct rq *rq) |
429 | { | 425 | { |
430 | } | 426 | } |
431 | #endif /* CONFIG_SMP */ | 427 | #endif /* CONFIG_SMP */ |
@@ -961,9 +957,6 @@ static void update_curr_rt(struct rq *rq) | |||
961 | if (unlikely((s64)delta_exec <= 0)) | 957 | if (unlikely((s64)delta_exec <= 0)) |
962 | return; | 958 | return; |
963 | 959 | ||
964 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
965 | cpufreq_update_util(rq, SCHED_CPUFREQ_RT); | ||
966 | |||
967 | schedstat_set(curr->se.statistics.exec_max, | 960 | schedstat_set(curr->se.statistics.exec_max, |
968 | max(curr->se.statistics.exec_max, delta_exec)); | 961 | max(curr->se.statistics.exec_max, delta_exec)); |
969 | 962 | ||
@@ -1005,6 +998,9 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) | |||
1005 | 998 | ||
1006 | sub_nr_running(rq, rt_rq->rt_nr_running); | 999 | sub_nr_running(rq, rt_rq->rt_nr_running); |
1007 | rt_rq->rt_queued = 0; | 1000 | rt_rq->rt_queued = 0; |
1001 | |||
1002 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
1003 | cpufreq_update_util(rq, 0); | ||
1008 | } | 1004 | } |
1009 | 1005 | ||
1010 | static void | 1006 | static void |
@@ -1021,6 +1017,9 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) | |||
1021 | 1017 | ||
1022 | add_nr_running(rq, rt_rq->rt_nr_running); | 1018 | add_nr_running(rq, rt_rq->rt_nr_running); |
1023 | rt_rq->rt_queued = 1; | 1019 | rt_rq->rt_queued = 1; |
1020 | |||
1021 | /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ | ||
1022 | cpufreq_update_util(rq, 0); | ||
1024 | } | 1023 | } |
1025 | 1024 | ||
1026 | #if defined CONFIG_SMP | 1025 | #if defined CONFIG_SMP |
@@ -1453,9 +1452,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1453 | return; | 1452 | return; |
1454 | 1453 | ||
1455 | /* | 1454 | /* |
1456 | * There appears to be other cpus that can accept | 1455 | * There appear to be other CPUs that can accept |
1457 | * current and none to run 'p', so lets reschedule | 1456 | * the current task but none can run 'p', so let's reschedule |
1458 | * to try and push current away: | 1457 | * to try and push the current task away: |
1459 | */ | 1458 | */ |
1460 | requeue_task_rt(rq, p, 1); | 1459 | requeue_task_rt(rq, p, 1); |
1461 | resched_curr(rq); | 1460 | resched_curr(rq); |
@@ -1569,7 +1568,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1569 | /* The running task is never eligible for pushing */ | 1568 | /* The running task is never eligible for pushing */ |
1570 | dequeue_pushable_task(rq, p); | 1569 | dequeue_pushable_task(rq, p); |
1571 | 1570 | ||
1572 | queue_push_tasks(rq); | 1571 | rt_queue_push_tasks(rq); |
1573 | 1572 | ||
1574 | return p; | 1573 | return p; |
1575 | } | 1574 | } |
@@ -1596,12 +1595,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1596 | if (!task_running(rq, p) && | 1595 | if (!task_running(rq, p) && |
1597 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1596 | cpumask_test_cpu(cpu, &p->cpus_allowed)) |
1598 | return 1; | 1597 | return 1; |
1598 | |||
1599 | return 0; | 1599 | return 0; |
1600 | } | 1600 | } |
1601 | 1601 | ||
1602 | /* | 1602 | /* |
1603 | * Return the highest pushable rq's task, which is suitable to be executed | 1603 | * Return the highest pushable rq's task, which is suitable to be executed |
1604 | * on the cpu, NULL otherwise | 1604 | * on the CPU, NULL otherwise |
1605 | */ | 1605 | */ |
1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) | 1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) |
1607 | { | 1607 | { |
@@ -1639,11 +1639,11 @@ static int find_lowest_rq(struct task_struct *task) | |||
1639 | return -1; /* No targets found */ | 1639 | return -1; /* No targets found */ |
1640 | 1640 | ||
1641 | /* | 1641 | /* |
1642 | * At this point we have built a mask of cpus representing the | 1642 | * At this point we have built a mask of CPUs representing the |
1643 | * lowest priority tasks in the system. Now we want to elect | 1643 | * lowest priority tasks in the system. Now we want to elect |
1644 | * the best one based on our affinity and topology. | 1644 | * the best one based on our affinity and topology. |
1645 | * | 1645 | * |
1646 | * We prioritize the last cpu that the task executed on since | 1646 | * We prioritize the last CPU that the task executed on since |
1647 | * it is most likely cache-hot in that location. | 1647 | * it is most likely cache-hot in that location. |
1648 | */ | 1648 | */ |
1649 | if (cpumask_test_cpu(cpu, lowest_mask)) | 1649 | if (cpumask_test_cpu(cpu, lowest_mask)) |
@@ -1651,7 +1651,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1651 | 1651 | ||
1652 | /* | 1652 | /* |
1653 | * Otherwise, we consult the sched_domains span maps to figure | 1653 | * Otherwise, we consult the sched_domains span maps to figure |
1654 | * out which cpu is logically closest to our hot cache data. | 1654 | * out which CPU is logically closest to our hot cache data. |
1655 | */ | 1655 | */ |
1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
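
The comments in this hunk spell out find_lowest_rq()'s preference order: keep the task on its previous CPU if that CPU is still in the lowest-priority candidate mask (it is likely cache-hot), otherwise fall back to some other candidate. The sketch below models just that ordering; the bitmask type, helpers and CPU count are simplified stand-ins for cpumask_t and its accessors, not the kernel API.

    /* Hedged sketch of the cache-hot-first CPU preference described above. */
    #include <stdio.h>

    #define TOY_NR_CPUS 8

    static int toy_test_cpu(unsigned int mask, int cpu)
    {
            return (mask >> cpu) & 1;
    }

    static int toy_pick_target(unsigned int lowest_mask, int prev_cpu)
    {
            /* 1) prefer the last CPU the task ran on: likely cache-hot */
            if (toy_test_cpu(lowest_mask, prev_cpu))
                    return prev_cpu;

            /* 2) otherwise pick any CPU from the candidate mask */
            for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
                    if (toy_test_cpu(lowest_mask, cpu))
                            return cpu;

            return -1;                      /* no targets found */
    }

    int main(void)
    {
            unsigned int lowest_mask = 0x34;  /* CPUs 2, 4 and 5 are candidates */

            printf("prev=4 -> %d\n", toy_pick_target(lowest_mask, 4)); /* stays on 4 */
            printf("prev=0 -> %d\n", toy_pick_target(lowest_mask, 0)); /* falls back */
            return 0;
    }

The real function additionally consults the sched_domain span maps between steps 1 and 2 to find a topologically close CPU, which the sketch omits.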
@@ -1692,6 +1692,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1692 | cpu = cpumask_any(lowest_mask); | 1692 | cpu = cpumask_any(lowest_mask); |
1693 | if (cpu < nr_cpu_ids) | 1693 | if (cpu < nr_cpu_ids) |
1694 | return cpu; | 1694 | return cpu; |
1695 | |||
1695 | return -1; | 1696 | return -1; |
1696 | } | 1697 | } |
1697 | 1698 | ||
@@ -1827,7 +1828,7 @@ retry: | |||
1827 | * The task hasn't migrated, and is still the next | 1828 | * The task hasn't migrated, and is still the next |
1828 | * eligible task, but we failed to find a run-queue | 1829 | * eligible task, but we failed to find a run-queue |
1829 | * to push it to. Do not retry in this case, since | 1830 | * to push it to. Do not retry in this case, since |
1830 | * other cpus will pull from us when ready. | 1831 | * other CPUs will pull from us when ready. |
1831 | */ | 1832 | */ |
1832 | goto out; | 1833 | goto out; |
1833 | } | 1834 | } |
@@ -1919,7 +1920,7 @@ static int rto_next_cpu(struct root_domain *rd) | |||
1919 | * rt_next_cpu() will simply return the first CPU found in | 1920 | * rt_next_cpu() will simply return the first CPU found in |
1920 | * the rto_mask. | 1921 | * the rto_mask. |
1921 | * | 1922 | * |
1922 | * If rto_next_cpu() is called with rto_cpu is a valid cpu, it | 1923 | * If rto_next_cpu() is called when rto_cpu is a valid CPU, it |
1923 | * will return the next CPU found in the rto_mask. | 1924 | * will return the next CPU found in the rto_mask. |
1924 | * | 1925 | * |
1925 | * If there are no more CPUs left in the rto_mask, then a check is made | 1926 | * If there are no more CPUs left in the rto_mask, then a check is made |
@@ -1980,7 +1981,7 @@ static void tell_cpu_to_push(struct rq *rq) | |||
1980 | raw_spin_lock(&rq->rd->rto_lock); | 1981 | raw_spin_lock(&rq->rd->rto_lock); |
1981 | 1982 | ||
1982 | /* | 1983 | /* |
1983 | * The rto_cpu is updated under the lock, if it has a valid cpu | 1984 | * The rto_cpu is updated under the lock, if it has a valid CPU |
1984 | * then the IPI is still running and will continue due to the | 1985 | * then the IPI is still running and will continue due to the |
1985 | * update to loop_next, and nothing needs to be done here. | 1986 | * update to loop_next, and nothing needs to be done here. |
1986 | * Otherwise it is finishing up and an IPI needs to be sent. | 1987 | * Otherwise it is finishing up and an IPI needs to be sent. |
@@ -2105,7 +2106,7 @@ static void pull_rt_task(struct rq *this_rq) | |||
2105 | 2106 | ||
2106 | /* | 2107 | /* |
2107 | * There's a chance that p is higher in priority | 2108 | * There's a chance that p is higher in priority |
2108 | * than what's currently running on its cpu. | 2109 | * than what's currently running on its CPU. |
2109 | * This is just that p is waking up and hasn't | 2110 | * This is just that p is waking up and hasn't |
2110 | * had a chance to schedule. We only pull | 2111 | * had a chance to schedule. We only pull |
2111 | * p if it is lower in priority than the | 2112 | * p if it is lower in priority than the |
@@ -2187,7 +2188,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
2187 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) | 2188 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
2188 | return; | 2189 | return; |
2189 | 2190 | ||
2190 | queue_pull_task(rq); | 2191 | rt_queue_pull_task(rq); |
2191 | } | 2192 | } |
2192 | 2193 | ||
2193 | void __init init_sched_rt_class(void) | 2194 | void __init init_sched_rt_class(void) |
@@ -2218,7 +2219,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
2218 | if (task_on_rq_queued(p) && rq->curr != p) { | 2219 | if (task_on_rq_queued(p) && rq->curr != p) { |
2219 | #ifdef CONFIG_SMP | 2220 | #ifdef CONFIG_SMP |
2220 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) | 2221 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) |
2221 | queue_push_tasks(rq); | 2222 | rt_queue_push_tasks(rq); |
2222 | #endif /* CONFIG_SMP */ | 2223 | #endif /* CONFIG_SMP */ |
2223 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) | 2224 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) |
2224 | resched_curr(rq); | 2225 | resched_curr(rq); |
@@ -2242,7 +2243,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2242 | * may need to pull tasks to this runqueue. | 2243 | * may need to pull tasks to this runqueue. |
2243 | */ | 2244 | */ |
2244 | if (oldprio < p->prio) | 2245 | if (oldprio < p->prio) |
2245 | queue_pull_task(rq); | 2246 | rt_queue_pull_task(rq); |
2246 | 2247 | ||
2247 | /* | 2248 | /* |
2248 | * If there's a higher priority task waiting to run | 2249 | * If there's a higher priority task waiting to run |
@@ -2292,6 +2293,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
2292 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } | 2293 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } |
2293 | #endif | 2294 | #endif |
2294 | 2295 | ||
2296 | /* | ||
2297 | * scheduler tick hitting a task of our scheduling class. | ||
2298 | * | ||
2299 | * NOTE: This function can be called remotely by the tick offload that | ||
2300 | * goes along full dynticks. Therefore no local assumption can be made | ||
2301 | * and everything must be accessed through the @rq and @curr passed in | ||
2302 | * parameters. | ||
2303 | */ | ||
2295 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 2304 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
2296 | { | 2305 | { |
2297 | struct sched_rt_entity *rt_se = &p->rt; | 2306 | struct sched_rt_entity *rt_se = &p->rt; |
@@ -2685,6 +2694,7 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
2685 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | 2694 | msecs_to_jiffies(sysctl_sched_rr_timeslice); |
2686 | } | 2695 | } |
2687 | mutex_unlock(&mutex); | 2696 | mutex_unlock(&mutex); |
2697 | |||
2688 | return ret; | 2698 | return ret; |
2689 | } | 2699 | } |
2690 | 2700 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb5fc458547f..c3deaee7a7a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -1,39 +1,73 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | 2 | /* | |
3 | * Scheduler internal types and methods: | ||
4 | */ | ||
3 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | |||
4 | #include <linux/sched/autogroup.h> | 7 | #include <linux/sched/autogroup.h> |
5 | #include <linux/sched/sysctl.h> | ||
6 | #include <linux/sched/topology.h> | ||
7 | #include <linux/sched/rt.h> | ||
8 | #include <linux/sched/deadline.h> | ||
9 | #include <linux/sched/clock.h> | 8 | #include <linux/sched/clock.h> |
10 | #include <linux/sched/wake_q.h> | 9 | #include <linux/sched/coredump.h> |
11 | #include <linux/sched/signal.h> | ||
12 | #include <linux/sched/numa_balancing.h> | ||
13 | #include <linux/sched/mm.h> | ||
14 | #include <linux/sched/cpufreq.h> | 10 | #include <linux/sched/cpufreq.h> |
15 | #include <linux/sched/stat.h> | 11 | #include <linux/sched/cputime.h> |
16 | #include <linux/sched/nohz.h> | 12 | #include <linux/sched/deadline.h> |
17 | #include <linux/sched/debug.h> | 13 | #include <linux/sched/debug.h> |
18 | #include <linux/sched/hotplug.h> | 14 | #include <linux/sched/hotplug.h> |
15 | #include <linux/sched/idle.h> | ||
16 | #include <linux/sched/init.h> | ||
17 | #include <linux/sched/isolation.h> | ||
18 | #include <linux/sched/jobctl.h> | ||
19 | #include <linux/sched/loadavg.h> | ||
20 | #include <linux/sched/mm.h> | ||
21 | #include <linux/sched/nohz.h> | ||
22 | #include <linux/sched/numa_balancing.h> | ||
23 | #include <linux/sched/prio.h> | ||
24 | #include <linux/sched/rt.h> | ||
25 | #include <linux/sched/signal.h> | ||
26 | #include <linux/sched/stat.h> | ||
27 | #include <linux/sched/sysctl.h> | ||
19 | #include <linux/sched/task.h> | 28 | #include <linux/sched/task.h> |
20 | #include <linux/sched/task_stack.h> | 29 | #include <linux/sched/task_stack.h> |
21 | #include <linux/sched/cputime.h> | 30 | #include <linux/sched/topology.h> |
22 | #include <linux/sched/init.h> | 31 | #include <linux/sched/user.h> |
32 | #include <linux/sched/wake_q.h> | ||
33 | #include <linux/sched/xacct.h> | ||
34 | |||
35 | #include <uapi/linux/sched/types.h> | ||
23 | 36 | ||
24 | #include <linux/u64_stats_sync.h> | ||
25 | #include <linux/kernel_stat.h> | ||
26 | #include <linux/binfmts.h> | 37 | #include <linux/binfmts.h> |
27 | #include <linux/mutex.h> | 38 | #include <linux/blkdev.h> |
28 | #include <linux/spinlock.h> | 39 | #include <linux/compat.h> |
40 | #include <linux/context_tracking.h> | ||
41 | #include <linux/cpufreq.h> | ||
42 | #include <linux/cpuidle.h> | ||
43 | #include <linux/cpuset.h> | ||
44 | #include <linux/ctype.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | #include <linux/delayacct.h> | ||
47 | #include <linux/init_task.h> | ||
48 | #include <linux/kprobes.h> | ||
49 | #include <linux/kthread.h> | ||
50 | #include <linux/membarrier.h> | ||
51 | #include <linux/migrate.h> | ||
52 | #include <linux/mmu_context.h> | ||
53 | #include <linux/nmi.h> | ||
54 | #include <linux/proc_fs.h> | ||
55 | #include <linux/prefetch.h> | ||
56 | #include <linux/profile.h> | ||
57 | #include <linux/rcupdate_wait.h> | ||
58 | #include <linux/security.h> | ||
59 | #include <linux/stackprotector.h> | ||
29 | #include <linux/stop_machine.h> | 60 | #include <linux/stop_machine.h> |
30 | #include <linux/irq_work.h> | 61 | #include <linux/suspend.h> |
31 | #include <linux/tick.h> | 62 | #include <linux/swait.h> |
32 | #include <linux/slab.h> | 63 | #include <linux/syscalls.h> |
33 | #include <linux/cgroup.h> | 64 | #include <linux/task_work.h> |
65 | #include <linux/tsacct_kern.h> | ||
66 | |||
67 | #include <asm/tlb.h> | ||
34 | 68 | ||
35 | #ifdef CONFIG_PARAVIRT | 69 | #ifdef CONFIG_PARAVIRT |
36 | #include <asm/paravirt.h> | 70 | # include <asm/paravirt.h> |
37 | #endif | 71 | #endif |
38 | 72 | ||
39 | #include "cpupri.h" | 73 | #include "cpupri.h" |
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
79 | * and does not change the user-interface for setting shares/weights. | 113 | * and does not change the user-interface for setting shares/weights. |
80 | * | 114 | * |
81 | * We increase resolution only if we have enough bits to allow this increased | 115 | * We increase resolution only if we have enough bits to allow this increased |
82 | * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are | 116 | * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit |
83 | * pretty high and the returns do not justify the increased costs. | 117 | * are pretty high and the returns do not justify the increased costs. |
84 | * | 118 | * |
85 | * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to | 119 | * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to |
86 | * increase coverage and consistency always enable it on 64bit platforms. | 120 | * increase coverage and consistency always enable it on 64-bit platforms. |
87 | */ | 121 | */ |
88 | #ifdef CONFIG_64BIT | 122 | #ifdef CONFIG_64BIT |
89 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) | 123 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) |
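
The comment in this hunk explains why 64-bit kernels double the fixed-point shift for load weights: the extra fractional bits make group-share arithmetic round far less aggressively. A tiny worked calculation follows, assuming SCHED_FIXEDPOINT_SHIFT is 10 (so the nice-0 weight is 1024); the TOY_* name is only a placeholder for that assumption.

    /* Hedged sketch of the 32-bit vs 64-bit load resolution trade-off. */
    #include <stdio.h>

    #define TOY_FIXEDPOINT_SHIFT 10

    int main(void)
    {
            unsigned long load_32bit = 1UL << TOY_FIXEDPOINT_SHIFT;        /* 1024    */
            unsigned long load_64bit = 1UL << (2 * TOY_FIXEDPOINT_SHIFT);  /* 1048576 */

            /* a 1% share of the nice-0 load loses much less to integer
             * truncation at the higher resolution */
            printf("1%% of NICE_0_LOAD: 32-bit=%lu, 64-bit=%lu\n",
                   load_32bit / 100, load_64bit / 100);
            return 0;
    }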
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
111 | * 10 -> just above 1us | 145 | * 10 -> just above 1us |
112 | * 9 -> just above 0.5us | 146 | * 9 -> just above 0.5us |
113 | */ | 147 | */ |
114 | #define DL_SCALE (10) | 148 | #define DL_SCALE 10 |
115 | 149 | ||
116 | /* | 150 | /* |
117 | * These are the 'tuning knobs' of the scheduler: | 151 | * Single value that denotes runtime == period, ie unlimited time. |
118 | */ | 152 | */ |
119 | 153 | #define RUNTIME_INF ((u64)~0ULL) | |
120 | /* | ||
121 | * single value that denotes runtime == period, ie unlimited time. | ||
122 | */ | ||
123 | #define RUNTIME_INF ((u64)~0ULL) | ||
124 | 154 | ||
125 | static inline int idle_policy(int policy) | 155 | static inline int idle_policy(int policy) |
126 | { | 156 | { |
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p); | |||
235 | * control. | 265 | * control. |
236 | */ | 266 | */ |
237 | struct dl_bandwidth { | 267 | struct dl_bandwidth { |
238 | raw_spinlock_t dl_runtime_lock; | 268 | raw_spinlock_t dl_runtime_lock; |
239 | u64 dl_runtime; | 269 | u64 dl_runtime; |
240 | u64 dl_period; | 270 | u64 dl_period; |
241 | }; | 271 | }; |
242 | 272 | ||
243 | static inline int dl_bandwidth_enabled(void) | 273 | static inline int dl_bandwidth_enabled(void) |
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void) | |||
246 | } | 276 | } |
247 | 277 | ||
248 | struct dl_bw { | 278 | struct dl_bw { |
249 | raw_spinlock_t lock; | 279 | raw_spinlock_t lock; |
250 | u64 bw, total_bw; | 280 | u64 bw; |
281 | u64 total_bw; | ||
251 | }; | 282 | }; |
252 | 283 | ||
253 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); | 284 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); |
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
273 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 304 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
274 | } | 305 | } |
275 | 306 | ||
276 | void dl_change_utilization(struct task_struct *p, u64 new_bw); | 307 | extern void dl_change_utilization(struct task_struct *p, u64 new_bw); |
277 | extern void init_dl_bw(struct dl_bw *dl_b); | 308 | extern void init_dl_bw(struct dl_bw *dl_b); |
278 | extern int sched_dl_global_validate(void); | 309 | extern int sched_dl_global_validate(void); |
279 | extern void sched_dl_do_global(void); | 310 | extern void sched_dl_do_global(void); |
280 | extern int sched_dl_overflow(struct task_struct *p, int policy, | 311 | extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); |
281 | const struct sched_attr *attr); | ||
282 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); | 312 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); |
283 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); | 313 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); |
284 | extern bool __checkparam_dl(const struct sched_attr *attr); | 314 | extern bool __checkparam_dl(const struct sched_attr *attr); |
285 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); | 315 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); |
286 | extern int dl_task_can_attach(struct task_struct *p, | 316 | extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); |
287 | const struct cpumask *cs_cpus_allowed); | 317 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); |
288 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
289 | const struct cpumask *trial); | ||
290 | extern bool dl_cpu_busy(unsigned int cpu); | 318 | extern bool dl_cpu_busy(unsigned int cpu); |
291 | 319 | ||
292 | #ifdef CONFIG_CGROUP_SCHED | 320 | #ifdef CONFIG_CGROUP_SCHED |
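
The __dl_overflow() expression visible at the top of this hunk is an admission test: a deadline reservation is refused when the already-reserved bandwidth, minus what the task gives back and plus what it now asks for, would exceed the per-CPU cap times the number of CPUs. A worked numeric version follows; the fixed-point scale, the 95% cap and the toy_dl_overflow() name are illustrative assumptions, not the kernel's actual defaults or API.

    /* Hedged sketch of the deadline-bandwidth overflow check. */
    #include <stdio.h>
    #include <stdint.h>

    #define TOY_BW_SCALE (1u << 20)

    /* nonzero when accepting new_bw (replacing old_bw) would overflow */
    static int toy_dl_overflow(uint64_t cap_per_cpu, int cpus,
                               uint64_t total_bw, uint64_t old_bw, uint64_t new_bw)
    {
            return cap_per_cpu * cpus < total_bw - old_bw + new_bw;
    }

    int main(void)
    {
            uint64_t cap   = (95 * (uint64_t)TOY_BW_SCALE) / 100; /* 95% per CPU        */
            uint64_t total = TOY_BW_SCALE;                        /* one CPU's worth in */

            /* on 4 CPUs: 2.5 more CPUs' worth still fits, 3.0 more does not */
            printf("request 2.5 CPUs: overflow=%d\n",
                   toy_dl_overflow(cap, 4, total, 0, 5 * (uint64_t)TOY_BW_SCALE / 2));
            printf("request 3.0 CPUs: overflow=%d\n",
                   toy_dl_overflow(cap, 4, total, 0, 3 * (uint64_t)TOY_BW_SCALE));
            return 0;
    }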
@@ -300,32 +328,36 @@ extern struct list_head task_groups; | |||
300 | 328 | ||
301 | struct cfs_bandwidth { | 329 | struct cfs_bandwidth { |
302 | #ifdef CONFIG_CFS_BANDWIDTH | 330 | #ifdef CONFIG_CFS_BANDWIDTH |
303 | raw_spinlock_t lock; | 331 | raw_spinlock_t lock; |
304 | ktime_t period; | 332 | ktime_t period; |
305 | u64 quota, runtime; | 333 | u64 quota; |
306 | s64 hierarchical_quota; | 334 | u64 runtime; |
307 | u64 runtime_expires; | 335 | s64 hierarchical_quota; |
308 | 336 | u64 runtime_expires; | |
309 | int idle, period_active; | 337 | |
310 | struct hrtimer period_timer, slack_timer; | 338 | int idle; |
311 | struct list_head throttled_cfs_rq; | 339 | int period_active; |
312 | 340 | struct hrtimer period_timer; | |
313 | /* statistics */ | 341 | struct hrtimer slack_timer; |
314 | int nr_periods, nr_throttled; | 342 | struct list_head throttled_cfs_rq; |
315 | u64 throttled_time; | 343 | |
344 | /* Statistics: */ | ||
345 | int nr_periods; | ||
346 | int nr_throttled; | ||
347 | u64 throttled_time; | ||
316 | #endif | 348 | #endif |
317 | }; | 349 | }; |
318 | 350 | ||
319 | /* task group related information */ | 351 | /* Task group related information */ |
320 | struct task_group { | 352 | struct task_group { |
321 | struct cgroup_subsys_state css; | 353 | struct cgroup_subsys_state css; |
322 | 354 | ||
323 | #ifdef CONFIG_FAIR_GROUP_SCHED | 355 | #ifdef CONFIG_FAIR_GROUP_SCHED |
324 | /* schedulable entities of this group on each cpu */ | 356 | /* schedulable entities of this group on each CPU */ |
325 | struct sched_entity **se; | 357 | struct sched_entity **se; |
326 | /* runqueue "owned" by this group on each cpu */ | 358 | /* runqueue "owned" by this group on each CPU */ |
327 | struct cfs_rq **cfs_rq; | 359 | struct cfs_rq **cfs_rq; |
328 | unsigned long shares; | 360 | unsigned long shares; |
329 | 361 | ||
330 | #ifdef CONFIG_SMP | 362 | #ifdef CONFIG_SMP |
331 | /* | 363 | /* |
@@ -333,29 +365,29 @@ struct task_group { | |||
333 | * it in its own cacheline separated from the fields above which | 365 | * it in its own cacheline separated from the fields above which |
334 | * will also be accessed at each tick. | 366 | * will also be accessed at each tick. |
335 | */ | 367 | */ |
336 | atomic_long_t load_avg ____cacheline_aligned; | 368 | atomic_long_t load_avg ____cacheline_aligned; |
337 | #endif | 369 | #endif |
338 | #endif | 370 | #endif |
339 | 371 | ||
340 | #ifdef CONFIG_RT_GROUP_SCHED | 372 | #ifdef CONFIG_RT_GROUP_SCHED |
341 | struct sched_rt_entity **rt_se; | 373 | struct sched_rt_entity **rt_se; |
342 | struct rt_rq **rt_rq; | 374 | struct rt_rq **rt_rq; |
343 | 375 | ||
344 | struct rt_bandwidth rt_bandwidth; | 376 | struct rt_bandwidth rt_bandwidth; |
345 | #endif | 377 | #endif |
346 | 378 | ||
347 | struct rcu_head rcu; | 379 | struct rcu_head rcu; |
348 | struct list_head list; | 380 | struct list_head list; |
349 | 381 | ||
350 | struct task_group *parent; | 382 | struct task_group *parent; |
351 | struct list_head siblings; | 383 | struct list_head siblings; |
352 | struct list_head children; | 384 | struct list_head children; |
353 | 385 | ||
354 | #ifdef CONFIG_SCHED_AUTOGROUP | 386 | #ifdef CONFIG_SCHED_AUTOGROUP |
355 | struct autogroup *autogroup; | 387 | struct autogroup *autogroup; |
356 | #endif | 388 | #endif |
357 | 389 | ||
358 | struct cfs_bandwidth cfs_bandwidth; | 390 | struct cfs_bandwidth cfs_bandwidth; |
359 | }; | 391 | }; |
360 | 392 | ||
361 | #ifdef CONFIG_FAIR_GROUP_SCHED | 393 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -369,8 +401,8 @@ struct task_group { | |||
369 | * (The default weight is 1024 - so there's no practical | 401 | * (The default weight is 1024 - so there's no practical |
370 | * limitation from this.) | 402 | * limitation from this.) |
371 | */ | 403 | */ |
372 | #define MIN_SHARES (1UL << 1) | 404 | #define MIN_SHARES (1UL << 1) |
373 | #define MAX_SHARES (1UL << 18) | 405 | #define MAX_SHARES (1UL << 18) |
374 | #endif | 406 | #endif |
375 | 407 | ||
376 | typedef int (*tg_visitor)(struct task_group *, void *); | 408 | typedef int (*tg_visitor)(struct task_group *, void *); |
@@ -443,35 +475,39 @@ struct cfs_bandwidth { }; | |||
443 | 475 | ||
444 | /* CFS-related fields in a runqueue */ | 476 | /* CFS-related fields in a runqueue */ |
445 | struct cfs_rq { | 477 | struct cfs_rq { |
446 | struct load_weight load; | 478 | struct load_weight load; |
447 | unsigned long runnable_weight; | 479 | unsigned long runnable_weight; |
448 | unsigned int nr_running, h_nr_running; | 480 | unsigned int nr_running; |
481 | unsigned int h_nr_running; | ||
449 | 482 | ||
450 | u64 exec_clock; | 483 | u64 exec_clock; |
451 | u64 min_vruntime; | 484 | u64 min_vruntime; |
452 | #ifndef CONFIG_64BIT | 485 | #ifndef CONFIG_64BIT |
453 | u64 min_vruntime_copy; | 486 | u64 min_vruntime_copy; |
454 | #endif | 487 | #endif |
455 | 488 | ||
456 | struct rb_root_cached tasks_timeline; | 489 | struct rb_root_cached tasks_timeline; |
457 | 490 | ||
458 | /* | 491 | /* |
459 | * 'curr' points to currently running entity on this cfs_rq. | 492 | * 'curr' points to currently running entity on this cfs_rq. |
460 | * It is set to NULL otherwise (i.e when none are currently running). | 493 | * It is set to NULL otherwise (i.e when none are currently running). |
461 | */ | 494 | */ |
462 | struct sched_entity *curr, *next, *last, *skip; | 495 | struct sched_entity *curr; |
496 | struct sched_entity *next; | ||
497 | struct sched_entity *last; | ||
498 | struct sched_entity *skip; | ||
463 | 499 | ||
464 | #ifdef CONFIG_SCHED_DEBUG | 500 | #ifdef CONFIG_SCHED_DEBUG |
465 | unsigned int nr_spread_over; | 501 | unsigned int nr_spread_over; |
466 | #endif | 502 | #endif |
467 | 503 | ||
468 | #ifdef CONFIG_SMP | 504 | #ifdef CONFIG_SMP |
469 | /* | 505 | /* |
470 | * CFS load tracking | 506 | * CFS load tracking |
471 | */ | 507 | */ |
472 | struct sched_avg avg; | 508 | struct sched_avg avg; |
473 | #ifndef CONFIG_64BIT | 509 | #ifndef CONFIG_64BIT |
474 | u64 load_last_update_time_copy; | 510 | u64 load_last_update_time_copy; |
475 | #endif | 511 | #endif |
476 | struct { | 512 | struct { |
477 | raw_spinlock_t lock ____cacheline_aligned; | 513 | raw_spinlock_t lock ____cacheline_aligned; |
@@ -482,9 +518,9 @@ struct cfs_rq { | |||
482 | } removed; | 518 | } removed; |
483 | 519 | ||
484 | #ifdef CONFIG_FAIR_GROUP_SCHED | 520 | #ifdef CONFIG_FAIR_GROUP_SCHED |
485 | unsigned long tg_load_avg_contrib; | 521 | unsigned long tg_load_avg_contrib; |
486 | long propagate; | 522 | long propagate; |
487 | long prop_runnable_sum; | 523 | long prop_runnable_sum; |
488 | 524 | ||
489 | /* | 525 | /* |
490 | * h_load = weight * f(tg) | 526 | * h_load = weight * f(tg) |
@@ -492,36 +528,38 @@ struct cfs_rq { | |||
492 | * Where f(tg) is the recursive weight fraction assigned to | 528 | * Where f(tg) is the recursive weight fraction assigned to |
493 | * this group. | 529 | * this group. |
494 | */ | 530 | */ |
495 | unsigned long h_load; | 531 | unsigned long h_load; |
496 | u64 last_h_load_update; | 532 | u64 last_h_load_update; |
497 | struct sched_entity *h_load_next; | 533 | struct sched_entity *h_load_next; |
498 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 534 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
499 | #endif /* CONFIG_SMP */ | 535 | #endif /* CONFIG_SMP */ |
500 | 536 | ||
501 | #ifdef CONFIG_FAIR_GROUP_SCHED | 537 | #ifdef CONFIG_FAIR_GROUP_SCHED |
502 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 538 | struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ |
503 | 539 | ||
504 | /* | 540 | /* |
505 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 541 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
506 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 542 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
507 | * (like users, containers etc.) | 543 | * (like users, containers etc.) |
508 | * | 544 | * |
509 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 545 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. |
510 | * list is used during load balance. | 546 | * This list is used during load balance. |
511 | */ | 547 | */ |
512 | int on_list; | 548 | int on_list; |
513 | struct list_head leaf_cfs_rq_list; | 549 | struct list_head leaf_cfs_rq_list; |
514 | struct task_group *tg; /* group that "owns" this runqueue */ | 550 | struct task_group *tg; /* group that "owns" this runqueue */ |
515 | 551 | ||
516 | #ifdef CONFIG_CFS_BANDWIDTH | 552 | #ifdef CONFIG_CFS_BANDWIDTH |
517 | int runtime_enabled; | 553 | int runtime_enabled; |
518 | u64 runtime_expires; | 554 | u64 runtime_expires; |
519 | s64 runtime_remaining; | 555 | s64 runtime_remaining; |
520 | 556 | ||
521 | u64 throttled_clock, throttled_clock_task; | 557 | u64 throttled_clock; |
522 | u64 throttled_clock_task_time; | 558 | u64 throttled_clock_task; |
523 | int throttled, throttle_count; | 559 | u64 throttled_clock_task_time; |
524 | struct list_head throttled_list; | 560 | int throttled; |
561 | int throttle_count; | ||
562 | struct list_head throttled_list; | ||
525 | #endif /* CONFIG_CFS_BANDWIDTH */ | 563 | #endif /* CONFIG_CFS_BANDWIDTH */ |
526 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 564 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
527 | }; | 565 | }; |
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void) | |||
538 | 576 | ||
539 | /* Real-Time classes' related field in a runqueue: */ | 577 | /* Real-Time classes' related field in a runqueue: */ |
540 | struct rt_rq { | 578 | struct rt_rq { |
541 | struct rt_prio_array active; | 579 | struct rt_prio_array active; |
542 | unsigned int rt_nr_running; | 580 | unsigned int rt_nr_running; |
543 | unsigned int rr_nr_running; | 581 | unsigned int rr_nr_running; |
544 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 582 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
545 | struct { | 583 | struct { |
546 | int curr; /* highest queued rt task prio */ | 584 | int curr; /* highest queued rt task prio */ |
547 | #ifdef CONFIG_SMP | 585 | #ifdef CONFIG_SMP |
548 | int next; /* next highest */ | 586 | int next; /* next highest */ |
549 | #endif | 587 | #endif |
550 | } highest_prio; | 588 | } highest_prio; |
551 | #endif | 589 | #endif |
552 | #ifdef CONFIG_SMP | 590 | #ifdef CONFIG_SMP |
553 | unsigned long rt_nr_migratory; | 591 | unsigned long rt_nr_migratory; |
554 | unsigned long rt_nr_total; | 592 | unsigned long rt_nr_total; |
555 | int overloaded; | 593 | int overloaded; |
556 | struct plist_head pushable_tasks; | 594 | struct plist_head pushable_tasks; |
557 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
558 | int rt_queued; | 596 | int rt_queued; |
559 | 597 | ||
560 | int rt_throttled; | 598 | int rt_throttled; |
561 | u64 rt_time; | 599 | u64 rt_time; |
562 | u64 rt_runtime; | 600 | u64 rt_runtime; |
563 | /* Nests inside the rq lock: */ | 601 | /* Nests inside the rq lock: */ |
564 | raw_spinlock_t rt_runtime_lock; | 602 | raw_spinlock_t rt_runtime_lock; |
565 | 603 | ||
566 | #ifdef CONFIG_RT_GROUP_SCHED | 604 | #ifdef CONFIG_RT_GROUP_SCHED |
567 | unsigned long rt_nr_boosted; | 605 | unsigned long rt_nr_boosted; |
568 | 606 | ||
569 | struct rq *rq; | 607 | struct rq *rq; |
570 | struct task_group *tg; | 608 | struct task_group *tg; |
571 | #endif | 609 | #endif |
572 | }; | 610 | }; |
573 | 611 | ||
574 | /* Deadline class' related fields in a runqueue */ | 612 | /* Deadline class' related fields in a runqueue */ |
575 | struct dl_rq { | 613 | struct dl_rq { |
576 | /* runqueue is an rbtree, ordered by deadline */ | 614 | /* runqueue is an rbtree, ordered by deadline */ |
577 | struct rb_root_cached root; | 615 | struct rb_root_cached root; |
578 | 616 | ||
579 | unsigned long dl_nr_running; | 617 | unsigned long dl_nr_running; |
580 | 618 | ||
581 | #ifdef CONFIG_SMP | 619 | #ifdef CONFIG_SMP |
582 | /* | 620 | /* |
@@ -586,28 +624,28 @@ struct dl_rq { | |||
586 | * should migrate somewhere else. | 624 | * should migrate somewhere else. |
587 | */ | 625 | */ |
588 | struct { | 626 | struct { |
589 | u64 curr; | 627 | u64 curr; |
590 | u64 next; | 628 | u64 next; |
591 | } earliest_dl; | 629 | } earliest_dl; |
592 | 630 | ||
593 | unsigned long dl_nr_migratory; | 631 | unsigned long dl_nr_migratory; |
594 | int overloaded; | 632 | int overloaded; |
595 | 633 | ||
596 | /* | 634 | /* |
597 | * Tasks on this rq that can be pushed away. They are kept in | 635 | * Tasks on this rq that can be pushed away. They are kept in |
598 | * an rb-tree, ordered by tasks' deadlines, with caching | 636 | * an rb-tree, ordered by tasks' deadlines, with caching |
599 | * of the leftmost (earliest deadline) element. | 637 | * of the leftmost (earliest deadline) element. |
600 | */ | 638 | */ |
601 | struct rb_root_cached pushable_dl_tasks_root; | 639 | struct rb_root_cached pushable_dl_tasks_root; |
602 | #else | 640 | #else |
603 | struct dl_bw dl_bw; | 641 | struct dl_bw dl_bw; |
604 | #endif | 642 | #endif |
605 | /* | 643 | /* |
606 | * "Active utilization" for this runqueue: increased when a | 644 | * "Active utilization" for this runqueue: increased when a |
607 | * task wakes up (becomes TASK_RUNNING) and decreased when a | 645 | * task wakes up (becomes TASK_RUNNING) and decreased when a |
608 | * task blocks | 646 | * task blocks |
609 | */ | 647 | */ |
610 | u64 running_bw; | 648 | u64 running_bw; |
611 | 649 | ||
612 | /* | 650 | /* |
613 | * Utilization of the tasks "assigned" to this runqueue (including | 651 | * Utilization of the tasks "assigned" to this runqueue (including |
@@ -618,14 +656,14 @@ struct dl_rq { | |||
618 | * This is needed to compute the "inactive utilization" for the | 656 | * This is needed to compute the "inactive utilization" for the |
619 | * runqueue (inactive utilization = this_bw - running_bw). | 657 | * runqueue (inactive utilization = this_bw - running_bw). |
620 | */ | 658 | */ |
621 | u64 this_bw; | 659 | u64 this_bw; |
622 | u64 extra_bw; | 660 | u64 extra_bw; |
623 | 661 | ||
624 | /* | 662 | /* |
625 | * Inverse of the fraction of CPU utilization that can be reclaimed | 663 | * Inverse of the fraction of CPU utilization that can be reclaimed |
626 | * by the GRUB algorithm. | 664 | * by the GRUB algorithm. |
627 | */ | 665 | */ |
628 | u64 bw_ratio; | 666 | u64 bw_ratio; |
629 | }; | 667 | }; |
630 | 668 | ||
631 | #ifdef CONFIG_SMP | 669 | #ifdef CONFIG_SMP |
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b) | |||
638 | /* | 676 | /* |
639 | * We add the notion of a root-domain which will be used to define per-domain | 677 | * We add the notion of a root-domain which will be used to define per-domain |
640 | * variables. Each exclusive cpuset essentially defines an island domain by | 678 | * variables. Each exclusive cpuset essentially defines an island domain by |
641 | * fully partitioning the member cpus from any other cpuset. Whenever a new | 679 | * fully partitioning the member CPUs from any other cpuset. Whenever a new |
642 | * exclusive cpuset is created, we also create and attach a new root-domain | 680 | * exclusive cpuset is created, we also create and attach a new root-domain |
643 | * object. | 681 | * object. |
644 | * | 682 | * |
645 | */ | 683 | */ |
646 | struct root_domain { | 684 | struct root_domain { |
647 | atomic_t refcount; | 685 | atomic_t refcount; |
648 | atomic_t rto_count; | 686 | atomic_t rto_count; |
649 | struct rcu_head rcu; | 687 | struct rcu_head rcu; |
650 | cpumask_var_t span; | 688 | cpumask_var_t span; |
651 | cpumask_var_t online; | 689 | cpumask_var_t online; |
652 | 690 | ||
653 | /* Indicate more than one runnable task for any CPU */ | 691 | /* Indicate more than one runnable task for any CPU */ |
654 | bool overload; | 692 | bool overload; |
655 | 693 | ||
656 | /* | 694 | /* |
657 | * The bit corresponding to a CPU gets set here if such CPU has more | 695 | * The bit corresponding to a CPU gets set here if such CPU has more |
658 | * than one runnable -deadline task (as it is below for RT tasks). | 696 | * than one runnable -deadline task (as it is below for RT tasks). |
659 | */ | 697 | */ |
660 | cpumask_var_t dlo_mask; | 698 | cpumask_var_t dlo_mask; |
661 | atomic_t dlo_count; | 699 | atomic_t dlo_count; |
662 | struct dl_bw dl_bw; | 700 | struct dl_bw dl_bw; |
663 | struct cpudl cpudl; | 701 | struct cpudl cpudl; |
664 | 702 | ||
665 | #ifdef HAVE_RT_PUSH_IPI | 703 | #ifdef HAVE_RT_PUSH_IPI |
666 | /* | 704 | /* |
667 | * For IPI pull requests, loop across the rto_mask. | 705 | * For IPI pull requests, loop across the rto_mask. |
668 | */ | 706 | */ |
669 | struct irq_work rto_push_work; | 707 | struct irq_work rto_push_work; |
670 | raw_spinlock_t rto_lock; | 708 | raw_spinlock_t rto_lock; |
671 | /* These are only updated and read within rto_lock */ | 709 | /* These are only updated and read within rto_lock */ |
672 | int rto_loop; | 710 | int rto_loop; |
673 | int rto_cpu; | 711 | int rto_cpu; |
674 | /* These atomics are updated outside of a lock */ | 712 | /* These atomics are updated outside of a lock */ |
675 | atomic_t rto_loop_next; | 713 | atomic_t rto_loop_next; |
676 | atomic_t rto_loop_start; | 714 | atomic_t rto_loop_start; |
677 | #endif | 715 | #endif |
678 | /* | 716 | /* |
679 | * The "RT overload" flag: it gets set if a CPU has more than | 717 | * The "RT overload" flag: it gets set if a CPU has more than |
680 | * one runnable RT task. | 718 | * one runnable RT task. |
681 | */ | 719 | */ |
682 | cpumask_var_t rto_mask; | 720 | cpumask_var_t rto_mask; |
683 | struct cpupri cpupri; | 721 | struct cpupri cpupri; |
684 | 722 | ||
685 | unsigned long max_cpu_capacity; | 723 | unsigned long max_cpu_capacity; |
686 | }; | 724 | }; |
687 | 725 | ||
688 | extern struct root_domain def_root_domain; | 726 | extern struct root_domain def_root_domain; |
@@ -708,41 +746,42 @@ extern void rto_push_irq_work_func(struct irq_work *work); | |||
708 | */ | 746 | */ |
709 | struct rq { | 747 | struct rq { |
710 | /* runqueue lock: */ | 748 | /* runqueue lock: */ |
711 | raw_spinlock_t lock; | 749 | raw_spinlock_t lock; |
712 | 750 | ||
713 | /* | 751 | /* |
714 | * nr_running and cpu_load should be in the same cacheline because | 752 | * nr_running and cpu_load should be in the same cacheline because |
715 | * remote CPUs use both these fields when doing load calculation. | 753 | * remote CPUs use both these fields when doing load calculation. |
716 | */ | 754 | */ |
717 | unsigned int nr_running; | 755 | unsigned int nr_running; |
718 | #ifdef CONFIG_NUMA_BALANCING | 756 | #ifdef CONFIG_NUMA_BALANCING |
719 | unsigned int nr_numa_running; | 757 | unsigned int nr_numa_running; |
720 | unsigned int nr_preferred_running; | 758 | unsigned int nr_preferred_running; |
721 | #endif | 759 | #endif |
722 | #define CPU_LOAD_IDX_MAX 5 | 760 | #define CPU_LOAD_IDX_MAX 5 |
723 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 761 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
724 | #ifdef CONFIG_NO_HZ_COMMON | 762 | #ifdef CONFIG_NO_HZ_COMMON |
725 | #ifdef CONFIG_SMP | 763 | #ifdef CONFIG_SMP |
726 | unsigned long last_load_update_tick; | 764 | unsigned long last_load_update_tick; |
765 | unsigned long last_blocked_load_update_tick; | ||
766 | unsigned int has_blocked_load; | ||
727 | #endif /* CONFIG_SMP */ | 767 | #endif /* CONFIG_SMP */ |
728 | unsigned long nohz_flags; | 768 | unsigned int nohz_tick_stopped; |
769 | atomic_t nohz_flags; | ||
729 | #endif /* CONFIG_NO_HZ_COMMON */ | 770 | #endif /* CONFIG_NO_HZ_COMMON */ |
730 | #ifdef CONFIG_NO_HZ_FULL | ||
731 | unsigned long last_sched_tick; | ||
732 | #endif | ||
733 | /* capture load from *all* tasks on this cpu: */ | ||
734 | struct load_weight load; | ||
735 | unsigned long nr_load_updates; | ||
736 | u64 nr_switches; | ||
737 | 771 | ||
738 | struct cfs_rq cfs; | 772 | /* capture load from *all* tasks on this CPU: */ |
739 | struct rt_rq rt; | 773 | struct load_weight load; |
740 | struct dl_rq dl; | 774 | unsigned long nr_load_updates; |
775 | u64 nr_switches; | ||
776 | |||
777 | struct cfs_rq cfs; | ||
778 | struct rt_rq rt; | ||
779 | struct dl_rq dl; | ||
741 | 780 | ||
742 | #ifdef CONFIG_FAIR_GROUP_SCHED | 781 | #ifdef CONFIG_FAIR_GROUP_SCHED |
743 | /* list of leaf cfs_rq on this cpu: */ | 782 | /* list of leaf cfs_rq on this CPU: */ |
744 | struct list_head leaf_cfs_rq_list; | 783 | struct list_head leaf_cfs_rq_list; |
745 | struct list_head *tmp_alone_branch; | 784 | struct list_head *tmp_alone_branch; |
746 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 785 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
747 | 786 | ||
748 | /* | 787 | /* |
@@ -751,94 +790,98 @@ struct rq { | |||
751 | * one CPU and if it got migrated afterwards it may decrease | 790 | * one CPU and if it got migrated afterwards it may decrease |
752 | * it on another CPU. Always updated under the runqueue lock: | 791 | * it on another CPU. Always updated under the runqueue lock: |
753 | */ | 792 | */ |
754 | unsigned long nr_uninterruptible; | 793 | unsigned long nr_uninterruptible; |
755 | 794 | ||
756 | struct task_struct *curr, *idle, *stop; | 795 | struct task_struct *curr; |
757 | unsigned long next_balance; | 796 | struct task_struct *idle; |
758 | struct mm_struct *prev_mm; | 797 | struct task_struct *stop; |
798 | unsigned long next_balance; | ||
799 | struct mm_struct *prev_mm; | ||
759 | 800 | ||
760 | unsigned int clock_update_flags; | 801 | unsigned int clock_update_flags; |
761 | u64 clock; | 802 | u64 clock; |
762 | u64 clock_task; | 803 | u64 clock_task; |
763 | 804 | ||
764 | atomic_t nr_iowait; | 805 | atomic_t nr_iowait; |
765 | 806 | ||
766 | #ifdef CONFIG_SMP | 807 | #ifdef CONFIG_SMP |
767 | struct root_domain *rd; | 808 | struct root_domain *rd; |
768 | struct sched_domain *sd; | 809 | struct sched_domain *sd; |
769 | 810 | ||
770 | unsigned long cpu_capacity; | 811 | unsigned long cpu_capacity; |
771 | unsigned long cpu_capacity_orig; | 812 | unsigned long cpu_capacity_orig; |
772 | 813 | ||
773 | struct callback_head *balance_callback; | 814 | struct callback_head *balance_callback; |
815 | |||
816 | unsigned char idle_balance; | ||
774 | 817 | ||
775 | unsigned char idle_balance; | ||
776 | /* For active balancing */ | 818 | /* For active balancing */ |
777 | int active_balance; | 819 | int active_balance; |
778 | int push_cpu; | 820 | int push_cpu; |
779 | struct cpu_stop_work active_balance_work; | 821 | struct cpu_stop_work active_balance_work; |
780 | /* cpu of this runqueue: */ | 822 | |
781 | int cpu; | 823 | /* CPU of this runqueue: */ |
782 | int online; | 824 | int cpu; |
825 | int online; | ||
783 | 826 | ||
784 | struct list_head cfs_tasks; | 827 | struct list_head cfs_tasks; |
785 | 828 | ||
786 | u64 rt_avg; | 829 | u64 rt_avg; |
787 | u64 age_stamp; | 830 | u64 age_stamp; |
788 | u64 idle_stamp; | 831 | u64 idle_stamp; |
789 | u64 avg_idle; | 832 | u64 avg_idle; |
790 | 833 | ||
791 | /* This is used to determine avg_idle's max value */ | 834 | /* This is used to determine avg_idle's max value */ |
792 | u64 max_idle_balance_cost; | 835 | u64 max_idle_balance_cost; |
793 | #endif | 836 | #endif |
794 | 837 | ||
795 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 838 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
796 | u64 prev_irq_time; | 839 | u64 prev_irq_time; |
797 | #endif | 840 | #endif |
798 | #ifdef CONFIG_PARAVIRT | 841 | #ifdef CONFIG_PARAVIRT |
799 | u64 prev_steal_time; | 842 | u64 prev_steal_time; |
800 | #endif | 843 | #endif |
801 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | 844 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING |
802 | u64 prev_steal_time_rq; | 845 | u64 prev_steal_time_rq; |
803 | #endif | 846 | #endif |
804 | 847 | ||
805 | /* calc_load related fields */ | 848 | /* calc_load related fields */ |
806 | unsigned long calc_load_update; | 849 | unsigned long calc_load_update; |
807 | long calc_load_active; | 850 | long calc_load_active; |
808 | 851 | ||
809 | #ifdef CONFIG_SCHED_HRTICK | 852 | #ifdef CONFIG_SCHED_HRTICK |
810 | #ifdef CONFIG_SMP | 853 | #ifdef CONFIG_SMP |
811 | int hrtick_csd_pending; | 854 | int hrtick_csd_pending; |
812 | call_single_data_t hrtick_csd; | 855 | call_single_data_t hrtick_csd; |
813 | #endif | 856 | #endif |
814 | struct hrtimer hrtick_timer; | 857 | struct hrtimer hrtick_timer; |
815 | #endif | 858 | #endif |
816 | 859 | ||
817 | #ifdef CONFIG_SCHEDSTATS | 860 | #ifdef CONFIG_SCHEDSTATS |
818 | /* latency stats */ | 861 | /* latency stats */ |
819 | struct sched_info rq_sched_info; | 862 | struct sched_info rq_sched_info; |
820 | unsigned long long rq_cpu_time; | 863 | unsigned long long rq_cpu_time; |
821 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | 864 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ |
822 | 865 | ||
823 | /* sys_sched_yield() stats */ | 866 | /* sys_sched_yield() stats */ |
824 | unsigned int yld_count; | 867 | unsigned int yld_count; |
825 | 868 | ||
826 | /* schedule() stats */ | 869 | /* schedule() stats */ |
827 | unsigned int sched_count; | 870 | unsigned int sched_count; |
828 | unsigned int sched_goidle; | 871 | unsigned int sched_goidle; |
829 | 872 | ||
830 | /* try_to_wake_up() stats */ | 873 | /* try_to_wake_up() stats */ |
831 | unsigned int ttwu_count; | 874 | unsigned int ttwu_count; |
832 | unsigned int ttwu_local; | 875 | unsigned int ttwu_local; |
833 | #endif | 876 | #endif |
834 | 877 | ||
835 | #ifdef CONFIG_SMP | 878 | #ifdef CONFIG_SMP |
836 | struct llist_head wake_list; | 879 | struct llist_head wake_list; |
837 | #endif | 880 | #endif |
838 | 881 | ||
839 | #ifdef CONFIG_CPU_IDLE | 882 | #ifdef CONFIG_CPU_IDLE |
840 | /* Must be inspected within a rcu lock section */ | 883 | /* Must be inspected within a rcu lock section */ |
841 | struct cpuidle_state *idle_state; | 884 | struct cpuidle_state *idle_state; |
842 | #endif | 885 | #endif |
843 | }; | 886 | }; |
844 | 887 | ||
@@ -904,9 +947,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) | |||
904 | * one position though, because the next rq_unpin_lock() will shift it | 947 | * one position though, because the next rq_unpin_lock() will shift it |
905 | * back. | 948 | * back. |
906 | */ | 949 | */ |
907 | #define RQCF_REQ_SKIP 0x01 | 950 | #define RQCF_REQ_SKIP 0x01 |
908 | #define RQCF_ACT_SKIP 0x02 | 951 | #define RQCF_ACT_SKIP 0x02 |
909 | #define RQCF_UPDATED 0x04 | 952 | #define RQCF_UPDATED 0x04 |
910 | 953 | ||
911 | static inline void assert_clock_updated(struct rq *rq) | 954 | static inline void assert_clock_updated(struct rq *rq) |
912 | { | 955 | { |
@@ -1059,12 +1102,12 @@ extern void sched_ttwu_pending(void); | |||
1059 | 1102 | ||
1060 | /** | 1103 | /** |
1061 | * highest_flag_domain - Return highest sched_domain containing flag. | 1104 | * highest_flag_domain - Return highest sched_domain containing flag. |
1062 | * @cpu: The cpu whose highest level of sched domain is to | 1105 | * @cpu: The CPU whose highest level of sched domain is to |
1063 | * be returned. | 1106 | * be returned. |
1064 | * @flag: The flag to check for the highest sched_domain | 1107 | * @flag: The flag to check for the highest sched_domain |
1065 | * for the given cpu. | 1108 | * for the given CPU. |
1066 | * | 1109 | * |
1067 | * Returns the highest sched_domain of a cpu which contains the given flag. | 1110 | * Returns the highest sched_domain of a CPU which contains the given flag. |
1068 | */ | 1111 | */ |
1069 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | 1112 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) |
1070 | { | 1113 | { |
@@ -1099,30 +1142,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); | |||
1099 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 1142 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
1100 | 1143 | ||
1101 | struct sched_group_capacity { | 1144 | struct sched_group_capacity { |
1102 | atomic_t ref; | 1145 | atomic_t ref; |
1103 | /* | 1146 | /* |
1104 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity | 1147 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity |
1105 | * for a single CPU. | 1148 | * for a single CPU. |
1106 | */ | 1149 | */ |
1107 | unsigned long capacity; | 1150 | unsigned long capacity; |
1108 | unsigned long min_capacity; /* Min per-CPU capacity in group */ | 1151 | unsigned long min_capacity; /* Min per-CPU capacity in group */ |
1109 | unsigned long next_update; | 1152 | unsigned long next_update; |
1110 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 1153 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
1111 | 1154 | ||
1112 | #ifdef CONFIG_SCHED_DEBUG | 1155 | #ifdef CONFIG_SCHED_DEBUG |
1113 | int id; | 1156 | int id; |
1114 | #endif | 1157 | #endif |
1115 | 1158 | ||
1116 | unsigned long cpumask[0]; /* balance mask */ | 1159 | unsigned long cpumask[0]; /* Balance mask */ |
1117 | }; | 1160 | }; |
1118 | 1161 | ||
1119 | struct sched_group { | 1162 | struct sched_group { |
1120 | struct sched_group *next; /* Must be a circular list */ | 1163 | struct sched_group *next; /* Must be a circular list */ |
1121 | atomic_t ref; | 1164 | atomic_t ref; |
1122 | 1165 | ||
1123 | unsigned int group_weight; | 1166 | unsigned int group_weight; |
1124 | struct sched_group_capacity *sgc; | 1167 | struct sched_group_capacity *sgc; |
1125 | int asym_prefer_cpu; /* cpu of highest priority in group */ | 1168 | int asym_prefer_cpu; /* CPU of highest priority in group */ |
1126 | 1169 | ||
1127 | /* | 1170 | /* |
1128 | * The CPUs this group covers. | 1171 | * The CPUs this group covers. |
@@ -1131,7 +1174,7 @@ struct sched_group { | |||
1131 | * by attaching extra space to the end of the structure, | 1174 | * by attaching extra space to the end of the structure, |
1132 | * depending on how many CPUs the kernel has booted up with) | 1175 | * depending on how many CPUs the kernel has booted up with) |
1133 | */ | 1176 | */ |
1134 | unsigned long cpumask[0]; | 1177 | unsigned long cpumask[0]; |
1135 | }; | 1178 | }; |
1136 | 1179 | ||
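The trailing cpumask[0] member above works by attaching extra space to the end of the structure at allocation time, as the comment notes. A stand-alone sketch of that idiom follows, using a made-up struct and mask size rather than the scheduler's real allocation path:

/* Hedged sketch of the trailing-array idiom used for sched_group::cpumask. */
#include <stdio.h>
#include <stdlib.h>

struct group {
	unsigned int weight;
	unsigned long cpumask[0];	/* space attached at allocation time */
};

int main(void)
{
	size_t mask_bytes = 2 * sizeof(unsigned long);	/* made-up mask size */
	struct group *g = calloc(1, sizeof(*g) + mask_bytes);

	if (!g)
		return 1;
	g->cpumask[0] = 0x5;	/* e.g. CPUs 0 and 2 */
	printf("mask word 0: 0x%lx\n", g->cpumask[0]);
	free(g);
	return 0;
}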
1137 | static inline struct cpumask *sched_group_span(struct sched_group *sg) | 1180 | static inline struct cpumask *sched_group_span(struct sched_group *sg) |
@@ -1148,8 +1191,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) | |||
1148 | } | 1191 | } |
1149 | 1192 | ||
1150 | /** | 1193 | /** |
1151 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | 1194 | * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. |
1152 | * @group: The group whose first cpu is to be returned. | 1195 | * @group: The group whose first CPU is to be returned. |
1153 | */ | 1196 | */ |
1154 | static inline unsigned int group_first_cpu(struct sched_group *group) | 1197 | static inline unsigned int group_first_cpu(struct sched_group *group) |
1155 | { | 1198 | { |
@@ -1349,19 +1392,12 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1349 | return p->on_rq == TASK_ON_RQ_MIGRATING; | 1392 | return p->on_rq == TASK_ON_RQ_MIGRATING; |
1350 | } | 1393 | } |
1351 | 1394 | ||
1352 | #ifndef prepare_arch_switch | ||
1353 | # define prepare_arch_switch(next) do { } while (0) | ||
1354 | #endif | ||
1355 | #ifndef finish_arch_post_lock_switch | ||
1356 | # define finish_arch_post_lock_switch() do { } while (0) | ||
1357 | #endif | ||
1358 | |||
1359 | /* | 1395 | /* |
1360 | * wake flags | 1396 | * wake flags |
1361 | */ | 1397 | */ |
1362 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | 1398 | #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ |
1363 | #define WF_FORK 0x02 /* child wakeup after fork */ | 1399 | #define WF_FORK 0x02 /* Child wakeup after fork */ |
1364 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | 1400 | #define WF_MIGRATED 0x4 /* Internal use, task got migrated */ |
1365 | 1401 | ||
1366 | /* | 1402 | /* |
1367 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1403 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
@@ -1372,11 +1408,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1372 | * slice expiry etc. | 1408 | * slice expiry etc. |
1373 | */ | 1409 | */ |
1374 | 1410 | ||
1375 | #define WEIGHT_IDLEPRIO 3 | 1411 | #define WEIGHT_IDLEPRIO 3 |
1376 | #define WMULT_IDLEPRIO 1431655765 | 1412 | #define WMULT_IDLEPRIO 1431655765 |
1377 | 1413 | ||
1378 | extern const int sched_prio_to_weight[40]; | 1414 | extern const int sched_prio_to_weight[40]; |
1379 | extern const u32 sched_prio_to_wmult[40]; | 1415 | extern const u32 sched_prio_to_wmult[40]; |
1380 | 1416 | ||
1381 | /* | 1417 | /* |
1382 | * {de,en}queue flags: | 1418 | * {de,en}queue flags: |
@@ -1398,9 +1434,9 @@ extern const u32 sched_prio_to_wmult[40]; | |||
1398 | */ | 1434 | */ |
1399 | 1435 | ||
1400 | #define DEQUEUE_SLEEP 0x01 | 1436 | #define DEQUEUE_SLEEP 0x01 |
1401 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | 1437 | #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ |
1402 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | 1438 | #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ |
1403 | #define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ | 1439 | #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ |
1404 | 1440 | ||
1405 | #define ENQUEUE_WAKEUP 0x01 | 1441 | #define ENQUEUE_WAKEUP 0x01 |
1406 | #define ENQUEUE_RESTORE 0x02 | 1442 | #define ENQUEUE_RESTORE 0x02 |
@@ -1422,10 +1458,10 @@ struct sched_class { | |||
1422 | 1458 | ||
1423 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1459 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
1424 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1460 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
1425 | void (*yield_task) (struct rq *rq); | 1461 | void (*yield_task) (struct rq *rq); |
1426 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | 1462 | bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); |
1427 | 1463 | ||
1428 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1464 | void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); |
1429 | 1465 | ||
1430 | /* | 1466 | /* |
1431 | * It is the responsibility of the pick_next_task() method that will | 1467 | * It is the responsibility of the pick_next_task() method that will |
@@ -1435,16 +1471,16 @@ struct sched_class { | |||
1435 | * May return RETRY_TASK when it finds a higher prio class has runnable | 1471 | * May return RETRY_TASK when it finds a higher prio class has runnable |
1436 | * tasks. | 1472 | * tasks. |
1437 | */ | 1473 | */ |
1438 | struct task_struct * (*pick_next_task) (struct rq *rq, | 1474 | struct task_struct * (*pick_next_task)(struct rq *rq, |
1439 | struct task_struct *prev, | 1475 | struct task_struct *prev, |
1440 | struct rq_flags *rf); | 1476 | struct rq_flags *rf); |
1441 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1477 | void (*put_prev_task)(struct rq *rq, struct task_struct *p); |
1442 | 1478 | ||
1443 | #ifdef CONFIG_SMP | 1479 | #ifdef CONFIG_SMP |
1444 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1480 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
1445 | void (*migrate_task_rq)(struct task_struct *p); | 1481 | void (*migrate_task_rq)(struct task_struct *p); |
1446 | 1482 | ||
1447 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1483 | void (*task_woken)(struct rq *this_rq, struct task_struct *task); |
1448 | 1484 | ||
1449 | void (*set_cpus_allowed)(struct task_struct *p, | 1485 | void (*set_cpus_allowed)(struct task_struct *p, |
1450 | const struct cpumask *newmask); | 1486 | const struct cpumask *newmask); |
@@ -1453,31 +1489,31 @@ struct sched_class { | |||
1453 | void (*rq_offline)(struct rq *rq); | 1489 | void (*rq_offline)(struct rq *rq); |
1454 | #endif | 1490 | #endif |
1455 | 1491 | ||
1456 | void (*set_curr_task) (struct rq *rq); | 1492 | void (*set_curr_task)(struct rq *rq); |
1457 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1493 | void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); |
1458 | void (*task_fork) (struct task_struct *p); | 1494 | void (*task_fork)(struct task_struct *p); |
1459 | void (*task_dead) (struct task_struct *p); | 1495 | void (*task_dead)(struct task_struct *p); |
1460 | 1496 | ||
1461 | /* | 1497 | /* |
1462 | * The switched_from() call is allowed to drop rq->lock, therefore we | 1498 | * The switched_from() call is allowed to drop rq->lock, therefore we |
1463 | * cannot assume the switched_from/switched_to pair is serialized by | 1499 | * cannot assume the switched_from/switched_to pair is serialized by |

1464 | * rq->lock. They are however serialized by p->pi_lock. | 1500 | * rq->lock. They are however serialized by p->pi_lock. |
1465 | */ | 1501 | */ |
1466 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1502 | void (*switched_from)(struct rq *this_rq, struct task_struct *task); |
1467 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1503 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
1468 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1504 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
1469 | int oldprio); | 1505 | int oldprio); |
1470 | 1506 | ||
1471 | unsigned int (*get_rr_interval) (struct rq *rq, | 1507 | unsigned int (*get_rr_interval)(struct rq *rq, |
1472 | struct task_struct *task); | 1508 | struct task_struct *task); |
1473 | 1509 | ||
1474 | void (*update_curr) (struct rq *rq); | 1510 | void (*update_curr)(struct rq *rq); |
1475 | 1511 | ||
1476 | #define TASK_SET_GROUP 0 | 1512 | #define TASK_SET_GROUP 0 |
1477 | #define TASK_MOVE_GROUP 1 | 1513 | #define TASK_MOVE_GROUP 1 |
1478 | 1514 | ||
1479 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1515 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1480 | void (*task_change_group) (struct task_struct *p, int type); | 1516 | void (*task_change_group)(struct task_struct *p, int type); |
1481 | #endif | 1517 | #endif |
1482 | }; | 1518 | }; |
1483 | 1519 | ||
@@ -1526,6 +1562,7 @@ static inline void idle_set_state(struct rq *rq, | |||
1526 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1562 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
1527 | { | 1563 | { |
1528 | SCHED_WARN_ON(!rcu_read_lock_held()); | 1564 | SCHED_WARN_ON(!rcu_read_lock_held()); |
1565 | |||
1529 | return rq->idle_state; | 1566 | return rq->idle_state; |
1530 | } | 1567 | } |
1531 | #else | 1568 | #else |
@@ -1564,9 +1601,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | |||
1564 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); | 1601 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); |
1565 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); | 1602 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); |
1566 | 1603 | ||
1567 | #define BW_SHIFT 20 | 1604 | #define BW_SHIFT 20 |
1568 | #define BW_UNIT (1 << BW_SHIFT) | 1605 | #define BW_UNIT (1 << BW_SHIFT) |
1569 | #define RATIO_SHIFT 8 | 1606 | #define RATIO_SHIFT 8 |
1570 | unsigned long to_ratio(u64 period, u64 runtime); | 1607 | unsigned long to_ratio(u64 period, u64 runtime); |
1571 | 1608 | ||
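A plausible reconstruction of the fixed-point conversion to_ratio() performs with these shifts; the arithmetic below is inferred from the names and scales, not copied from the kernel implementation:

/* Hedged sketch: runtime/period expressed in BW_SHIFT fixed point. */
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT	20
#define BW_UNIT		(1ULL << BW_SHIFT)

/* Assumed shape of to_ratio(): scale runtime by BW_UNIT, divide by period. */
static unsigned long to_ratio(uint64_t period, uint64_t runtime)
{
	if (period == 0)	/* guard only; the kernel's handling may differ */
		return 0;
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* 10ms runtime every 100ms period -> about 10% of one CPU */
	unsigned long bw = to_ratio(100 * 1000 * 1000ULL, 10 * 1000 * 1000ULL);

	printf("bandwidth: %.2f%% (raw %lu)\n", 100.0 * bw / BW_UNIT, bw);
	return 0;
}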
1572 | extern void init_entity_runnable_average(struct sched_entity *se); | 1609 | extern void init_entity_runnable_average(struct sched_entity *se); |
@@ -1574,6 +1611,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); | |||
1574 | 1611 | ||
1575 | #ifdef CONFIG_NO_HZ_FULL | 1612 | #ifdef CONFIG_NO_HZ_FULL |
1576 | extern bool sched_can_stop_tick(struct rq *rq); | 1613 | extern bool sched_can_stop_tick(struct rq *rq); |
1614 | extern int __init sched_tick_offload_init(void); | ||
1577 | 1615 | ||
1578 | /* | 1616 | /* |
1579 | * Tick may be needed by tasks in the runqueue depending on their policy and | 1617 | * Tick may be needed by tasks in the runqueue depending on their policy and |
@@ -1598,6 +1636,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) | |||
1598 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); | 1636 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); |
1599 | } | 1637 | } |
1600 | #else | 1638 | #else |
1639 | static inline int sched_tick_offload_init(void) { return 0; } | ||
1601 | static inline void sched_update_tick_dependency(struct rq *rq) { } | 1640 | static inline void sched_update_tick_dependency(struct rq *rq) { } |
1602 | #endif | 1641 | #endif |
1603 | 1642 | ||
@@ -1624,13 +1663,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) | |||
1624 | sched_update_tick_dependency(rq); | 1663 | sched_update_tick_dependency(rq); |
1625 | } | 1664 | } |
1626 | 1665 | ||
1627 | static inline void rq_last_tick_reset(struct rq *rq) | ||
1628 | { | ||
1629 | #ifdef CONFIG_NO_HZ_FULL | ||
1630 | rq->last_sched_tick = jiffies; | ||
1631 | #endif | ||
1632 | } | ||
1633 | |||
1634 | extern void update_rq_clock(struct rq *rq); | 1666 | extern void update_rq_clock(struct rq *rq); |
1635 | 1667 | ||
1636 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | 1668 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); |
@@ -1821,8 +1853,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1821 | /* | 1853 | /* |
1822 | * Unfair double_lock_balance: Optimizes throughput at the expense of | 1854 | * Unfair double_lock_balance: Optimizes throughput at the expense of |
1823 | * latency by eliminating extra atomic operations when the locks are | 1855 | * latency by eliminating extra atomic operations when the locks are |
1824 | * already in proper order on entry. This favors lower cpu-ids and will | 1856 | * already in proper order on entry. This favors lower CPU-ids and will |
1825 | * grant the double lock to lower cpus over higher ids under contention, | 1857 | * grant the double lock to lower CPUs over higher ids under contention, |
1826 | * regardless of entry order into the function. | 1858 | * regardless of entry order into the function. |
1827 | */ | 1859 | */ |
1828 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1860 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) |
@@ -1854,7 +1886,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1854 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1886 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1855 | { | 1887 | { |
1856 | if (unlikely(!irqs_disabled())) { | 1888 | if (unlikely(!irqs_disabled())) { |
1857 | /* printk() doesn't work good under rq->lock */ | 1889 | /* printk() doesn't work well under rq->lock */ |
1858 | raw_spin_unlock(&this_rq->lock); | 1890 | raw_spin_unlock(&this_rq->lock); |
1859 | BUG_ON(1); | 1891 | BUG_ON(1); |
1860 | } | 1892 | } |
@@ -2005,16 +2037,19 @@ extern void cfs_bandwidth_usage_inc(void); | |||
2005 | extern void cfs_bandwidth_usage_dec(void); | 2037 | extern void cfs_bandwidth_usage_dec(void); |
2006 | 2038 | ||
2007 | #ifdef CONFIG_NO_HZ_COMMON | 2039 | #ifdef CONFIG_NO_HZ_COMMON |
2008 | enum rq_nohz_flag_bits { | 2040 | #define NOHZ_BALANCE_KICK_BIT 0 |
2009 | NOHZ_TICK_STOPPED, | 2041 | #define NOHZ_STATS_KICK_BIT 1 |
2010 | NOHZ_BALANCE_KICK, | 2042 | |
2011 | }; | 2043 | #define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) |
2044 | #define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) | ||
2045 | |||
2046 | #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) | ||
2012 | 2047 | ||
2013 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 2048 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
2014 | 2049 | ||
2015 | extern void nohz_balance_exit_idle(unsigned int cpu); | 2050 | extern void nohz_balance_exit_idle(struct rq *rq); |
2016 | #else | 2051 | #else |
2017 | static inline void nohz_balance_exit_idle(unsigned int cpu) { } | 2052 | static inline void nohz_balance_exit_idle(struct rq *rq) { } |
2018 | #endif | 2053 | #endif |
2019 | 2054 | ||
2020 | 2055 | ||
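To illustrate the new bit/mask layout introduced above, here is a minimal user-space sketch that mirrors the BIT() encoding and the kind of atomic set/consume a kick path would perform; the variable names are invented for the example and the atomics are C11, not the kernel's primitives:

/* Hedged sketch of the NOHZ kick flag encoding shown above. */
#include <stdatomic.h>
#include <stdio.h>

#define BIT(nr)			(1UL << (nr))

#define NOHZ_BALANCE_KICK_BIT	0
#define NOHZ_STATS_KICK_BIT	1
#define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
#define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
#define NOHZ_KICK_MASK		(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)

static atomic_ulong nohz_flags;		/* stand-in for rq->nohz_flags */

int main(void)
{
	/* A "kick" requests both a balance pass and a stats update. */
	atomic_fetch_or(&nohz_flags, NOHZ_KICK_MASK);

	unsigned long flags = atomic_exchange(&nohz_flags, 0);

	printf("balance requested: %d\n", !!(flags & NOHZ_BALANCE_KICK));
	printf("stats requested:   %d\n", !!(flags & NOHZ_STATS_KICK));
	return 0;
}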
@@ -2113,15 +2148,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | |||
2113 | #endif /* CONFIG_CPU_FREQ */ | 2148 | #endif /* CONFIG_CPU_FREQ */ |
2114 | 2149 | ||
2115 | #ifdef arch_scale_freq_capacity | 2150 | #ifdef arch_scale_freq_capacity |
2116 | #ifndef arch_scale_freq_invariant | 2151 | # ifndef arch_scale_freq_invariant |
2117 | #define arch_scale_freq_invariant() (true) | 2152 | # define arch_scale_freq_invariant() true |
2118 | #endif | 2153 | # endif |
2119 | #else /* arch_scale_freq_capacity */ | 2154 | #else |
2120 | #define arch_scale_freq_invariant() (false) | 2155 | # define arch_scale_freq_invariant() false |
2121 | #endif | 2156 | #endif |
2122 | 2157 | ||
2123 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | 2158 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
2124 | |||
2125 | static inline unsigned long cpu_util_dl(struct rq *rq) | 2159 | static inline unsigned long cpu_util_dl(struct rq *rq) |
2126 | { | 2160 | { |
2127 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; | 2161 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; |
@@ -2129,7 +2163,13 @@ static inline unsigned long cpu_util_dl(struct rq *rq) | |||
2129 | 2163 | ||
2130 | static inline unsigned long cpu_util_cfs(struct rq *rq) | 2164 | static inline unsigned long cpu_util_cfs(struct rq *rq) |
2131 | { | 2165 | { |
2132 | return rq->cfs.avg.util_avg; | 2166 | unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); |
2133 | } | 2167 | |
2168 | if (sched_feat(UTIL_EST)) { | ||
2169 | util = max_t(unsigned long, util, | ||
2170 | READ_ONCE(rq->cfs.avg.util_est.enqueued)); | ||
2171 | } | ||
2134 | 2172 | ||
2173 | return util; | ||
2174 | } | ||
2135 | #endif | 2175 | #endif |
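As a rough illustration of what cpu_util_dl() and the reworked cpu_util_cfs() compute, the following stand-alone sketch redoes the arithmetic with sample numbers; SCHED_CAPACITY_SCALE = 1024 and BW_SHIFT = 20 follow the kernel's conventions, while the utilization figures are invented:

/* Hedged sketch of the cpu_util_dl()/cpu_util_cfs() arithmetic above. */
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT		20
#define SCHED_CAPACITY_SCALE	1024UL	/* assumed: 1 << 10 */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

int main(void)
{
	uint64_t running_bw = (1ULL << BW_SHIFT) / 10;	/* DL tasks use ~10% (made up) */
	unsigned long util_avg = 300;			/* PELT average (made up)      */
	unsigned long util_est_enqueued = 420;		/* util_est snapshot (made up) */

	unsigned long util_dl  = (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
	/* With UTIL_EST enabled, CFS utilization is the max of the two. */
	unsigned long util_cfs = max_ul(util_avg, util_est_enqueued);

	printf("cpu_util_dl:  %lu / %lu\n", util_dl, SCHED_CAPACITY_SCALE);
	printf("cpu_util_cfs: %lu / %lu\n", util_cfs, SCHED_CAPACITY_SCALE);
	return 0;
}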
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa1d2ce..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
@@ -1,14 +1,13 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | 2 | /* | |
3 | #include <linux/slab.h> | 3 | * /proc/schedstat implementation |
4 | #include <linux/fs.h> | 4 | */ |
5 | #include <linux/seq_file.h> | ||
6 | #include <linux/proc_fs.h> | ||
7 | |||
8 | #include "sched.h" | 5 | #include "sched.h" |
9 | 6 | ||
10 | /* | 7 | /* |
11 | * bump this up when changing the output format or the meaning of an existing | 8 | * Current schedstat API version. |
9 | * | ||
10 | * Bump this up when changing the output format or the meaning of an existing | ||
12 | * format, so that tools can adapt (or abort) | 11 | * format, so that tools can adapt (or abort) |
13 | */ | 12 | */ |
14 | #define SCHEDSTAT_VERSION 15 | 13 | #define SCHEDSTAT_VERSION 15 |
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
78 | * This iterator needs some explanation. | 77 | * This iterator needs some explanation. |
79 | * It returns 1 for the header position. | 78 | * It returns 1 for the header position. |
80 | * This means 2 is cpu 0. | 79 | * This means 2 is cpu 0. |
81 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 80 | * In a hotplugged system some CPUs, including cpu 0, may be missing so we have |
82 | * to use cpumask_* to iterate over the cpus. | 81 | * to use cpumask_* to iterate over the CPUs. |
83 | */ | 82 | */ |
84 | static void *schedstat_start(struct seq_file *file, loff_t *offset) | 83 | static void *schedstat_start(struct seq_file *file, loff_t *offset) |
85 | { | 84 | { |
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) | |||
99 | 98 | ||
100 | if (n < nr_cpu_ids) | 99 | if (n < nr_cpu_ids) |
101 | return (void *)(unsigned long)(n + 2); | 100 | return (void *)(unsigned long)(n + 2); |
101 | |||
102 | return NULL; | 102 | return NULL; |
103 | } | 103 | } |
104 | 104 | ||
105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) | 105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) |
106 | { | 106 | { |
107 | (*offset)++; | 107 | (*offset)++; |
108 | |||
108 | return schedstat_start(file, offset); | 109 | return schedstat_start(file, offset); |
109 | } | 110 | } |
110 | 111 | ||
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { | |||
134 | static int __init proc_schedstat_init(void) | 135 | static int __init proc_schedstat_init(void) |
135 | { | 136 | { |
136 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | 137 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); |
138 | |||
137 | return 0; | 139 | return 0; |
138 | } | 140 | } |
139 | subsys_initcall(proc_schedstat_init); | 141 | subsys_initcall(proc_schedstat_init); |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8e7b58de61e7..8aea199a39b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
30 | if (rq) | 30 | if (rq) |
31 | rq->rq_sched_info.run_delay += delta; | 31 | rq->rq_sched_info.run_delay += delta; |
32 | } | 32 | } |
33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
34 | #define __schedstat_inc(var) do { var++; } while (0) | 34 | #define __schedstat_inc(var) do { var++; } while (0) |
35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) | 35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) | 36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) |
37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) | 37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
38 | #define __schedstat_set(var, val) do { var = (val); } while (0) | 38 | #define __schedstat_set(var, val) do { var = (val); } while (0) |
39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
40 | #define schedstat_val(var) (var) | 40 | #define schedstat_val(var) (var) |
41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | 41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) |
42 | 42 | ||
43 | #else /* !CONFIG_SCHEDSTATS */ | 43 | #else /* !CONFIG_SCHEDSTATS: */ |
44 | static inline void | 44 | static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } |
45 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 45 | static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } |
46 | {} | 46 | static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } |
47 | static inline void | 47 | # define schedstat_enabled() 0 |
48 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | 48 | # define __schedstat_inc(var) do { } while (0) |
49 | {} | 49 | # define schedstat_inc(var) do { } while (0) |
50 | static inline void | 50 | # define __schedstat_add(var, amt) do { } while (0) |
51 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 51 | # define schedstat_add(var, amt) do { } while (0) |
52 | {} | 52 | # define __schedstat_set(var, val) do { } while (0) |
53 | #define schedstat_enabled() 0 | 53 | # define schedstat_set(var, val) do { } while (0) |
54 | #define __schedstat_inc(var) do { } while (0) | 54 | # define schedstat_val(var) 0 |
55 | #define schedstat_inc(var) do { } while (0) | 55 | # define schedstat_val_or_zero(var) 0 |
56 | #define __schedstat_add(var, amt) do { } while (0) | ||
57 | #define schedstat_add(var, amt) do { } while (0) | ||
58 | #define __schedstat_set(var, val) do { } while (0) | ||
59 | #define schedstat_set(var, val) do { } while (0) | ||
60 | #define schedstat_val(var) 0 | ||
61 | #define schedstat_val_or_zero(var) 0 | ||
62 | #endif /* CONFIG_SCHEDSTATS */ | 56 | #endif /* CONFIG_SCHEDSTATS */ |
63 | 57 | ||
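The pattern above — a cheap runtime check in front of every stats update, with the whole block compiling away when CONFIG_SCHEDSTATS is off — can be mimicked in user space as follows; the static-key machinery is approximated by a plain boolean, which is not the kernel mechanism:

/* Hedged sketch of the schedstat_inc()-style conditional accounting. */
#include <stdbool.h>
#include <stdio.h>

static bool schedstats_enabled;		/* stand-in for the static branch */

#define schedstat_enabled()	(schedstats_enabled)
#define __schedstat_inc(var)	do { (var)++; } while (0)
#define schedstat_inc(var)	do { if (schedstat_enabled()) { (var)++; } } while (0)

int main(void)
{
	unsigned int yld_count = 0;

	schedstat_inc(yld_count);	/* ignored: stats disabled */
	schedstats_enabled = true;
	schedstat_inc(yld_count);	/* counted */

	printf("yld_count = %u\n", yld_count);
	return 0;
}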
64 | #ifdef CONFIG_SCHED_INFO | 58 | #ifdef CONFIG_SCHED_INFO |
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
69 | 63 | ||
70 | /* | 64 | /* |
71 | * We are interested in knowing how long it was from the *first* time a | 65 | * We are interested in knowing how long it was from the *first* time a |
72 | * task was queued to the time that it finally hit a cpu, we call this routine | 66 | * task was queued to the time that it finally hit a CPU, we call this routine |
73 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 67 | * from dequeue_task() to account for possible rq->clock skew across CPUs. The |
74 | * delta taken on each cpu would annul the skew. | 68 | * delta taken on each CPU would annul the skew. |
75 | */ | 69 | */ |
76 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | 70 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
77 | { | 71 | { |
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | |||
87 | } | 81 | } |
88 | 82 | ||
89 | /* | 83 | /* |
90 | * Called when a task finally hits the cpu. We can now calculate how | 84 | * Called when a task finally hits the CPU. We can now calculate how |
91 | * long it was waiting to run. We also note when it began so that we | 85 | * long it was waiting to run. We also note when it began so that we |
92 | * can keep stats on how long its timeslice is. | 86 | * can keep stats on how long its timeslice is. |
93 | */ | 87 | */ |
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) | |||
112 | */ | 106 | */ |
113 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | 107 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
114 | { | 108 | { |
115 | if (unlikely(sched_info_on())) | 109 | if (unlikely(sched_info_on())) { |
116 | if (!t->sched_info.last_queued) | 110 | if (!t->sched_info.last_queued) |
117 | t->sched_info.last_queued = rq_clock(rq); | 111 | t->sched_info.last_queued = rq_clock(rq); |
112 | } | ||
118 | } | 113 | } |
119 | 114 | ||
120 | /* | 115 | /* |
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | |||
127 | */ | 122 | */ |
128 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | 123 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
129 | { | 124 | { |
130 | unsigned long long delta = rq_clock(rq) - | 125 | unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; |
131 | t->sched_info.last_arrival; | ||
132 | 126 | ||
133 | rq_sched_info_depart(rq, delta); | 127 | rq_sched_info_depart(rq, delta); |
134 | 128 | ||
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | |||
142 | * the idle task.) We are only called when prev != next. | 136 | * the idle task.) We are only called when prev != next. |
143 | */ | 137 | */ |
144 | static inline void | 138 | static inline void |
145 | __sched_info_switch(struct rq *rq, | 139 | __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
146 | struct task_struct *prev, struct task_struct *next) | ||
147 | { | 140 | { |
148 | /* | 141 | /* |
149 | * prev now departs the cpu. It's not interesting to record | 142 | * prev now departs the CPU. It's not interesting to record |
150 | * stats about how efficient we were at scheduling the idle | 143 | * stats about how efficient we were at scheduling the idle |
151 | * process, however. | 144 | * process, however. |
152 | */ | 145 | */ |
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, | |||
156 | if (next != rq->idle) | 149 | if (next != rq->idle) |
157 | sched_info_arrive(rq, next); | 150 | sched_info_arrive(rq, next); |
158 | } | 151 | } |
152 | |||
159 | static inline void | 153 | static inline void |
160 | sched_info_switch(struct rq *rq, | 154 | sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
161 | struct task_struct *prev, struct task_struct *next) | ||
162 | { | 155 | { |
163 | if (unlikely(sched_info_on())) | 156 | if (unlikely(sched_info_on())) |
164 | __sched_info_switch(rq, prev, next); | 157 | __sched_info_switch(rq, prev, next); |
165 | } | 158 | } |
166 | #else | 159 | |
167 | #define sched_info_queued(rq, t) do { } while (0) | 160 | #else /* !CONFIG_SCHED_INFO: */ |
168 | #define sched_info_reset_dequeued(t) do { } while (0) | 161 | # define sched_info_queued(rq, t) do { } while (0) |
169 | #define sched_info_dequeued(rq, t) do { } while (0) | 162 | # define sched_info_reset_dequeued(t) do { } while (0) |
170 | #define sched_info_depart(rq, t) do { } while (0) | 163 | # define sched_info_dequeued(rq, t) do { } while (0) |
171 | #define sched_info_arrive(rq, next) do { } while (0) | 164 | # define sched_info_depart(rq, t) do { } while (0) |
172 | #define sched_info_switch(rq, t, next) do { } while (0) | 165 | # define sched_info_arrive(rq, next) do { } while (0) |
166 | # define sched_info_switch(rq, t, next) do { } while (0) | ||
173 | #endif /* CONFIG_SCHED_INFO */ | 167 | #endif /* CONFIG_SCHED_INFO */ |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2146ff..c183b790ca54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -1,6 +1,4 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include "sched.h" | ||
3 | |||
4 | /* | 2 | /* |
5 | * stop-task scheduling class. | 3 | * stop-task scheduling class. |
6 | * | 4 | * |
@@ -9,6 +7,7 @@ | |||
9 | * | 7 | * |
10 | * See kernel/stop_machine.c | 8 | * See kernel/stop_machine.c |
11 | */ | 9 | */ |
10 | #include "sched.h" | ||
12 | 11 | ||
13 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
14 | static int | 13 | static int |
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
75 | cgroup_account_cputime(curr, delta_exec); | 74 | cgroup_account_cputime(curr, delta_exec); |
76 | } | 75 | } |
77 | 76 | ||
77 | /* | ||
78 | * scheduler tick hitting a task of our scheduling class. | ||
79 | * | ||
80 | * NOTE: This function can be called remotely by the tick offload that | ||
81 | * goes along full dynticks. Therefore no local assumption can be made | ||
82 | * and everything must be accessed through the @rq and @curr passed in | ||
83 | * parameters. | ||
84 | */ | ||
78 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | 85 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) |
79 | { | 86 | { |
80 | } | 87 | } |
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555341ed..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c | |||
@@ -1,6 +1,8 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/sched/signal.h> | 2 | /* |
3 | #include <linux/swait.h> | 3 | * <linux/swait.h> (simple wait queues) implementation: |
4 | */ | ||
5 | #include "sched.h" | ||
4 | 6 | ||
5 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | 7 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, |
6 | struct lock_class_key *key) | 8 | struct lock_class_key *key) |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 519b024f4e94..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
@@ -2,10 +2,6 @@ | |||
2 | /* | 2 | /* |
3 | * Scheduler topology setup/handling methods | 3 | * Scheduler topology setup/handling methods |
4 | */ | 4 | */ |
5 | #include <linux/sched.h> | ||
6 | #include <linux/mutex.h> | ||
7 | #include <linux/sched/isolation.h> | ||
8 | |||
9 | #include "sched.h" | 5 | #include "sched.h" |
10 | 6 | ||
11 | DEFINE_MUTEX(sched_domains_mutex); | 7 | DEFINE_MUTEX(sched_domains_mutex); |
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
41 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 37 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
42 | printk("does not load-balance\n"); | 38 | printk("does not load-balance\n"); |
43 | if (sd->parent) | 39 | if (sd->parent) |
44 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 40 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); |
45 | " has parent"); | ||
46 | return -1; | 41 | return -1; |
47 | } | 42 | } |
48 | 43 | ||
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
50 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | 45 | cpumask_pr_args(sched_domain_span(sd)), sd->name); |
51 | 46 | ||
52 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 47 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
53 | printk(KERN_ERR "ERROR: domain->span does not contain " | 48 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); |
54 | "CPU%d\n", cpu); | ||
55 | } | 49 | } |
56 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { | 50 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { |
57 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 51 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); |
58 | " CPU%d\n", cpu); | ||
59 | } | 52 | } |
60 | 53 | ||
61 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | 54 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
115 | 108 | ||
116 | if (sd->parent && | 109 | if (sd->parent && |
117 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | 110 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) |
118 | printk(KERN_ERR "ERROR: parent span is not a superset " | 111 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); |
119 | "of domain->span\n"); | ||
120 | return 0; | 112 | return 0; |
121 | } | 113 | } |
122 | 114 | ||
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) | |||
595 | * are not. | 587 | * are not. |
596 | * | 588 | * |
597 | * This leads to a few particularly weird cases where the sched_domain's are | 589 | * This leads to a few particularly weird cases where the sched_domain's are |
598 | * not of the same number for each cpu. Consider: | 590 | * not of the same number for each CPU. Consider: |
599 | * | 591 | * |
600 | * NUMA-2 0-3 0-3 | 592 | * NUMA-2 0-3 0-3 |
601 | * groups: {0-2},{1-3} {1-3},{0-2} | 593 | * groups: {0-2},{1-3} {1-3},{0-2} |
@@ -780,7 +772,7 @@ fail: | |||
780 | * ^ ^ ^ ^ | 772 | * ^ ^ ^ ^ |
781 | * `-' `-' | 773 | * `-' `-' |
782 | * | 774 | * |
783 | * The sched_domains are per-cpu and have a two way link (parent & child) and | 775 | * The sched_domains are per-CPU and have a two way link (parent & child) and |
784 | * denote the ever growing mask of CPUs belonging to that level of topology. | 776 | * denote the ever growing mask of CPUs belonging to that level of topology. |
785 | * | 777 | * |
786 | * Each sched_domain has a circular (double) linked list of sched_group's, each | 778 | * Each sched_domain has a circular (double) linked list of sched_group's, each |
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | |||
1021 | d->rd = alloc_rootdomain(); | 1013 | d->rd = alloc_rootdomain(); |
1022 | if (!d->rd) | 1014 | if (!d->rd) |
1023 | return sa_sd; | 1015 | return sa_sd; |
1016 | |||
1024 | return sa_rootdomain; | 1017 | return sa_rootdomain; |
1025 | } | 1018 | } |
1026 | 1019 | ||
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
1047 | } | 1040 | } |
1048 | 1041 | ||
1049 | #ifdef CONFIG_NUMA | 1042 | #ifdef CONFIG_NUMA |
1050 | static int sched_domains_numa_levels; | ||
1051 | enum numa_topology_type sched_numa_topology_type; | 1043 | enum numa_topology_type sched_numa_topology_type; |
1052 | static int *sched_domains_numa_distance; | 1044 | |
1053 | int sched_max_numa_distance; | 1045 | static int sched_domains_numa_levels; |
1054 | static struct cpumask ***sched_domains_numa_masks; | 1046 | static int sched_domains_curr_level; |
1055 | static int sched_domains_curr_level; | 1047 | |
1048 | int sched_max_numa_distance; | ||
1049 | static int *sched_domains_numa_distance; | ||
1050 | static struct cpumask ***sched_domains_numa_masks; | ||
1056 | #endif | 1051 | #endif |
1057 | 1052 | ||
1058 | /* | 1053 | /* |
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level; | |||
1074 | * SD_ASYM_PACKING - describes SMT quirks | 1069 | * SD_ASYM_PACKING - describes SMT quirks |
1075 | */ | 1070 | */ |
1076 | #define TOPOLOGY_SD_FLAGS \ | 1071 | #define TOPOLOGY_SD_FLAGS \ |
1077 | (SD_SHARE_CPUCAPACITY | \ | 1072 | (SD_SHARE_CPUCAPACITY | \ |
1078 | SD_SHARE_PKG_RESOURCES | \ | 1073 | SD_SHARE_PKG_RESOURCES | \ |
1079 | SD_NUMA | \ | 1074 | SD_NUMA | \ |
1080 | SD_ASYM_PACKING | \ | 1075 | SD_ASYM_PACKING | \ |
1081 | SD_ASYM_CPUCAPACITY | \ | 1076 | SD_ASYM_CPUCAPACITY | \ |
1082 | SD_SHARE_POWERDOMAIN) | 1077 | SD_SHARE_POWERDOMAIN) |
1083 | 1078 | ||
1084 | static struct sched_domain * | 1079 | static struct sched_domain * |
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve | |||
1628 | pr_err(" the %s domain not a subset of the %s domain\n", | 1623 | pr_err(" the %s domain not a subset of the %s domain\n", |
1629 | child->name, sd->name); | 1624 | child->name, sd->name); |
1630 | #endif | 1625 | #endif |
1631 | /* Fixup, ensure @sd has at least @child cpus. */ | 1626 | /* Fixup, ensure @sd has at least @child CPUs. */ |
1632 | cpumask_or(sched_domain_span(sd), | 1627 | cpumask_or(sched_domain_span(sd), |
1633 | sched_domain_span(sd), | 1628 | sched_domain_span(sd), |
1634 | sched_domain_span(child)); | 1629 | sched_domain_span(child)); |
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att | |||
1720 | ret = 0; | 1715 | ret = 0; |
1721 | error: | 1716 | error: |
1722 | __free_domain_allocs(&d, alloc_state, cpu_map); | 1717 | __free_domain_allocs(&d, alloc_state, cpu_map); |
1718 | |||
1723 | return ret; | 1719 | return ret; |
1724 | } | 1720 | } |
1725 | 1721 | ||
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
1824 | return 1; | 1820 | return 1; |
1825 | 1821 | ||
1826 | tmp = SD_ATTR_INIT; | 1822 | tmp = SD_ATTR_INIT; |
1823 | |||
1827 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | 1824 | return !memcmp(cur ? (cur + idx_cur) : &tmp, |
1828 | new ? (new + idx_new) : &tmp, | 1825 | new ? (new + idx_new) : &tmp, |
1829 | sizeof(struct sched_domain_attr)); | 1826 | sizeof(struct sched_domain_attr)); |
@@ -1929,4 +1926,3 @@ match2: | |||
1929 | 1926 | ||
1930 | mutex_unlock(&sched_domains_mutex); | 1927 | mutex_unlock(&sched_domains_mutex); |
1931 | } | 1928 | } |
1932 | |||
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7d6b78..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -3,14 +3,7 @@ | |||
3 | * | 3 | * |
4 | * (C) 2004 Nadia Yvette Chambers, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include "sched.h" |
7 | #include <linux/export.h> | ||
8 | #include <linux/sched/signal.h> | ||
9 | #include <linux/sched/debug.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/wait.h> | ||
12 | #include <linux/hash.h> | ||
13 | #include <linux/kthread.h> | ||
14 | 7 | ||
15 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) | 8 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) |
16 | { | 9 | { |
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, | |||
107 | break; | 100 | break; |
108 | } | 101 | } |
109 | } | 102 | } |
103 | |||
110 | return nr_exclusive; | 104 | return nr_exclusive; |
111 | } | 105 | } |
112 | 106 | ||
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
317 | spin_unlock(&wq->lock); | 311 | spin_unlock(&wq->lock); |
318 | schedule(); | 312 | schedule(); |
319 | spin_lock(&wq->lock); | 313 | spin_lock(&wq->lock); |
314 | |||
320 | return 0; | 315 | return 0; |
321 | } | 316 | } |
322 | EXPORT_SYMBOL(do_wait_intr); | 317 | EXPORT_SYMBOL(do_wait_intr); |
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
333 | spin_unlock_irq(&wq->lock); | 328 | spin_unlock_irq(&wq->lock); |
334 | schedule(); | 329 | schedule(); |
335 | spin_lock_irq(&wq->lock); | 330 | spin_lock_irq(&wq->lock); |
331 | |||
336 | return 0; | 332 | return 0; |
337 | } | 333 | } |
338 | EXPORT_SYMBOL(do_wait_intr_irq); | 334 | EXPORT_SYMBOL(do_wait_intr_irq); |
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i | |||
378 | 374 | ||
379 | if (ret) | 375 | if (ret) |
380 | list_del_init(&wq_entry->entry); | 376 | list_del_init(&wq_entry->entry); |
377 | |||
381 | return ret; | 378 | return ret; |
382 | } | 379 | } |
383 | EXPORT_SYMBOL(autoremove_wake_function); | 380 | EXPORT_SYMBOL(autoremove_wake_function); |
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3acd9260..4239c78f5cd3 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c | |||
@@ -1,10 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * The implementation of the wait_bit*() and related waiting APIs: | 2 | * The implementation of the wait_bit*() and related waiting APIs: |
3 | */ | 3 | */ |
4 | #include <linux/wait_bit.h> | 4 | #include "sched.h" |
5 | #include <linux/sched/signal.h> | ||
6 | #include <linux/sched/debug.h> | ||
7 | #include <linux/hash.h> | ||
8 | 5 | ||
9 | #define WAIT_TABLE_BITS 8 | 6 | #define WAIT_TABLE_BITS 8 |
10 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | 7 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) |
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync | |||
29 | wait_bit->key.bit_nr != key->bit_nr || | 26 | wait_bit->key.bit_nr != key->bit_nr || |
30 | test_bit(key->bit_nr, key->flags)) | 27 | test_bit(key->bit_nr, key->flags)) |
31 | return 0; | 28 | return 0; |
32 | else | 29 | |
33 | return autoremove_wake_function(wq_entry, mode, sync, key); | 30 | return autoremove_wake_function(wq_entry, mode, sync, key); |
34 | } | 31 | } |
35 | EXPORT_SYMBOL(wake_bit_function); | 32 | EXPORT_SYMBOL(wake_bit_function); |
36 | 33 | ||
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ | |||
50 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) | 47 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) |
51 | ret = (*action)(&wbq_entry->key, mode); | 48 | ret = (*action)(&wbq_entry->key, mode); |
52 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); | 49 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); |
50 | |||
53 | finish_wait(wq_head, &wbq_entry->wq_entry); | 51 | finish_wait(wq_head, &wbq_entry->wq_entry); |
52 | |||
54 | return ret; | 53 | return ret; |
55 | } | 54 | } |
56 | EXPORT_SYMBOL(__wait_on_bit); | 55 | EXPORT_SYMBOL(__wait_on_bit); |
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( | |||
73 | DEFINE_WAIT_BIT(wq_entry, word, bit); | 72 | DEFINE_WAIT_BIT(wq_entry, word, bit); |
74 | 73 | ||
75 | wq_entry.key.timeout = jiffies + timeout; | 74 | wq_entry.key.timeout = jiffies + timeout; |
75 | |||
76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); | 76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); |
77 | } | 77 | } |
78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | 78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); |
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | |||
120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) | 120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) |
121 | { | 121 | { |
122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | 122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); |
123 | |||
123 | if (waitqueue_active(wq_head)) | 124 | if (waitqueue_active(wq_head)) |
124 | __wake_up(wq_head, TASK_NORMAL, 1, &key); | 125 | __wake_up(wq_head, TASK_NORMAL, 1, &key); |
125 | } | 126 | } |
@@ -157,6 +158,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | |||
157 | { | 158 | { |
158 | if (BITS_PER_LONG == 64) { | 159 | if (BITS_PER_LONG == 64) { |
159 | unsigned long q = (unsigned long)p; | 160 | unsigned long q = (unsigned long)p; |
161 | |||
160 | return bit_waitqueue((void *)(q & ~1), q & 1); | 162 | return bit_waitqueue((void *)(q & ~1), q & 1); |
161 | } | 163 | } |
162 | return bit_waitqueue(p, 0); | 164 | return bit_waitqueue(p, 0); |
@@ -173,6 +175,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo | |||
173 | wait_bit->key.bit_nr != key->bit_nr || | 175 | wait_bit->key.bit_nr != key->bit_nr || |
174 | atomic_read(val) != 0) | 176 | atomic_read(val) != 0) |
175 | return 0; | 177 | return 0; |
178 | |||
176 | return autoremove_wake_function(wq_entry, mode, sync, key); | 179 | return autoremove_wake_function(wq_entry, mode, sync, key); |
177 | } | 180 | } |
178 | 181 | ||
@@ -196,6 +199,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en | |||
196 | ret = (*action)(val, mode); | 199 | ret = (*action)(val, mode); |
197 | } while (!ret && atomic_read(val) != 0); | 200 | } while (!ret && atomic_read(val) != 0); |
198 | finish_wait(wq_head, &wbq_entry->wq_entry); | 201 | finish_wait(wq_head, &wbq_entry->wq_entry); |
202 | |||
199 | return ret; | 203 | return ret; |
200 | } | 204 | } |
201 | 205 | ||
@@ -226,6 +230,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode) | |||
226 | schedule(); | 230 | schedule(); |
227 | if (signal_pending_state(mode, current)) | 231 | if (signal_pending_state(mode, current)) |
228 | return -EINTR; | 232 | return -EINTR; |
233 | |||
229 | return 0; | 234 | return 0; |
230 | } | 235 | } |
231 | EXPORT_SYMBOL(atomic_t_wait); | 236 | EXPORT_SYMBOL(atomic_t_wait); |
@@ -250,6 +255,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode) | |||
250 | schedule(); | 255 | schedule(); |
251 | if (signal_pending_state(mode, current)) | 256 | if (signal_pending_state(mode, current)) |
252 | return -EINTR; | 257 | return -EINTR; |
258 | |||
253 | return 0; | 259 | return 0; |
254 | } | 260 | } |
255 | EXPORT_SYMBOL(bit_wait); | 261 | EXPORT_SYMBOL(bit_wait); |
@@ -259,6 +265,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) | |||
259 | io_schedule(); | 265 | io_schedule(); |
260 | if (signal_pending_state(mode, current)) | 266 | if (signal_pending_state(mode, current)) |
261 | return -EINTR; | 267 | return -EINTR; |
268 | |||
262 | return 0; | 269 | return 0; |
263 | } | 270 | } |
264 | EXPORT_SYMBOL(bit_wait_io); | 271 | EXPORT_SYMBOL(bit_wait_io); |
@@ -266,11 +273,13 @@ EXPORT_SYMBOL(bit_wait_io); | |||
266 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) | 273 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) |
267 | { | 274 | { |
268 | unsigned long now = READ_ONCE(jiffies); | 275 | unsigned long now = READ_ONCE(jiffies); |
276 | |||
269 | if (time_after_eq(now, word->timeout)) | 277 | if (time_after_eq(now, word->timeout)) |
270 | return -EAGAIN; | 278 | return -EAGAIN; |
271 | schedule_timeout(word->timeout - now); | 279 | schedule_timeout(word->timeout - now); |
272 | if (signal_pending_state(mode, current)) | 280 | if (signal_pending_state(mode, current)) |
273 | return -EINTR; | 281 | return -EINTR; |
282 | |||
274 | return 0; | 283 | return 0; |
275 | } | 284 | } |
276 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | 285 | EXPORT_SYMBOL_GPL(bit_wait_timeout); |
@@ -278,11 +287,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); | |||
278 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) | 287 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) |
279 | { | 288 | { |
280 | unsigned long now = READ_ONCE(jiffies); | 289 | unsigned long now = READ_ONCE(jiffies); |
290 | |||
281 | if (time_after_eq(now, word->timeout)) | 291 | if (time_after_eq(now, word->timeout)) |
282 | return -EAGAIN; | 292 | return -EAGAIN; |
283 | io_schedule_timeout(word->timeout - now); | 293 | io_schedule_timeout(word->timeout - now); |
284 | if (signal_pending_state(mode, current)) | 294 | if (signal_pending_state(mode, current)) |
285 | return -EINTR; | 295 | return -EINTR; |
296 | |||
286 | return 0; | 297 | return 0; |
287 | } | 298 | } |
288 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | 299 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); |
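For context: the wait_bit.c hunks above only consolidate headers into "sched.h" and add whitespace around returns; the wait-bit API itself is unchanged. A minimal sketch of a typical consumer of that API follows, assuming a hypothetical MY_FLAG_BUSY bit in a hypothetical my_flags word — neither is part of this patch.

	#include <linux/wait_bit.h>
	#include <linux/bitops.h>

	#define MY_FLAG_BUSY	0			/* hypothetical flag bit */

	static unsigned long my_flags;			/* hypothetical flags word */

	static int my_wait_for_idle(void)
	{
		/* Sleep until MY_FLAG_BUSY is cleared; with TASK_INTERRUPTIBLE
		 * this returns -EINTR on a signal, 0 otherwise. */
		return wait_on_bit(&my_flags, MY_FLAG_BUSY, TASK_INTERRUPTIBLE);
	}

	static void my_mark_idle(void)
	{
		clear_bit(MY_FLAG_BUSY, &my_flags);
		smp_mb__after_atomic();			/* order clear vs. waitqueue check */
		wake_up_bit(&my_flags, MY_FLAG_BUSY);	/* wake waiters in my_wait_for_idle() */
	}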
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ccd3782da0bf..5d4a0342f934 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -463,11 +463,18 @@ static int __init setup_tick_nohz(char *str) | |||
463 | 463 | ||
464 | __setup("nohz=", setup_tick_nohz); | 464 | __setup("nohz=", setup_tick_nohz); |
465 | 465 | ||
466 | int tick_nohz_tick_stopped(void) | 466 | bool tick_nohz_tick_stopped(void) |
467 | { | 467 | { |
468 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | 468 | return __this_cpu_read(tick_cpu_sched.tick_stopped); |
469 | } | 469 | } |
470 | 470 | ||
471 | bool tick_nohz_tick_stopped_cpu(int cpu) | ||
472 | { | ||
473 | struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); | ||
474 | |||
475 | return ts->tick_stopped; | ||
476 | } | ||
477 | |||
471 | /** | 478 | /** |
472 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 479 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
473 | * | 480 | * |
@@ -723,12 +730,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
723 | delta = KTIME_MAX; | 730 | delta = KTIME_MAX; |
724 | } | 731 | } |
725 | 732 | ||
726 | #ifdef CONFIG_NO_HZ_FULL | ||
727 | /* Limit the tick delta to the maximum scheduler deferment */ | ||
728 | if (!ts->inidle) | ||
729 | delta = min(delta, scheduler_tick_max_deferment()); | ||
730 | #endif | ||
731 | |||
732 | /* Calculate the next expiry time */ | 733 | /* Calculate the next expiry time */ |
733 | if (delta < (KTIME_MAX - basemono)) | 734 | if (delta < (KTIME_MAX - basemono)) |
734 | expires = basemono + delta; | 735 | expires = basemono + delta; |
@@ -935,13 +936,6 @@ void tick_nohz_idle_enter(void) | |||
935 | struct tick_sched *ts; | 936 | struct tick_sched *ts; |
936 | 937 | ||
937 | lockdep_assert_irqs_enabled(); | 938 | lockdep_assert_irqs_enabled(); |
938 | /* | ||
939 | * Update the idle state in the scheduler domain hierarchy | ||
940 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
941 | * State will be updated to busy during the first busy tick after | ||
942 | * exiting idle. | ||
943 | */ | ||
944 | set_cpu_sd_state_idle(); | ||
945 | 939 | ||
946 | local_irq_disable(); | 940 | local_irq_disable(); |
947 | 941 | ||
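The tick-sched.c hunks above change tick_nohz_tick_stopped() to return bool and add a per-CPU variant, tick_nohz_tick_stopped_cpu(), so other CPUs can ask whether a remote CPU's tick is stopped. A hedged sketch of a possible caller is shown below; my_needs_remote_update() is hypothetical and is not the scheduler code that actually consumes the helper in this series.

	#include <linux/tick.h>

	/* Hypothetical: decide whether @cpu still takes periodic ticks and can
	 * refresh its own stats, or whether its tick is stopped and another
	 * CPU has to perform the update on its behalf. */
	static bool my_needs_remote_update(int cpu)
	{
		if (tick_nohz_tick_stopped_cpu(cpu))
			return true;	/* tick stopped: no local periodic updates */

		return false;		/* tick still running on @cpu */
	}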
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6ec6ba65127b..254e636a3d6b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void) | |||
5573 | int __init workqueue_init_early(void) | 5573 | int __init workqueue_init_early(void) |
5574 | { | 5574 | { |
5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; | 5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
5576 | int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; | ||
5576 | int i, cpu; | 5577 | int i, cpu; |
5577 | 5578 | ||
5578 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5579 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
5579 | 5580 | ||
5580 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); | 5581 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); |
5581 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); | 5582 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); |
5582 | 5583 | ||
5583 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5584 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
5584 | 5585 | ||
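The workqueue.c hunk above seeds wq_unbound_cpumask from the housekeeping CPUs selected by both HK_FLAG_DOMAIN and HK_FLAG_WQ, keeping unbound work off isolated CPUs. A minimal sketch of the same pattern in a hypothetical init path follows, assuming a subsystem that keeps its own unbound mask; my_unbound_mask and my_subsys_init_cpumask() are illustrative only.

	#include <linux/cpumask.h>
	#include <linux/sched/isolation.h>

	static struct cpumask my_unbound_mask;	/* hypothetical per-subsystem mask */

	static void __init my_subsys_init_cpumask(void)
	{
		/* Mirror workqueue_init_early() above: restrict unbound work to
		 * the housekeeping CPUs selected by HK_FLAG_DOMAIN | HK_FLAG_WQ. */
		cpumask_copy(&my_unbound_mask,
			     housekeeping_cpumask(HK_FLAG_DOMAIN | HK_FLAG_WQ));
	}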