author     Ingo Molnar <mingo@kernel.org>    2018-03-03 08:01:12 -0500
committer  Ingo Molnar <mingo@kernel.org>    2018-03-03 09:50:21 -0500
commit     97fb7a0a8944bd6d2c5634e1e0fa689a5c40bc22 (patch)
tree       4993de40ba9dc0cf76d2233b8292a771d8c41941
parent     c2e513821d5df5e772287f6d0c23fd17b7c2bb1a (diff)
sched: Clean up and harmonize the coding style of the scheduler code base
A good number of small style inconsistencies have accumulated in the scheduler core, so do a pass over them to harmonize all these details:

 - fix spelling in comments,
 - use curly braces for multi-line statements,
 - remove unnecessary parentheses from integer literals,
 - capitalize consistently,
 - remove stray newlines,
 - add comments where necessary,
 - remove invalid/unnecessary comments,
 - align structure definitions and other data types vertically,
 - add missing newlines for increased readability,
 - fix vertical tabulation where it's misaligned,
 - harmonize preprocessor conditional block labeling and vertical alignment,
 - remove line-breaks where they uglify the code,
 - add newline after local variable definitions,

No change in functionality:

  md5:
     1191fa0a890cfa8132156d2959d7e9e2  built-in.o.before.asm
     1191fa0a890cfa8132156d2959d7e9e2  built-in.o.after.asm

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
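For illustration only (this snippet is not part of the patch), here is a minimal sketch of the vertical-alignment and labeled-#endif conventions the cleanup applies, using a hypothetical example_stats structure and helper:

#include <linux/types.h>

/* Hypothetical structure; member types, names and comments aligned vertically: */
struct example_stats {
	u64			total_runtime;	/* Total accumulated runtime	*/
	unsigned int		nr_switches;	/* Number of context switches	*/
	int			cpu;		/* CPU this entry belongs to	*/
};

#ifdef CONFIG_SMP
static inline int example_stats_cpu(struct example_stats *s)
{
	return s->cpu;
}
#else
static inline int example_stats_cpu(struct example_stats *s)
{
	return 0;
}
#endif /* CONFIG_SMP */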
-rw-r--r--  kernel/sched/autogroup.c           12
-rw-r--r--  kernel/sched/autogroup.h            8
-rw-r--r--  kernel/sched/clock.c               22
-rw-r--r--  kernel/sched/core.c                 6
-rw-r--r--  kernel/sched/cpuacct.c             20
-rw-r--r--  kernel/sched/cpudeadline.c         13
-rw-r--r--  kernel/sched/cpudeadline.h         27
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  129
-rw-r--r--  kernel/sched/cpupri.c               9
-rw-r--r--  kernel/sched/cpupri.h              24
-rw-r--r--  kernel/sched/cputime.c             48
-rw-r--r--  kernel/sched/deadline.c            51
-rw-r--r--  kernel/sched/debug.c               88
-rw-r--r--  kernel/sched/fair.c               183
-rw-r--r--  kernel/sched/idle.c                 6
-rw-r--r--  kernel/sched/idle_task.c            3
-rw-r--r--  kernel/sched/isolation.c            2
-rw-r--r--  kernel/sched/loadavg.c             30
-rw-r--r--  kernel/sched/membarrier.c          18
-rw-r--r--  kernel/sched/rt.c                  25
-rw-r--r--  kernel/sched/sched.h              529
-rw-r--r--  kernel/sched/stats.c                7
-rw-r--r--  kernel/sched/stats.h               86
-rw-r--r--  kernel/sched/stop_task.c            3
-rw-r--r--  kernel/sched/swait.c                3
-rw-r--r--  kernel/sched/topology.c            42
-rw-r--r--  kernel/sched/wait.c                 4
-rw-r--r--  kernel/sched/wait_bit.c            18
28 files changed, 706 insertions, 710 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..ff1b7b647b86 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -168,18 +168,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
168 autogroup_kref_put(prev); 168 autogroup_kref_put(prev);
169} 169}
170 170
171/* Allocates GFP_KERNEL, cannot be called under any spinlock */ 171/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
172void sched_autogroup_create_attach(struct task_struct *p) 172void sched_autogroup_create_attach(struct task_struct *p)
173{ 173{
174 struct autogroup *ag = autogroup_create(); 174 struct autogroup *ag = autogroup_create();
175 175
176 autogroup_move_group(p, ag); 176 autogroup_move_group(p, ag);
177 /* drop extra reference added by autogroup_create() */ 177
178 /* Drop extra reference added by autogroup_create(): */
178 autogroup_kref_put(ag); 179 autogroup_kref_put(ag);
179} 180}
180EXPORT_SYMBOL(sched_autogroup_create_attach); 181EXPORT_SYMBOL(sched_autogroup_create_attach);
181 182
182/* Cannot be called under siglock. Currently has no users */ 183/* Cannot be called under siglock. Currently has no users: */
183void sched_autogroup_detach(struct task_struct *p) 184void sched_autogroup_detach(struct task_struct *p)
184{ 185{
185 autogroup_move_group(p, &autogroup_default); 186 autogroup_move_group(p, &autogroup_default);
@@ -202,7 +203,6 @@ static int __init setup_autogroup(char *str)
202 203
203 return 1; 204 return 1;
204} 205}
205
206__setup("noautogroup", setup_autogroup); 206__setup("noautogroup", setup_autogroup);
207 207
208#ifdef CONFIG_PROC_FS 208#ifdef CONFIG_PROC_FS
@@ -224,7 +224,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
224 if (nice < 0 && !can_nice(current, nice)) 224 if (nice < 0 && !can_nice(current, nice))
225 return -EPERM; 225 return -EPERM;
226 226
227 /* this is a heavy operation taking global locks.. */ 227 /* This is a heavy operation, taking global locks.. */
228 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) 228 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
229 return -EAGAIN; 229 return -EAGAIN;
230 230
@@ -267,4 +267,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 267
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 269}
270#endif /* CONFIG_SCHED_DEBUG */ 270#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..49e6ec9559cf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -7,9 +7,9 @@
7 7
8struct autogroup { 8struct autogroup {
9 /* 9 /*
10 * reference doesn't mean how many thread attach to this 10 * Reference doesn't mean how many threads attach to this
11 * autogroup now. It just stands for the number of task 11 * autogroup now. It just stands for the number of tasks
12 * could use this autogroup. 12 * which could use this autogroup.
13 */ 13 */
14 struct kref kref; 14 struct kref kref;
15 struct task_group *tg; 15 struct task_group *tg;
@@ -56,11 +56,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
56 return tg; 56 return tg;
57} 57}
58 58
59#ifdef CONFIG_SCHED_DEBUG
60static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 59static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
61{ 60{
62 return 0; 61 return 0;
63} 62}
64#endif
65 63
66#endif /* CONFIG_SCHED_AUTOGROUP */ 64#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..7da6bec8a2ff 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * sched_clock for unstable cpu clocks 2 * sched_clock() for unstable CPU clocks
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
5 * 5 *
@@ -11,7 +11,7 @@
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * 13 *
14 * What: 14 * What this file implements:
15 * 15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution 16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i) 17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current CPU.
30 * 30 *
31 * sched_clock_cpu(i) 31 * sched_clock_cpu(i)
32 * 32 *
33 * How: 33 * How it is implemented:
34 * 34 *
35 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the 36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -302,21 +302,21 @@ again:
302 * cmpxchg64 below only protects one readout. 302 * cmpxchg64 below only protects one readout.
303 * 303 *
304 * We must reread via sched_clock_local() in the retry case on 304 * We must reread via sched_clock_local() in the retry case on
305 * 32bit as an NMI could use sched_clock_local() via the 305 * 32-bit kernels as an NMI could use sched_clock_local() via the
306 * tracer and hit between the readout of 306 * tracer and hit between the readout of
307 * the low32bit and the high 32bit portion. 307 * the low 32-bit and the high 32-bit portion.
308 */ 308 */
309 this_clock = sched_clock_local(my_scd); 309 this_clock = sched_clock_local(my_scd);
310 /* 310 /*
311 * We must enforce atomic readout on 32bit, otherwise the 311 * We must enforce atomic readout on 32-bit, otherwise the
312 * update on the remote cpu can hit inbetween the readout of 312 * update on the remote CPU can hit inbetween the readout of
313 * the low32bit and the high 32bit portion. 313 * the low 32-bit and the high 32-bit portion.
314 */ 314 */
315 remote_clock = cmpxchg64(&scd->clock, 0, 0); 315 remote_clock = cmpxchg64(&scd->clock, 0, 0);
316#else 316#else
317 /* 317 /*
318 * On 64bit the read of [my]scd->clock is atomic versus the 318 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
319 * update, so we can avoid the above 32bit dance. 319 * update, so we can avoid the above 32-bit dance.
320 */ 320 */
321 sched_clock_local(my_scd); 321 sched_clock_local(my_scd);
322again: 322again:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8fff4f16c510..9427b59551c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -135,7 +135,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
135 * [L] ->on_rq 135 * [L] ->on_rq
136 * RELEASE (rq->lock) 136 * RELEASE (rq->lock)
137 * 137 *
138 * If we observe the old cpu in task_rq_lock, the acquire of 138 * If we observe the old CPU in task_rq_lock, the acquire of
139 * the old rq->lock will fully serialize against the stores. 139 * the old rq->lock will fully serialize against the stores.
140 * 140 *
141 * If we observe the new CPU in task_rq_lock, the acquire will 141 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -1457,7 +1457,7 @@ EXPORT_SYMBOL_GPL(kick_process);
1457 * 1457 *
1458 * - cpu_active must be a subset of cpu_online 1458 * - cpu_active must be a subset of cpu_online
1459 * 1459 *
1460 * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, 1460 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
1461 * see __set_cpus_allowed_ptr(). At this point the newly online 1461 * see __set_cpus_allowed_ptr(). At this point the newly online
1462 * CPU isn't yet part of the sched domains, and balancing will not 1462 * CPU isn't yet part of the sched domains, and balancing will not
1463 * see it. 1463 * see it.
@@ -3037,7 +3037,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3037 3037
3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3039 /* 3039 /*
3040 * 64-bit doesn't need locks to atomically read a 64bit value. 3040 * 64-bit doesn't need locks to atomically read a 64-bit value.
3041 * So we have a optimization chance when the task's delta_exec is 0. 3041 * So we have a optimization chance when the task's delta_exec is 0.
3042 * Reading ->on_cpu is racy, but this is ok. 3042 * Reading ->on_cpu is racy, but this is ok.
3043 * 3043 *
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..1abd325e733a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -18,7 +18,7 @@
18 * (balbir@in.ibm.com). 18 * (balbir@in.ibm.com).
19 */ 19 */
20 20
21/* Time spent by the tasks of the cpu accounting group executing in ... */ 21/* Time spent by the tasks of the CPU accounting group executing in ... */
22enum cpuacct_stat_index { 22enum cpuacct_stat_index {
23 CPUACCT_STAT_USER, /* ... user mode */ 23 CPUACCT_STAT_USER, /* ... user mode */
24 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 24 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -35,12 +35,12 @@ struct cpuacct_usage {
35 u64 usages[CPUACCT_STAT_NSTATS]; 35 u64 usages[CPUACCT_STAT_NSTATS];
36}; 36};
37 37
38/* track cpu usage of a group of tasks and its child groups */ 38/* track CPU usage of a group of tasks and its child groups */
39struct cpuacct { 39struct cpuacct {
40 struct cgroup_subsys_state css; 40 struct cgroup_subsys_state css;
41 /* cpuusage holds pointer to a u64-type object on every cpu */ 41 /* cpuusage holds pointer to a u64-type object on every CPU */
42 struct cpuacct_usage __percpu *cpuusage; 42 struct cpuacct_usage __percpu *cpuusage;
43 struct kernel_cpustat __percpu *cpustat; 43 struct kernel_cpustat __percpu *cpustat;
44}; 44};
45 45
46static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) 46static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +48,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
48 return css ? container_of(css, struct cpuacct, css) : NULL; 48 return css ? container_of(css, struct cpuacct, css) : NULL;
49} 49}
50 50
51/* return cpu accounting group to which this task belongs */ 51/* Return CPU accounting group to which this task belongs */
52static inline struct cpuacct *task_ca(struct task_struct *tsk) 52static inline struct cpuacct *task_ca(struct task_struct *tsk)
53{ 53{
54 return css_ca(task_css(tsk, cpuacct_cgrp_id)); 54 return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +65,7 @@ static struct cpuacct root_cpuacct = {
65 .cpuusage = &root_cpuacct_cpuusage, 65 .cpuusage = &root_cpuacct_cpuusage,
66}; 66};
67 67
68/* create a new cpu accounting group */ 68/* Create a new CPU accounting group */
69static struct cgroup_subsys_state * 69static struct cgroup_subsys_state *
70cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) 70cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
71{ 71{
@@ -96,7 +96,7 @@ out:
96 return ERR_PTR(-ENOMEM); 96 return ERR_PTR(-ENOMEM);
97} 97}
98 98
99/* destroy an existing cpu accounting group */ 99/* Destroy an existing CPU accounting group */
100static void cpuacct_css_free(struct cgroup_subsys_state *css) 100static void cpuacct_css_free(struct cgroup_subsys_state *css)
101{ 101{
102 struct cpuacct *ca = css_ca(css); 102 struct cpuacct *ca = css_ca(css);
@@ -162,7 +162,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
162#endif 162#endif
163} 163}
164 164
165/* return total cpu usage (in nanoseconds) of a group */ 165/* Return total CPU usage (in nanoseconds) of a group */
166static u64 __cpuusage_read(struct cgroup_subsys_state *css, 166static u64 __cpuusage_read(struct cgroup_subsys_state *css,
167 enum cpuacct_stat_index index) 167 enum cpuacct_stat_index index)
168{ 168{
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 6a9defebbb54..cb172b61d191 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,7 +10,6 @@
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13
14#include <linux/gfp.h> 13#include <linux/gfp.h>
15#include <linux/kernel.h> 14#include <linux/kernel.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -147,9 +146,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
147} 146}
148 147
149/* 148/*
150 * cpudl_clear - remove a cpu from the cpudl max-heap 149 * cpudl_clear - remove a CPU from the cpudl max-heap
151 * @cp: the cpudl max-heap context 150 * @cp: the cpudl max-heap context
152 * @cpu: the target cpu 151 * @cpu: the target CPU
153 * 152 *
154 * Notes: assumes cpu_rq(cpu)->lock is locked 153 * Notes: assumes cpu_rq(cpu)->lock is locked
155 * 154 *
@@ -188,8 +187,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
188/* 187/*
189 * cpudl_set - update the cpudl max-heap 188 * cpudl_set - update the cpudl max-heap
190 * @cp: the cpudl max-heap context 189 * @cp: the cpudl max-heap context
191 * @cpu: the target cpu 190 * @cpu: the target CPU
192 * @dl: the new earliest deadline for this cpu 191 * @dl: the new earliest deadline for this CPU
193 * 192 *
194 * Notes: assumes cpu_rq(cpu)->lock is locked 193 * Notes: assumes cpu_rq(cpu)->lock is locked
195 * 194 *
@@ -224,7 +223,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
224/* 223/*
225 * cpudl_set_freecpu - Set the cpudl.free_cpus 224 * cpudl_set_freecpu - Set the cpudl.free_cpus
226 * @cp: the cpudl max-heap context 225 * @cp: the cpudl max-heap context
227 * @cpu: rd attached cpu 226 * @cpu: rd attached CPU
228 */ 227 */
229void cpudl_set_freecpu(struct cpudl *cp, int cpu) 228void cpudl_set_freecpu(struct cpudl *cp, int cpu)
230{ 229{
@@ -234,7 +233,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
234/* 233/*
235 * cpudl_clear_freecpu - Clear the cpudl.free_cpus 234 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
236 * @cp: the cpudl max-heap context 235 * @cp: the cpudl max-heap context
237 * @cpu: rd attached cpu 236 * @cpu: rd attached CPU
238 */ 237 */
239void cpudl_clear_freecpu(struct cpudl *cp, int cpu) 238void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
240{ 239{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..c26e7a0e5a66 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,28 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUDL_H
3#define _LINUX_CPUDL_H
4
5#include <linux/sched.h> 2#include <linux/sched.h>
6#include <linux/sched/deadline.h> 3#include <linux/sched/deadline.h>
7 4
8#define IDX_INVALID -1 5#define IDX_INVALID -1
9 6
10struct cpudl_item { 7struct cpudl_item {
11 u64 dl; 8 u64 dl;
12 int cpu; 9 int cpu;
13 int idx; 10 int idx;
14}; 11};
15 12
16struct cpudl { 13struct cpudl {
17 raw_spinlock_t lock; 14 raw_spinlock_t lock;
18 int size; 15 int size;
19 cpumask_var_t free_cpus; 16 cpumask_var_t free_cpus;
20 struct cpudl_item *elements; 17 struct cpudl_item *elements;
21}; 18};
22 19
23
24#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
25int cpudl_find(struct cpudl *cp, struct task_struct *p, 21int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
26 struct cpumask *later_mask);
27void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 22void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
28void cpudl_clear(struct cpudl *cp, int cpu); 23void cpudl_clear(struct cpudl *cp, int cpu);
29int cpudl_init(struct cpudl *cp); 24int cpudl_init(struct cpudl *cp);
30void cpudl_set_freecpu(struct cpudl *cp, int cpu); 25void cpudl_set_freecpu(struct cpudl *cp, int cpu);
31void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 26void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
32void cpudl_cleanup(struct cpudl *cp); 27void cpudl_cleanup(struct cpudl *cp);
33#endif /* CONFIG_SMP */ 28#endif /* CONFIG_SMP */
34
35#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..0dad8160e00f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -20,52 +20,52 @@
20#include "sched.h" 20#include "sched.h"
21 21
22struct sugov_tunables { 22struct sugov_tunables {
23 struct gov_attr_set attr_set; 23 struct gov_attr_set attr_set;
24 unsigned int rate_limit_us; 24 unsigned int rate_limit_us;
25}; 25};
26 26
27struct sugov_policy { 27struct sugov_policy {
28 struct cpufreq_policy *policy; 28 struct cpufreq_policy *policy;
29 29
30 struct sugov_tunables *tunables; 30 struct sugov_tunables *tunables;
31 struct list_head tunables_hook; 31 struct list_head tunables_hook;
32 32
33 raw_spinlock_t update_lock; /* For shared policies */ 33 raw_spinlock_t update_lock; /* For shared policies */
34 u64 last_freq_update_time; 34 u64 last_freq_update_time;
35 s64 freq_update_delay_ns; 35 s64 freq_update_delay_ns;
36 unsigned int next_freq; 36 unsigned int next_freq;
37 unsigned int cached_raw_freq; 37 unsigned int cached_raw_freq;
38 38
39 /* The next fields are only needed if fast switch cannot be used. */ 39 /* The next fields are only needed if fast switch cannot be used: */
40 struct irq_work irq_work; 40 struct irq_work irq_work;
41 struct kthread_work work; 41 struct kthread_work work;
42 struct mutex work_lock; 42 struct mutex work_lock;
43 struct kthread_worker worker; 43 struct kthread_worker worker;
44 struct task_struct *thread; 44 struct task_struct *thread;
45 bool work_in_progress; 45 bool work_in_progress;
46 46
47 bool need_freq_update; 47 bool need_freq_update;
48}; 48};
49 49
50struct sugov_cpu { 50struct sugov_cpu {
51 struct update_util_data update_util; 51 struct update_util_data update_util;
52 struct sugov_policy *sg_policy; 52 struct sugov_policy *sg_policy;
53 unsigned int cpu; 53 unsigned int cpu;
54 54
55 bool iowait_boost_pending; 55 bool iowait_boost_pending;
56 unsigned int iowait_boost; 56 unsigned int iowait_boost;
57 unsigned int iowait_boost_max; 57 unsigned int iowait_boost_max;
58 u64 last_update; 58 u64 last_update;
59 59
60 /* The fields below are only needed when sharing a policy. */ 60 /* The fields below are only needed when sharing a policy: */
61 unsigned long util_cfs; 61 unsigned long util_cfs;
62 unsigned long util_dl; 62 unsigned long util_dl;
63 unsigned long max; 63 unsigned long max;
64 unsigned int flags; 64 unsigned int flags;
65 65
66 /* The field below is for single-CPU policies only. */ 66 /* The field below is for single-CPU policies only: */
67#ifdef CONFIG_NO_HZ_COMMON 67#ifdef CONFIG_NO_HZ_COMMON
68 unsigned long saved_idle_calls; 68 unsigned long saved_idle_calls;
69#endif 69#endif
70}; 70};
71 71
@@ -79,9 +79,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
79 79
80 /* 80 /*
81 * Since cpufreq_update_util() is called with rq->lock held for 81 * Since cpufreq_update_util() is called with rq->lock held for
82 * the @target_cpu, our per-cpu data is fully serialized. 82 * the @target_cpu, our per-CPU data is fully serialized.
83 * 83 *
84 * However, drivers cannot in general deal with cross-cpu 84 * However, drivers cannot in general deal with cross-CPU
85 * requests, so while get_next_freq() will work, our 85 * requests, so while get_next_freq() will work, our
86 * sugov_update_commit() call may not for the fast switching platforms. 86 * sugov_update_commit() call may not for the fast switching platforms.
87 * 87 *
@@ -111,6 +111,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
111 } 111 }
112 112
113 delta_ns = time - sg_policy->last_freq_update_time; 113 delta_ns = time - sg_policy->last_freq_update_time;
114
114 return delta_ns >= sg_policy->freq_update_delay_ns; 115 return delta_ns >= sg_policy->freq_update_delay_ns;
115} 116}
116 117
@@ -345,8 +346,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
345 return get_next_freq(sg_policy, util, max); 346 return get_next_freq(sg_policy, util, max);
346} 347}
347 348
348static void sugov_update_shared(struct update_util_data *hook, u64 time, 349static void
349 unsigned int flags) 350sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
350{ 351{
351 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 352 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
352 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 353 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -423,8 +424,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
423 return sprintf(buf, "%u\n", tunables->rate_limit_us); 424 return sprintf(buf, "%u\n", tunables->rate_limit_us);
424} 425}
425 426
426static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, 427static ssize_t
427 size_t count) 428rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
428{ 429{
429 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 430 struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
430 struct sugov_policy *sg_policy; 431 struct sugov_policy *sg_policy;
@@ -479,11 +480,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
479{ 480{
480 struct task_struct *thread; 481 struct task_struct *thread;
481 struct sched_attr attr = { 482 struct sched_attr attr = {
482 .size = sizeof(struct sched_attr), 483 .size = sizeof(struct sched_attr),
483 .sched_policy = SCHED_DEADLINE, 484 .sched_policy = SCHED_DEADLINE,
484 .sched_flags = SCHED_FLAG_SUGOV, 485 .sched_flags = SCHED_FLAG_SUGOV,
485 .sched_nice = 0, 486 .sched_nice = 0,
486 .sched_priority = 0, 487 .sched_priority = 0,
487 /* 488 /*
488 * Fake (unused) bandwidth; workaround to "fix" 489 * Fake (unused) bandwidth; workaround to "fix"
489 * priority inheritance. 490 * priority inheritance.
@@ -663,21 +664,21 @@ static int sugov_start(struct cpufreq_policy *policy)
663 struct sugov_policy *sg_policy = policy->governor_data; 664 struct sugov_policy *sg_policy = policy->governor_data;
664 unsigned int cpu; 665 unsigned int cpu;
665 666
666 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; 667 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
667 sg_policy->last_freq_update_time = 0; 668 sg_policy->last_freq_update_time = 0;
668 sg_policy->next_freq = UINT_MAX; 669 sg_policy->next_freq = UINT_MAX;
669 sg_policy->work_in_progress = false; 670 sg_policy->work_in_progress = false;
670 sg_policy->need_freq_update = false; 671 sg_policy->need_freq_update = false;
671 sg_policy->cached_raw_freq = 0; 672 sg_policy->cached_raw_freq = 0;
672 673
673 for_each_cpu(cpu, policy->cpus) { 674 for_each_cpu(cpu, policy->cpus) {
674 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); 675 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
675 676
676 memset(sg_cpu, 0, sizeof(*sg_cpu)); 677 memset(sg_cpu, 0, sizeof(*sg_cpu));
677 sg_cpu->cpu = cpu; 678 sg_cpu->cpu = cpu;
678 sg_cpu->sg_policy = sg_policy; 679 sg_cpu->sg_policy = sg_policy;
679 sg_cpu->flags = 0; 680 sg_cpu->flags = 0;
680 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; 681 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
681 } 682 }
682 683
683 for_each_cpu(cpu, policy->cpus) { 684 for_each_cpu(cpu, policy->cpus) {
@@ -721,14 +722,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
721} 722}
722 723
723static struct cpufreq_governor schedutil_gov = { 724static struct cpufreq_governor schedutil_gov = {
724 .name = "schedutil", 725 .name = "schedutil",
725 .owner = THIS_MODULE, 726 .owner = THIS_MODULE,
726 .dynamic_switching = true, 727 .dynamic_switching = true,
727 .init = sugov_init, 728 .init = sugov_init,
728 .exit = sugov_exit, 729 .exit = sugov_exit,
729 .start = sugov_start, 730 .start = sugov_start,
730 .stop = sugov_stop, 731 .stop = sugov_stop,
731 .limits = sugov_limits, 732 .limits = sugov_limits,
732}; 733};
733 734
734#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 735#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..f43e14ccb67d 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for CPUs
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,7 +26,6 @@
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29
30#include <linux/gfp.h> 29#include <linux/gfp.h>
31#include <linux/sched.h> 30#include <linux/sched.h>
32#include <linux/sched/rt.h> 31#include <linux/sched/rt.h>
@@ -128,9 +127,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
128} 127}
129 128
130/** 129/**
131 * cpupri_set - update the cpu priority setting 130 * cpupri_set - update the CPU priority setting
132 * @cp: The cpupri context 131 * @cp: The cpupri context
133 * @cpu: The target cpu 132 * @cpu: The target CPU
134 * @newpri: The priority (INVALID-RT99) to assign to this CPU 133 * @newpri: The priority (INVALID-RT99) to assign to this CPU
135 * 134 *
136 * Note: Assumes cpu_rq(cpu)->lock is locked 135 * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +150,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
151 return; 150 return;
152 151
153 /* 152 /*
154 * If the cpu was currently mapped to a different value, we 153 * If the CPU was currently mapped to a different value, we
155 * need to map it to the new value then remove the old value. 154 * need to map it to the new value then remove the old value.
156 * Note, we must add the new value first, otherwise we risk the 155 * Note, we must add the new value first, otherwise we risk the
157 * cpu being missed by the priority loop in cpupri_find. 156 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..141a06c914c6 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,26 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUPRI_H
3#define _LINUX_CPUPRI_H
4
5#include <linux/sched.h> 2#include <linux/sched.h>
6 3
7#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 4#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
8 5
9#define CPUPRI_INVALID -1 6#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 7#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1 8#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */ 9/* values 2-101 are RT priorities 0-99 */
13 10
14struct cpupri_vec { 11struct cpupri_vec {
15 atomic_t count; 12 atomic_t count;
16 cpumask_var_t mask; 13 cpumask_var_t mask;
17}; 14};
18 15
19struct cpupri { 16struct cpupri {
20 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 17 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
21 int *cpu_to_pri; 18 int *cpu_to_pri;
22}; 19};
23 20
24#ifdef CONFIG_SMP 21#ifdef CONFIG_SMP
25int cpupri_find(struct cpupri *cp, 22int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
26 struct task_struct *p, struct cpumask *lowest_mask);
27void cpupri_set(struct cpupri *cp, int cpu, int pri); 23void cpupri_set(struct cpupri *cp, int cpu, int pri);
28int cpupri_init(struct cpupri *cp); 24int cpupri_init(struct cpupri *cp);
29void cpupri_cleanup(struct cpupri *cp); 25void cpupri_cleanup(struct cpupri *cp);
30#endif 26#endif
31
32#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..d3b450b57ade 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -113,9 +113,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
113} 113}
114 114
115/* 115/*
116 * Account user cpu time to a process. 116 * Account user CPU time to a process.
117 * @p: the process that the cpu time gets accounted to 117 * @p: the process that the CPU time gets accounted to
118 * @cputime: the cpu time spent in user space since the last update 118 * @cputime: the CPU time spent in user space since the last update
119 */ 119 */
120void account_user_time(struct task_struct *p, u64 cputime) 120void account_user_time(struct task_struct *p, u64 cputime)
121{ 121{
@@ -135,9 +135,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
135} 135}
136 136
137/* 137/*
138 * Account guest cpu time to a process. 138 * Account guest CPU time to a process.
139 * @p: the process that the cpu time gets accounted to 139 * @p: the process that the CPU time gets accounted to
140 * @cputime: the cpu time spent in virtual machine since the last update 140 * @cputime: the CPU time spent in virtual machine since the last update
141 */ 141 */
142void account_guest_time(struct task_struct *p, u64 cputime) 142void account_guest_time(struct task_struct *p, u64 cputime)
143{ 143{
@@ -159,9 +159,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
159} 159}
160 160
161/* 161/*
162 * Account system cpu time to a process and desired cpustat field 162 * Account system CPU time to a process and desired cpustat field
163 * @p: the process that the cpu time gets accounted to 163 * @p: the process that the CPU time gets accounted to
164 * @cputime: the cpu time spent in kernel space since the last update 164 * @cputime: the CPU time spent in kernel space since the last update
165 * @index: pointer to cpustat field that has to be updated 165 * @index: pointer to cpustat field that has to be updated
166 */ 166 */
167void account_system_index_time(struct task_struct *p, 167void account_system_index_time(struct task_struct *p,
@@ -179,10 +179,10 @@ void account_system_index_time(struct task_struct *p,
179} 179}
180 180
181/* 181/*
182 * Account system cpu time to a process. 182 * Account system CPU time to a process.
183 * @p: the process that the cpu time gets accounted to 183 * @p: the process that the CPU time gets accounted to
184 * @hardirq_offset: the offset to subtract from hardirq_count() 184 * @hardirq_offset: the offset to subtract from hardirq_count()
185 * @cputime: the cpu time spent in kernel space since the last update 185 * @cputime: the CPU time spent in kernel space since the last update
186 */ 186 */
187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
188{ 188{
@@ -205,7 +205,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
205 205
206/* 206/*
207 * Account for involuntary wait time. 207 * Account for involuntary wait time.
208 * @cputime: the cpu time spent in involuntary wait 208 * @cputime: the CPU time spent in involuntary wait
209 */ 209 */
210void account_steal_time(u64 cputime) 210void account_steal_time(u64 cputime)
211{ 211{
@@ -216,7 +216,7 @@ void account_steal_time(u64 cputime)
216 216
217/* 217/*
218 * Account for idle time. 218 * Account for idle time.
219 * @cputime: the cpu time spent in idle wait 219 * @cputime: the CPU time spent in idle wait
220 */ 220 */
221void account_idle_time(u64 cputime) 221void account_idle_time(u64 cputime)
222{ 222{
@@ -338,7 +338,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
338#ifdef CONFIG_IRQ_TIME_ACCOUNTING 338#ifdef CONFIG_IRQ_TIME_ACCOUNTING
339/* 339/*
340 * Account a tick to a process and cpustat 340 * Account a tick to a process and cpustat
341 * @p: the process that the cpu time gets accounted to 341 * @p: the process that the CPU time gets accounted to
342 * @user_tick: is the tick from userspace 342 * @user_tick: is the tick from userspace
343 * @rq: the pointer to rq 343 * @rq: the pointer to rq
344 * 344 *
@@ -400,17 +400,16 @@ static void irqtime_account_idle_ticks(int ticks)
400 irqtime_account_process_tick(current, 0, rq, ticks); 400 irqtime_account_process_tick(current, 0, rq, ticks);
401} 401}
402#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 402#else /* CONFIG_IRQ_TIME_ACCOUNTING */
403static inline void irqtime_account_idle_ticks(int ticks) {} 403static inline void irqtime_account_idle_ticks(int ticks) { }
404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
405 struct rq *rq, int nr_ticks) {} 405 struct rq *rq, int nr_ticks) { }
406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
407 407
408/* 408/*
409 * Use precise platform statistics if available: 409 * Use precise platform statistics if available:
410 */ 410 */
411#ifdef CONFIG_VIRT_CPU_ACCOUNTING 411#ifdef CONFIG_VIRT_CPU_ACCOUNTING
412 412# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
413#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
414void vtime_common_task_switch(struct task_struct *prev) 413void vtime_common_task_switch(struct task_struct *prev)
415{ 414{
416 if (is_idle_task(prev)) 415 if (is_idle_task(prev))
@@ -421,8 +420,7 @@ void vtime_common_task_switch(struct task_struct *prev)
421 vtime_flush(prev); 420 vtime_flush(prev);
422 arch_vtime_task_switch(prev); 421 arch_vtime_task_switch(prev);
423} 422}
424#endif 423# endif
425
426#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 424#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
427 425
428 426
@@ -469,10 +467,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
469 *ut = cputime.utime; 467 *ut = cputime.utime;
470 *st = cputime.stime; 468 *st = cputime.stime;
471} 469}
472#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 470
471#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
472
473/* 473/*
474 * Account a single tick of cpu time. 474 * Account a single tick of CPU time.
475 * @p: the process that the cpu time gets accounted to 475 * @p: the process that the CPU time gets accounted to
476 * @user_tick: indicates if the tick is a user or a system tick 476 * @user_tick: indicates if the tick is a user or a system tick
477 */ 477 */
478void account_process_tick(struct task_struct *p, int user_tick) 478void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 65cd5ead1759..58f8b7b37983 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -539,12 +539,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
539 539
540 /* 540 /*
541 * If we cannot preempt any rq, fall back to pick any 541 * If we cannot preempt any rq, fall back to pick any
542 * online cpu. 542 * online CPU:
543 */ 543 */
544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
545 if (cpu >= nr_cpu_ids) { 545 if (cpu >= nr_cpu_ids) {
546 /* 546 /*
547 * Fail to find any suitable cpu. 547 * Failed to find any suitable CPU.
548 * The task will never come back! 548 * The task will never come back!
549 */ 549 */
550 BUG_ON(dl_bandwidth_enabled()); 550 BUG_ON(dl_bandwidth_enabled());
@@ -608,8 +608,7 @@ static inline void queue_pull_task(struct rq *rq)
608 608
609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
612 int flags);
613 612
614/* 613/*
615 * We are being explicitly informed that a new instance is starting, 614 * We are being explicitly informed that a new instance is starting,
@@ -1873,7 +1872,7 @@ static int find_later_rq(struct task_struct *task)
1873 1872
1874 /* 1873 /*
1875 * We have to consider system topology and task affinity 1874 * We have to consider system topology and task affinity
1876 * first, then we can look for a suitable cpu. 1875 * first, then we can look for a suitable CPU.
1877 */ 1876 */
1878 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) 1877 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
1879 return -1; 1878 return -1;
@@ -1887,7 +1886,7 @@ static int find_later_rq(struct task_struct *task)
1887 * Now we check how well this matches with task's 1886 * Now we check how well this matches with task's
1888 * affinity and system topology. 1887 * affinity and system topology.
1889 * 1888 *
1890 * The last cpu where the task run is our first 1889 * The last CPU where the task run is our first
1891 * guess, since it is most likely cache-hot there. 1890 * guess, since it is most likely cache-hot there.
1892 */ 1891 */
1893 if (cpumask_test_cpu(cpu, later_mask)) 1892 if (cpumask_test_cpu(cpu, later_mask))
@@ -1917,9 +1916,9 @@ static int find_later_rq(struct task_struct *task)
1917 best_cpu = cpumask_first_and(later_mask, 1916 best_cpu = cpumask_first_and(later_mask,
1918 sched_domain_span(sd)); 1917 sched_domain_span(sd));
1919 /* 1918 /*
1920 * Last chance: if a cpu being in both later_mask 1919 * Last chance: if a CPU being in both later_mask
1921 * and current sd span is valid, that becomes our 1920 * and current sd span is valid, that becomes our
1922 * choice. Of course, the latest possible cpu is 1921 * choice. Of course, the latest possible CPU is
1923 * already under consideration through later_mask. 1922 * already under consideration through later_mask.
1924 */ 1923 */
1925 if (best_cpu < nr_cpu_ids) { 1924 if (best_cpu < nr_cpu_ids) {
@@ -2075,7 +2074,7 @@ retry:
2075 if (task == next_task) { 2074 if (task == next_task) {
2076 /* 2075 /*
2077 * The task is still there. We don't try 2076 * The task is still there. We don't try
2078 * again, some other cpu will pull it when ready. 2077 * again, some other CPU will pull it when ready.
2079 */ 2078 */
2080 goto out; 2079 goto out;
2081 } 2080 }
@@ -2308,7 +2307,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
2308 /* 2307 /*
2309 * Since this might be the only -deadline task on the rq, 2308 * Since this might be the only -deadline task on the rq,
2310 * this is the right place to try to pull some other one 2309 * this is the right place to try to pull some other one
2311 * from an overloaded cpu, if any. 2310 * from an overloaded CPU, if any.
2312 */ 2311 */
2313 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) 2312 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2314 return; 2313 return;
@@ -2634,17 +2633,17 @@ void __dl_clear_params(struct task_struct *p)
2634{ 2633{
2635 struct sched_dl_entity *dl_se = &p->dl; 2634 struct sched_dl_entity *dl_se = &p->dl;
2636 2635
2637 dl_se->dl_runtime = 0; 2636 dl_se->dl_runtime = 0;
2638 dl_se->dl_deadline = 0; 2637 dl_se->dl_deadline = 0;
2639 dl_se->dl_period = 0; 2638 dl_se->dl_period = 0;
2640 dl_se->flags = 0; 2639 dl_se->flags = 0;
2641 dl_se->dl_bw = 0; 2640 dl_se->dl_bw = 0;
2642 dl_se->dl_density = 0; 2641 dl_se->dl_density = 0;
2643 2642
2644 dl_se->dl_throttled = 0; 2643 dl_se->dl_throttled = 0;
2645 dl_se->dl_yielded = 0; 2644 dl_se->dl_yielded = 0;
2646 dl_se->dl_non_contending = 0; 2645 dl_se->dl_non_contending = 0;
2647 dl_se->dl_overrun = 0; 2646 dl_se->dl_overrun = 0;
2648} 2647}
2649 2648
2650bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 2649bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2663,21 +2662,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2663#ifdef CONFIG_SMP 2662#ifdef CONFIG_SMP
2664int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) 2663int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2665{ 2664{
2666 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 2665 unsigned int dest_cpu;
2667 cs_cpus_allowed);
2668 struct dl_bw *dl_b; 2666 struct dl_bw *dl_b;
2669 bool overflow; 2667 bool overflow;
2670 int cpus, ret; 2668 int cpus, ret;
2671 unsigned long flags; 2669 unsigned long flags;
2672 2670
2671 dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
2672
2673 rcu_read_lock_sched(); 2673 rcu_read_lock_sched();
2674 dl_b = dl_bw_of(dest_cpu); 2674 dl_b = dl_bw_of(dest_cpu);
2675 raw_spin_lock_irqsave(&dl_b->lock, flags); 2675 raw_spin_lock_irqsave(&dl_b->lock, flags);
2676 cpus = dl_bw_cpus(dest_cpu); 2676 cpus = dl_bw_cpus(dest_cpu);
2677 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 2677 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2678 if (overflow) 2678 if (overflow) {
2679 ret = -EBUSY; 2679 ret = -EBUSY;
2680 else { 2680 } else {
2681 /* 2681 /*
2682 * We reserve space for this task in the destination 2682 * We reserve space for this task in the destination
2683 * root_domain, as we can't fail after this point. 2683 * root_domain, as we can't fail after this point.
@@ -2689,6 +2689,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
2689 } 2689 }
2690 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2690 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2691 rcu_read_unlock_sched(); 2691 rcu_read_unlock_sched();
2692
2692 return ret; 2693 return ret;
2693} 2694}
2694 2695
@@ -2709,6 +2710,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2709 ret = 0; 2710 ret = 0;
2710 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 2711 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2711 rcu_read_unlock_sched(); 2712 rcu_read_unlock_sched();
2713
2712 return ret; 2714 return ret;
2713} 2715}
2714 2716
@@ -2726,6 +2728,7 @@ bool dl_cpu_busy(unsigned int cpu)
2726 overflow = __dl_overflow(dl_b, cpus, 0, 0); 2728 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2727 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2729 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2728 rcu_read_unlock_sched(); 2730 rcu_read_unlock_sched();
2731
2729 return overflow; 2732 return overflow;
2730} 2733}
2731#endif 2734#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1ca0130ed4f9..7c82a9b88510 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -9,7 +9,6 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12
13#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
14#include <linux/sched/mm.h> 13#include <linux/sched/mm.h>
15#include <linux/sched/task.h> 14#include <linux/sched/task.h>
@@ -274,34 +273,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
274 if (table == NULL) 273 if (table == NULL)
275 return NULL; 274 return NULL;
276 275
277 set_table_entry(&table[0], "min_interval", &sd->min_interval, 276 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
278 sizeof(long), 0644, proc_doulongvec_minmax, false); 277 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
279 set_table_entry(&table[1], "max_interval", &sd->max_interval, 278 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
280 sizeof(long), 0644, proc_doulongvec_minmax, false); 279 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
281 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 280 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
282 sizeof(int), 0644, proc_dointvec_minmax, true); 281 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
283 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 282 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
284 sizeof(int), 0644, proc_dointvec_minmax, true); 283 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
285 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 284 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
286 sizeof(int), 0644, proc_dointvec_minmax, true); 285 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 286 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
288 sizeof(int), 0644, proc_dointvec_minmax, true); 287 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
289 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 288 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
290 sizeof(int), 0644, proc_dointvec_minmax, true);
291 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
292 sizeof(int), 0644, proc_dointvec_minmax, false);
293 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
294 sizeof(int), 0644, proc_dointvec_minmax, false);
295 set_table_entry(&table[9], "cache_nice_tries",
296 &sd->cache_nice_tries,
297 sizeof(int), 0644, proc_dointvec_minmax, false);
298 set_table_entry(&table[10], "flags", &sd->flags,
299 sizeof(int), 0644, proc_dointvec_minmax, false);
300 set_table_entry(&table[11], "max_newidle_lb_cost",
301 &sd->max_newidle_lb_cost,
302 sizeof(long), 0644, proc_doulongvec_minmax, false);
303 set_table_entry(&table[12], "name", sd->name,
304 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
305 /* &table[13] is terminator */ 289 /* &table[13] is terminator */
306 290
307 return table; 291 return table;
@@ -332,8 +316,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
332 return table; 316 return table;
333} 317}
334 318
335static cpumask_var_t sd_sysctl_cpus; 319static cpumask_var_t sd_sysctl_cpus;
336static struct ctl_table_header *sd_sysctl_header; 320static struct ctl_table_header *sd_sysctl_header;
337 321
338void register_sched_domain_sysctl(void) 322void register_sched_domain_sysctl(void)
339{ 323{
@@ -413,14 +397,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
413{ 397{
414 struct sched_entity *se = tg->se[cpu]; 398 struct sched_entity *se = tg->se[cpu];
415 399
416#define P(F) \ 400#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
417 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 401#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
418#define P_SCHEDSTAT(F) \ 402#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
419 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) 403#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
420#define PN(F) \
421 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
422#define PN_SCHEDSTAT(F) \
423 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
424 404
425 if (!se) 405 if (!se)
426 return; 406 return;
@@ -428,6 +408,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
428 PN(se->exec_start); 408 PN(se->exec_start);
429 PN(se->vruntime); 409 PN(se->vruntime);
430 PN(se->sum_exec_runtime); 410 PN(se->sum_exec_runtime);
411
431 if (schedstat_enabled()) { 412 if (schedstat_enabled()) {
432 PN_SCHEDSTAT(se->statistics.wait_start); 413 PN_SCHEDSTAT(se->statistics.wait_start);
433 PN_SCHEDSTAT(se->statistics.sleep_start); 414 PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +421,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
440 PN_SCHEDSTAT(se->statistics.wait_sum); 421 PN_SCHEDSTAT(se->statistics.wait_sum);
441 P_SCHEDSTAT(se->statistics.wait_count); 422 P_SCHEDSTAT(se->statistics.wait_count);
442 } 423 }
424
443 P(se->load.weight); 425 P(se->load.weight);
444 P(se->runnable_weight); 426 P(se->runnable_weight);
445#ifdef CONFIG_SMP 427#ifdef CONFIG_SMP
@@ -464,6 +446,7 @@ static char *task_group_path(struct task_group *tg)
464 return group_path; 446 return group_path;
465 447
466 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 448 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
449
467 return group_path; 450 return group_path;
468} 451}
469#endif 452#endif
@@ -799,9 +782,9 @@ void sysrq_sched_debug_show(void)
799/* 782/*
800 * This itererator needs some explanation. 783 * This itererator needs some explanation.
801 * It returns 1 for the header position. 784 * It returns 1 for the header position.
802 * This means 2 is cpu 0. 785 * This means 2 is CPU 0.
803 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 786 * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
804 * to use cpumask_* to iterate over the cpus. 787 * to use cpumask_* to iterate over the CPUs.
805 */ 788 */
806static void *sched_debug_start(struct seq_file *file, loff_t *offset) 789static void *sched_debug_start(struct seq_file *file, loff_t *offset)
807{ 790{
@@ -821,6 +804,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
821 804
822 if (n < nr_cpu_ids) 805 if (n < nr_cpu_ids)
823 return (void *)(unsigned long)(n + 2); 806 return (void *)(unsigned long)(n + 2);
807
824 return NULL; 808 return NULL;
825} 809}
826 810
@@ -835,10 +819,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
835} 819}
836 820
837static const struct seq_operations sched_debug_sops = { 821static const struct seq_operations sched_debug_sops = {
838 .start = sched_debug_start, 822 .start = sched_debug_start,
839 .next = sched_debug_next, 823 .next = sched_debug_next,
840 .stop = sched_debug_stop, 824 .stop = sched_debug_stop,
841 .show = sched_debug_show, 825 .show = sched_debug_show,
842}; 826};
843 827
844static int sched_debug_release(struct inode *inode, struct file *file) 828static int sched_debug_release(struct inode *inode, struct file *file)
@@ -876,14 +860,10 @@ static int __init init_sched_debug_procfs(void)
876 860
877__initcall(init_sched_debug_procfs); 861__initcall(init_sched_debug_procfs);
878 862
879#define __P(F) \ 863#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
880 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 864#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
881#define P(F) \ 865#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
882 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 866#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
883#define __PN(F) \
884 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
885#define PN(F) \
886 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
887 867
888 868
889#ifdef CONFIG_NUMA_BALANCING 869#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e1febd252a84..1f877de96c9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,6 @@
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */ 22 */
23
24#include <linux/sched/mm.h> 23#include <linux/sched/mm.h>
25#include <linux/sched/topology.h> 24#include <linux/sched/topology.h>
26 25
@@ -103,7 +102,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
103 102
104#ifdef CONFIG_SMP 103#ifdef CONFIG_SMP
105/* 104/*
106 * For asym packing, by default the lower numbered cpu has higher priority. 105 * For asym packing, by default the lower numbered CPU has higher priority.
107 */ 106 */
108int __weak arch_asym_cpu_priority(int cpu) 107int __weak arch_asym_cpu_priority(int cpu)
109{ 108{
@@ -1181,7 +1180,7 @@ pid_t task_numa_group_id(struct task_struct *p)
1181} 1180}
1182 1181
1183/* 1182/*
1184 * The averaged statistics, shared & private, memory & cpu, 1183 * The averaged statistics, shared & private, memory & CPU,
1185 * occupy the first half of the array. The second half of the 1184 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the 1185 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement. 1186 * first set by task_numa_placement.
@@ -1587,7 +1586,7 @@ static void task_numa_compare(struct task_numa_env *env,
1587 * be incurred if the tasks were swapped. 1586 * be incurred if the tasks were swapped.
1588 */ 1587 */
1589 if (cur) { 1588 if (cur) {
1590 /* Skip this swap candidate if cannot move to the source cpu */ 1589 /* Skip this swap candidate if cannot move to the source CPU: */
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1590 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock; 1591 goto unlock;
1593 1592
@@ -1631,7 +1630,7 @@ static void task_numa_compare(struct task_numa_env *env,
1631 goto balance; 1630 goto balance;
1632 } 1631 }
1633 1632
1634 /* Balance doesn't matter much if we're running a task per cpu */ 1633 /* Balance doesn't matter much if we're running a task per CPU: */
1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1634 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1) 1635 dst_rq->nr_running == 1)
1637 goto assign; 1636 goto assign;
@@ -1676,7 +1675,7 @@ balance:
1676 */ 1675 */
1677 if (!cur) { 1676 if (!cur) {
1678 /* 1677 /*
1679	 * select_idle_siblings() uses a per-cpu cpumask that	1678	 * select_idle_siblings() uses a per-CPU cpumask that
1680 * can be used from IRQ context. 1679 * can be used from IRQ context.
1681 */ 1680 */
1682 local_irq_disable(); 1681 local_irq_disable();
@@ -3362,7 +3361,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3362} 3361}
3363 3362
3364/* 3363/*
3365 * Called within set_task_rq() right before setting a task's cpu. The 3364 * Called within set_task_rq() right before setting a task's CPU. The
3366 * caller only guarantees p->pi_lock is held; no other assumptions, 3365 * caller only guarantees p->pi_lock is held; no other assumptions,
3367 * including the state of rq->lock, should be made. 3366 * including the state of rq->lock, should be made.
3368 */ 3367 */
@@ -3541,7 +3540,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3541 3540
3542 /* 3541 /*
3543 * runnable_sum can't be lower than running_sum 3542 * runnable_sum can't be lower than running_sum
3544	 * As running sum is scaled with cpu capacity whereas the runnable sum	3543	 * As running sum is scaled with CPU capacity whereas the runnable sum
3545 * is not we rescale running_sum 1st 3544 * is not we rescale running_sum 1st
3546 */ 3545 */
3547 running_sum = se->avg.util_sum / 3546 running_sum = se->avg.util_sum /
@@ -4688,7 +4687,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4688 if (!se) 4687 if (!se)
4689 add_nr_running(rq, task_delta); 4688 add_nr_running(rq, task_delta);
4690 4689
4691 /* determine whether we need to wake up potentially idle cpu */ 4690 /* Determine whether we need to wake up potentially idle CPU: */
4692 if (rq->curr == rq->idle && rq->cfs.nr_running) 4691 if (rq->curr == rq->idle && rq->cfs.nr_running)
4693 resched_curr(rq); 4692 resched_curr(rq);
4694} 4693}
@@ -5053,7 +5052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5053} 5052}
5054 5053
5055/* 5054/*
5056 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5055 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5057 * 5056 *
5058 * The race is harmless, since modifying bandwidth settings of unhooked group 5057 * The race is harmless, since modifying bandwidth settings of unhooked group
5059 * bits doesn't do much. 5058 * bits doesn't do much.
@@ -5098,7 +5097,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5098 */ 5097 */
5099 cfs_rq->runtime_remaining = 1; 5098 cfs_rq->runtime_remaining = 1;
5100 /* 5099 /*
5101 * Offline rq is schedulable till cpu is completely disabled 5100 * Offline rq is schedulable till CPU is completely disabled
5102 * in take_cpu_down(), so we prevent new cfs throttling here. 5101 * in take_cpu_down(), so we prevent new cfs throttling here.
5103 */ 5102 */
5104 cfs_rq->runtime_enabled = 0; 5103 cfs_rq->runtime_enabled = 0;
@@ -5335,8 +5334,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5335 * 5334 *
5336 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5335 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5337 * 5336 *
5338 * If a cpu misses updates for n ticks (as it was idle) and update gets 5337 * If a CPU misses updates for n ticks (as it was idle) and update gets
5339 * called on the n+1-th tick when cpu may be busy, then we have: 5338 * called on the n+1-th tick when CPU may be busy, then we have:
5340 * 5339 *
5341 * load_n = (1 - 1/2^i)^n * load_0 5340 * load_n = (1 - 1/2^i)^n * load_0
5342 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5341 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
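
That decay recursion is easy to check numerically. A small standalone sketch (illustrative only; i = 1 is just an example decay index, build with -lm) iterates load' = (1 - 1/2^i) * load + (1/2^i) * cur_load across an idle gap and compares it with the closed form load_n = (1 - 1/2^i)^n * load_0 quoted above:

	#include <stdio.h>
	#include <math.h>

	int main(void)
	{
		const int i = 1;			/* new samples get weight 1/2^i */
		const double d = 1.0 - 1.0 / (1 << i);	/* decay factor (1 - 1/2^i)     */
		double load = 1024.0;			/* load_0                       */
		int n;

		/* The CPU idles for 8 ticks, so cur_load == 0 and load only decays: */
		for (n = 1; n <= 8; n++) {
			load = d * load + (1.0 - d) * 0.0;
			printf("tick %d: iterated = %8.3f, closed form = %8.3f\n",
			       n, load, 1024.0 * pow(d, n));
		}
		return 0;
	}

Both columns agree, which is why the missed-update case can be folded in with one exponentiation instead of replaying n individual ticks.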
@@ -5480,7 +5479,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
5480#ifdef CONFIG_NO_HZ_COMMON 5479#ifdef CONFIG_NO_HZ_COMMON
5481/* 5480/*
5482 * There is no sane way to deal with nohz on smp when using jiffies because the 5481 * There is no sane way to deal with nohz on smp when using jiffies because the
5483 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5482 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5484 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5483 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5485 * 5484 *
5486 * Therefore we need to avoid the delta approach from the regular tick when 5485 * Therefore we need to avoid the delta approach from the regular tick when
@@ -5591,7 +5590,7 @@ void cpu_load_update_active(struct rq *this_rq)
5591} 5590}
5592 5591
5593/* 5592/*
5594 * Return a low guess at the load of a migration-source cpu weighted 5593 * Return a low guess at the load of a migration-source CPU weighted
5595 * according to the scheduling class and "nice" value. 5594 * according to the scheduling class and "nice" value.
5596 * 5595 *
5597 * We want to under-estimate the load of migration sources, to 5596 * We want to under-estimate the load of migration sources, to
@@ -5609,7 +5608,7 @@ static unsigned long source_load(int cpu, int type)
5609} 5608}
5610 5609
5611/* 5610/*
5612 * Return a high guess at the load of a migration-target cpu weighted 5611 * Return a high guess at the load of a migration-target CPU weighted
5613 * according to the scheduling class and "nice" value. 5612 * according to the scheduling class and "nice" value.
5614 */ 5613 */
5615static unsigned long target_load(int cpu, int type) 5614static unsigned long target_load(int cpu, int type)
@@ -5889,7 +5888,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5889 max_spare_cap = 0; 5888 max_spare_cap = 0;
5890 5889
5891 for_each_cpu(i, sched_group_span(group)) { 5890 for_each_cpu(i, sched_group_span(group)) {
5892 /* Bias balancing toward cpus of our domain */ 5891 /* Bias balancing toward CPUs of our domain */
5893 if (local_group) 5892 if (local_group)
5894 load = source_load(i, load_idx); 5893 load = source_load(i, load_idx);
5895 else 5894 else
@@ -5919,7 +5918,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5919 if (min_runnable_load > (runnable_load + imbalance)) { 5918 if (min_runnable_load > (runnable_load + imbalance)) {
5920 /* 5919 /*
5921 * The runnable load is significantly smaller 5920 * The runnable load is significantly smaller
5922 * so we can pick this new cpu 5921 * so we can pick this new CPU:
5923 */ 5922 */
5924 min_runnable_load = runnable_load; 5923 min_runnable_load = runnable_load;
5925 min_avg_load = avg_load; 5924 min_avg_load = avg_load;
@@ -5928,7 +5927,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5928 (100*min_avg_load > imbalance_scale*avg_load)) { 5927 (100*min_avg_load > imbalance_scale*avg_load)) {
5929 /* 5928 /*
5930 * The runnable loads are close so take the 5929 * The runnable loads are close so take the
5931 * blocked load into account through avg_load. 5930 * blocked load into account through avg_load:
5932 */ 5931 */
5933 min_avg_load = avg_load; 5932 min_avg_load = avg_load;
5934 idlest = group; 5933 idlest = group;
@@ -5989,7 +5988,7 @@ skip_spare:
5989} 5988}
5990 5989
5991/* 5990/*
5992 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 5991 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5993 */ 5992 */
5994static int 5993static int
5995find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5994find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -6067,12 +6066,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6067 6066
6068 new_cpu = find_idlest_group_cpu(group, p, cpu); 6067 new_cpu = find_idlest_group_cpu(group, p, cpu);
6069 if (new_cpu == cpu) { 6068 if (new_cpu == cpu) {
6070 /* Now try balancing at a lower domain level of cpu */ 6069 /* Now try balancing at a lower domain level of 'cpu': */
6071 sd = sd->child; 6070 sd = sd->child;
6072 continue; 6071 continue;
6073 } 6072 }
6074 6073
6075 /* Now try balancing at a lower domain level of new_cpu */ 6074 /* Now try balancing at a lower domain level of 'new_cpu': */
6076 cpu = new_cpu; 6075 cpu = new_cpu;
6077 weight = sd->span_weight; 6076 weight = sd->span_weight;
6078 sd = NULL; 6077 sd = NULL;
@@ -6082,7 +6081,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6082 if (tmp->flags & sd_flag) 6081 if (tmp->flags & sd_flag)
6083 sd = tmp; 6082 sd = tmp;
6084 } 6083 }
6085 /* while loop will break here if sd == NULL */
6086 } 6084 }
6087 6085
6088 return new_cpu; 6086 return new_cpu;
@@ -6278,12 +6276,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6278 return target; 6276 return target;
6279 6277
6280 /* 6278 /*
6281 * If the previous cpu is cache affine and idle, don't be stupid. 6279 * If the previous CPU is cache affine and idle, don't be stupid:
6282 */ 6280 */
6283 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6281 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6284 return prev; 6282 return prev;
6285 6283
6286 /* Check a recently used CPU as a potential idle candidate */ 6284 /* Check a recently used CPU as a potential idle candidate: */
6287 recent_used_cpu = p->recent_used_cpu; 6285 recent_used_cpu = p->recent_used_cpu;
6288 if (recent_used_cpu != prev && 6286 if (recent_used_cpu != prev &&
6289 recent_used_cpu != target && 6287 recent_used_cpu != target &&
@@ -6292,7 +6290,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6292 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6290 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6293 /* 6291 /*
6294 * Replace recent_used_cpu with prev as it is a potential 6292 * Replace recent_used_cpu with prev as it is a potential
6295 * candidate for the next wake. 6293 * candidate for the next wake:
6296 */ 6294 */
6297 p->recent_used_cpu = prev; 6295 p->recent_used_cpu = prev;
6298 return recent_used_cpu; 6296 return recent_used_cpu;
@@ -6357,7 +6355,7 @@ static inline unsigned long task_util(struct task_struct *p)
6357} 6355}
6358 6356
6359/* 6357/*
6360 * cpu_util_wake: Compute cpu utilization with any contributions from 6358 * cpu_util_wake: Compute CPU utilization with any contributions from
6361 * the waking task p removed. 6359 * the waking task p removed.
6362 */ 6360 */
6363static unsigned long cpu_util_wake(int cpu, struct task_struct *p) 6361static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
@@ -6403,10 +6401,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6403 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6401 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6404 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6402 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6405 * 6403 *
6406 * Balances load by selecting the idlest cpu in the idlest group, or under 6404 * Balances load by selecting the idlest CPU in the idlest group, or under
6407 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6405 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6408 * 6406 *
6409 * Returns the target cpu number. 6407 * Returns the target CPU number.
6410 * 6408 *
6411 * preempt must be disabled. 6409 * preempt must be disabled.
6412 */ 6410 */
@@ -6431,7 +6429,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6431 break; 6429 break;
6432 6430
6433 /* 6431 /*
6434 * If both cpu and prev_cpu are part of this domain, 6432 * If both 'cpu' and 'prev_cpu' are part of this domain,
6435 * cpu is a valid SD_WAKE_AFFINE target. 6433 * cpu is a valid SD_WAKE_AFFINE target.
6436 */ 6434 */
6437 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6435 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6482,9 +6480,9 @@ pick_cpu:
6482static void detach_entity_cfs_rq(struct sched_entity *se); 6480static void detach_entity_cfs_rq(struct sched_entity *se);
6483 6481
6484/* 6482/*
6485 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6483 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6486 * cfs_rq_of(p) references at time of call are still valid and identify the 6484 * cfs_rq_of(p) references at time of call are still valid and identify the
6487 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6485 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6488 */ 6486 */
6489static void migrate_task_rq_fair(struct task_struct *p) 6487static void migrate_task_rq_fair(struct task_struct *p)
6490{ 6488{
@@ -6918,17 +6916,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6918 * BASICS 6916 * BASICS
6919 * 6917 *
6920 * The purpose of load-balancing is to achieve the same basic fairness the 6918 * The purpose of load-balancing is to achieve the same basic fairness the
6921 * per-cpu scheduler provides, namely provide a proportional amount of compute 6919 * per-CPU scheduler provides, namely provide a proportional amount of compute
6922 * time to each task. This is expressed in the following equation: 6920 * time to each task. This is expressed in the following equation:
6923 * 6921 *
6924 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 6922 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6925 * 6923 *
6926 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 6924 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
6927 * W_i,0 is defined as: 6925 * W_i,0 is defined as:
6928 * 6926 *
6929 * W_i,0 = \Sum_j w_i,j (2) 6927 * W_i,0 = \Sum_j w_i,j (2)
6930 * 6928 *
6931 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 6929 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
6932 * is derived from the nice value as per sched_prio_to_weight[]. 6930 * is derived from the nice value as per sched_prio_to_weight[].
6933 * 6931 *
6934 * The weight average is an exponential decay average of the instantaneous 6932 * The weight average is an exponential decay average of the instantaneous
@@ -6936,7 +6934,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6936 * 6934 *
6937 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 6935 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6938 * 6936 *
6939 * C_i is the compute capacity of cpu i, typically it is the 6937 * C_i is the compute capacity of CPU i, typically it is the
6940 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 6938 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6941 * can also include other factors [XXX]. 6939 * can also include other factors [XXX].
6942 * 6940 *
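
To make (1)-(3) concrete, here is a toy two-CPU calculation (standalone and illustrative; the 1024 and 423 weights are the nice 0 and nice +4 entries of sched_prio_to_weight[], everything else is made up):

	#include <stdio.h>

	int main(void)
	{
		/* (2): the instantaneous weight W_i,0 of a CPU is the sum of its */
		/* runnable tasks' weights:                                       */
		double w_cpu0 = 1024.0 + 1024.0;	/* two nice-0 tasks         */
		double w_cpu1 = 423.0;			/* one nice +4 task         */

		/* (3): fold W_i,0 into the running average with coefficient     */
		/* 1/2^n (n = 3 here, purely illustrative):                       */
		const int n = 3;
		double avg0 = 1024.0, avg1 = 1024.0;	/* previous averages W_i,n  */

		avg0 = ((double)((1 << n) - 1) / (1 << n)) * avg0 + w_cpu0 / (1 << n);
		avg1 = ((double)((1 << n) - 1) / (1 << n)) * avg1 + w_cpu1 / (1 << n);

		/* (1): with equal capacities P_i, balance wants these equal:     */
		printf("W'_0 = %.1f, W'_1 = %.1f -> move weight from CPU 0 to CPU 1\n",
		       avg0, avg1);
		return 0;
	}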
@@ -6957,11 +6955,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6957 * SCHED DOMAINS 6955 * SCHED DOMAINS
6958 * 6956 *
6959 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 6957 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6960 * for all i,j solution, we create a tree of cpus that follows the hardware 6958 * for all i,j solution, we create a tree of CPUs that follows the hardware
6961 * topology where each level pairs two lower groups (or better). This results 6959 * topology where each level pairs two lower groups (or better). This results
6962 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 6960 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
6963 * tree to only the first of the previous level and we decrease the frequency 6961 * tree to only the first of the previous level and we decrease the frequency
6964 * of load-balance at each level inv. proportional to the number of cpus in 6962 * of load-balance at each level inv. proportional to the number of CPUs in
6965 * the groups. 6963 * the groups.
6966 * 6964 *
6967 * This yields: 6965 * This yields:
@@ -6970,7 +6968,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6970 * \Sum { --- * --- * 2^i } = O(n) (5) 6968 * \Sum { --- * --- * 2^i } = O(n) (5)
6971 * i = 0 2^i 2^i 6969 * i = 0 2^i 2^i
6972 * `- size of each group 6970 * `- size of each group
6973 * | | `- number of cpus doing load-balance 6971 * | | `- number of CPUs doing load-balance
6974 * | `- freq 6972 * | `- freq
6975 * `- sum over all levels 6973 * `- sum over all levels
6976 * 6974 *
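
Plugging numbers into (5) shows why this stays O(n): each term is (n/2^i) * (1/2^i) * 2^i = n/2^i, so for n = 8 CPUs the sum over i = 0..3 is 8 + 4 + 2 + 1 = 15, comfortably below 2n no matter how many levels the domain tree has.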
@@ -6978,7 +6976,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6978 * this makes (5) the runtime complexity of the balancer. 6976 * this makes (5) the runtime complexity of the balancer.
6979 * 6977 *
6980 * An important property here is that each CPU is still (indirectly) connected 6978 * An important property here is that each CPU is still (indirectly) connected
6981 * to every other cpu in at most O(log n) steps: 6979 * to every other CPU in at most O(log n) steps:
6982 * 6980 *
6983 * The adjacency matrix of the resulting graph is given by: 6981 * The adjacency matrix of the resulting graph is given by:
6984 * 6982 *
@@ -6990,7 +6988,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6990 * 6988 *
6991 * A^(log_2 n)_i,j != 0 for all i,j (7) 6989 * A^(log_2 n)_i,j != 0 for all i,j (7)
6992 * 6990 *
6993 * Showing there's indeed a path between every cpu in at most O(log n) steps. 6991 * Showing there's indeed a path between every CPU in at most O(log n) steps.
6994 * The task movement gives a factor of O(m), giving a convergence complexity 6992 * The task movement gives a factor of O(m), giving a convergence complexity
6995 * of: 6993 * of:
6996 * 6994 *
@@ -7000,7 +6998,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
7000 * WORK CONSERVING 6998 * WORK CONSERVING
7001 * 6999 *
7002 * In order to avoid CPUs going idle while there's still work to do, new idle 7000 * In order to avoid CPUs going idle while there's still work to do, new idle
7003 * balancing is more aggressive and has the newly idle cpu iterate up the domain 7001 * balancing is more aggressive and has the newly idle CPU iterate up the domain
7004 * tree itself instead of relying on other CPUs to bring it work. 7002 * tree itself instead of relying on other CPUs to bring it work.
7005 * 7003 *
7006 * This adds some complexity to both (5) and (8) but it reduces the total idle 7004 * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -7021,7 +7019,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
7021 * 7019 *
7022 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 7020 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7023 * 7021 *
7024 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 7022 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
7025 * 7023 *
7026 * The big problem is S_k, its a global sum needed to compute a local (W_i) 7024 * The big problem is S_k, its a global sum needed to compute a local (W_i)
7027 * property. 7025 * property.
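
With the notation above, the natural way to place a group entity on each CPU is to give it tg_weight * s_k,i / S_k of the group's shares; the toy calculation below (illustrative userspace sketch with made-up numbers) shows the resulting 3:1 split, and S_k being a machine-wide sum is exactly the expensive part the comment flags:

	#include <stdio.h>

	int main(void)
	{
		/* Cgroup k has 1024 shares (tg_weight); its runnable tasks put   */
		/* these per-CPU weight sums s_k,i on two CPUs:                   */
		const double tg_weight = 1024.0;
		const double s_k[2] = { 3072.0, 1024.0 };	/* 3 vs 1 nice-0 tasks */
		double S_k = 0.0;
		int i;

		for (i = 0; i < 2; i++)
			S_k += s_k[i];				/* the global sum      */

		/* Ideal per-CPU weight of the group's sched entity: */
		for (i = 0; i < 2; i++)
			printf("CPU%d group entity weight = %.0f\n",
			       i, tg_weight * s_k[i] / S_k);	/* 768 and 256         */

		return 0;
	}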
@@ -7185,7 +7183,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7185 env->flags |= LBF_SOME_PINNED; 7183 env->flags |= LBF_SOME_PINNED;
7186 7184
7187 /* 7185 /*
7188 * Remember if this task can be migrated to any other cpu in 7186 * Remember if this task can be migrated to any other CPU in
7189 * our sched_group. We may want to revisit it if we couldn't 7187 * our sched_group. We may want to revisit it if we couldn't
7190 * meet load balance goals by pulling other tasks on src_cpu. 7188 * meet load balance goals by pulling other tasks on src_cpu.
7191 * 7189 *
@@ -7195,7 +7193,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7195 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7193 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7196 return 0; 7194 return 0;
7197 7195
7198 /* Prevent to re-select dst_cpu via env's cpus */ 7196 /* Prevent to re-select dst_cpu via env's CPUs: */
7199 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7197 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7200 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7198 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7201 env->flags |= LBF_DST_PINNED; 7199 env->flags |= LBF_DST_PINNED;
@@ -7769,8 +7767,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7769 * Group imbalance indicates (and tries to solve) the problem where balancing 7767 * Group imbalance indicates (and tries to solve) the problem where balancing
7770 * groups is inadequate due to ->cpus_allowed constraints. 7768 * groups is inadequate due to ->cpus_allowed constraints.
7771 * 7769 *
7772 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 7770 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7773 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 7771 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7774 * Something like: 7772 * Something like:
7775 * 7773 *
7776 * { 0 1 2 3 } { 4 5 6 7 } 7774 * { 0 1 2 3 } { 4 5 6 7 }
@@ -7778,7 +7776,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7778 * 7776 *
7779 * If we were to balance group-wise we'd place two tasks in the first group and 7777 * If we were to balance group-wise we'd place two tasks in the first group and
7780 * two tasks in the second group. Clearly this is undesired as it will overload 7778 * two tasks in the second group. Clearly this is undesired as it will overload
7781 * cpu 3 and leave one of the cpus in the second group unused. 7779 * cpu 3 and leave one of the CPUs in the second group unused.
7782 * 7780 *
7783 * The current solution to this issue is detecting the skew in the first group 7781 * The current solution to this issue is detecting the skew in the first group
7784 * by noticing the lower domain failed to reach balance and had difficulty 7782 * by noticing the lower domain failed to reach balance and had difficulty
@@ -7891,7 +7889,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7891 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 7889 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7892 struct rq *rq = cpu_rq(i); 7890 struct rq *rq = cpu_rq(i);
7893 7891
7894 /* Bias balancing toward cpus of our domain */ 7892 /* Bias balancing toward CPUs of our domain: */
7895 if (local_group) 7893 if (local_group)
7896 load = target_load(i, load_idx); 7894 load = target_load(i, load_idx);
7897 else 7895 else
@@ -7977,7 +7975,7 @@ asym_packing:
7977 if (!(env->sd->flags & SD_ASYM_PACKING)) 7975 if (!(env->sd->flags & SD_ASYM_PACKING))
7978 return true; 7976 return true;
7979 7977
7980 /* No ASYM_PACKING if target cpu is already busy */ 7978 /* No ASYM_PACKING if target CPU is already busy */
7981 if (env->idle == CPU_NOT_IDLE) 7979 if (env->idle == CPU_NOT_IDLE)
7982 return true; 7980 return true;
7983 /* 7981 /*
@@ -7990,7 +7988,7 @@ asym_packing:
7990 if (!sds->busiest) 7988 if (!sds->busiest)
7991 return true; 7989 return true;
7992 7990
7993 /* Prefer to move from lowest priority cpu's work */ 7991 /* Prefer to move from lowest priority CPU's work */
7994 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 7992 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7995 sg->asym_prefer_cpu)) 7993 sg->asym_prefer_cpu))
7996 return true; 7994 return true;
@@ -8243,7 +8241,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8243 if (busiest->group_type == group_imbalanced) { 8241 if (busiest->group_type == group_imbalanced) {
8244 /* 8242 /*
8245 * In the group_imb case we cannot rely on group-wide averages 8243 * In the group_imb case we cannot rely on group-wide averages
8246 * to ensure cpu-load equilibrium, look at wider averages. XXX 8244 * to ensure CPU-load equilibrium, look at wider averages. XXX
8247 */ 8245 */
8248 busiest->load_per_task = 8246 busiest->load_per_task =
8249 min(busiest->load_per_task, sds->avg_load); 8247 min(busiest->load_per_task, sds->avg_load);
@@ -8262,7 +8260,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8262 } 8260 }
8263 8261
8264 /* 8262 /*
8265 * If there aren't any idle cpus, avoid creating some. 8263 * If there aren't any idle CPUs, avoid creating some.
8266 */ 8264 */
8267 if (busiest->group_type == group_overloaded && 8265 if (busiest->group_type == group_overloaded &&
8268 local->group_type == group_overloaded) { 8266 local->group_type == group_overloaded) {
@@ -8276,9 +8274,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8276 } 8274 }
8277 8275
8278 /* 8276 /*
8279 * We're trying to get all the cpus to the average_load, so we don't 8277 * We're trying to get all the CPUs to the average_load, so we don't
8280 * want to push ourselves above the average load, nor do we wish to 8278 * want to push ourselves above the average load, nor do we wish to
8281 * reduce the max loaded cpu below the average load. At the same time, 8279 * reduce the max loaded CPU below the average load. At the same time,
8282 * we also don't want to reduce the group load below the group 8280 * we also don't want to reduce the group load below the group
8283 * capacity. Thus we look for the minimum possible imbalance. 8281 * capacity. Thus we look for the minimum possible imbalance.
8284 */ 8282 */
@@ -8372,9 +8370,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8372 8370
8373 if (env->idle == CPU_IDLE) { 8371 if (env->idle == CPU_IDLE) {
8374 /* 8372 /*
8375 * This cpu is idle. If the busiest group is not overloaded 8373 * This CPU is idle. If the busiest group is not overloaded
8376 * and there is no imbalance between this and busiest group 8374 * and there is no imbalance between this and busiest group
8377 * wrt idle cpus, it is balanced. The imbalance becomes 8375 * wrt idle CPUs, it is balanced. The imbalance becomes
8378 * significant if the diff is greater than 1 otherwise we 8376 * significant if the diff is greater than 1 otherwise we
8379 * might end up to just move the imbalance on another group 8377 * might end up to just move the imbalance on another group
8380 */ 8378 */
@@ -8402,7 +8400,7 @@ out_balanced:
8402} 8400}
8403 8401
8404/* 8402/*
8405 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8403 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8406 */ 8404 */
8407static struct rq *find_busiest_queue(struct lb_env *env, 8405static struct rq *find_busiest_queue(struct lb_env *env,
8408 struct sched_group *group) 8406 struct sched_group *group)
@@ -8446,7 +8444,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8446 8444
8447 /* 8445 /*
8448 * When comparing with imbalance, use weighted_cpuload() 8446 * When comparing with imbalance, use weighted_cpuload()
8449 * which is not scaled with the cpu capacity. 8447 * which is not scaled with the CPU capacity.
8450 */ 8448 */
8451 8449
8452 if (rq->nr_running == 1 && wl > env->imbalance && 8450 if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8454,9 +8452,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8454 continue; 8452 continue;
8455 8453
8456 /* 8454 /*
8457 * For the load comparisons with the other cpu's, consider 8455 * For the load comparisons with the other CPU's, consider
8458 * the weighted_cpuload() scaled with the cpu capacity, so 8456 * the weighted_cpuload() scaled with the CPU capacity, so
8459 * that the load can be moved away from the cpu that is 8457 * that the load can be moved away from the CPU that is
8460 * potentially running at a lower capacity. 8458 * potentially running at a lower capacity.
8461 * 8459 *
8462 * Thus we're looking for max(wl_i / capacity_i), crosswise 8460 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8527,13 +8525,13 @@ static int should_we_balance(struct lb_env *env)
8527 return 0; 8525 return 0;
8528 8526
8529 /* 8527 /*
8530 * In the newly idle case, we will allow all the cpu's 8528 * In the newly idle case, we will allow all the CPUs
8531 * to do the newly idle load balance. 8529 * to do the newly idle load balance.
8532 */ 8530 */
8533 if (env->idle == CPU_NEWLY_IDLE) 8531 if (env->idle == CPU_NEWLY_IDLE)
8534 return 1; 8532 return 1;
8535 8533
8536 /* Try to find first idle cpu */ 8534 /* Try to find first idle CPU */
8537 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8535 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8538 if (!idle_cpu(cpu)) 8536 if (!idle_cpu(cpu))
8539 continue; 8537 continue;
@@ -8546,7 +8544,7 @@ static int should_we_balance(struct lb_env *env)
8546 balance_cpu = group_balance_cpu(sg); 8544 balance_cpu = group_balance_cpu(sg);
8547 8545
8548 /* 8546 /*
8549 * First idle cpu or the first cpu(busiest) in this sched group 8547 * First idle CPU or the first CPU(busiest) in this sched group
8550 * is eligible for doing load balancing at this and above domains. 8548 * is eligible for doing load balancing at this and above domains.
8551 */ 8549 */
8552 return balance_cpu == env->dst_cpu; 8550 return balance_cpu == env->dst_cpu;
@@ -8655,7 +8653,7 @@ more_balance:
8655 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8653 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8656 * us and move them to an alternate dst_cpu in our sched_group 8654 * us and move them to an alternate dst_cpu in our sched_group
8657 * where they can run. The upper limit on how many times we 8655 * where they can run. The upper limit on how many times we
8658 * iterate on same src_cpu is dependent on number of cpus in our 8656 * iterate on same src_cpu is dependent on number of CPUs in our
8659 * sched_group. 8657 * sched_group.
8660 * 8658 *
8661 * This changes load balance semantics a bit on who can move 8659 * This changes load balance semantics a bit on who can move
@@ -8672,7 +8670,7 @@ more_balance:
8672 */ 8670 */
8673 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8671 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8674 8672
8675 /* Prevent to re-select dst_cpu via env's cpus */ 8673 /* Prevent to re-select dst_cpu via env's CPUs */
8676 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8674 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8677 8675
8678 env.dst_rq = cpu_rq(env.new_dst_cpu); 8676 env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8734,9 +8732,10 @@ more_balance:
8734 8732
8735 raw_spin_lock_irqsave(&busiest->lock, flags); 8733 raw_spin_lock_irqsave(&busiest->lock, flags);
8736 8734
8737 /* don't kick the active_load_balance_cpu_stop, 8735 /*
8738 * if the curr task on busiest cpu can't be 8736 * Don't kick the active_load_balance_cpu_stop,
8739 * moved to this_cpu 8737 * if the curr task on busiest CPU can't be
8738 * moved to this_cpu:
8740 */ 8739 */
8741 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8740 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8742 raw_spin_unlock_irqrestore(&busiest->lock, 8741 raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8962,7 +8961,7 @@ out:
8962} 8961}
8963 8962
8964/* 8963/*
8965 * active_load_balance_cpu_stop is run by cpu stopper. It pushes 8964 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
8966 * running tasks off the busiest CPU onto idle CPUs. It requires at 8965 * running tasks off the busiest CPU onto idle CPUs. It requires at
8967 * least 1 task to be running on each physical CPU where possible, and 8966 * least 1 task to be running on each physical CPU where possible, and
8968 * avoids physical / logical imbalances. 8967 * avoids physical / logical imbalances.
@@ -8986,7 +8985,7 @@ static int active_load_balance_cpu_stop(void *data)
8986 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 8985 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8987 goto out_unlock; 8986 goto out_unlock;
8988 8987
8989 /* make sure the requested cpu hasn't gone down in the meantime */ 8988 /* Make sure the requested CPU hasn't gone down in the meantime: */
8990 if (unlikely(busiest_cpu != smp_processor_id() || 8989 if (unlikely(busiest_cpu != smp_processor_id() ||
8991 !busiest_rq->active_balance)) 8990 !busiest_rq->active_balance))
8992 goto out_unlock; 8991 goto out_unlock;
@@ -8998,7 +8997,7 @@ static int active_load_balance_cpu_stop(void *data)
8998 /* 8997 /*
8999 * This condition is "impossible", if it occurs 8998 * This condition is "impossible", if it occurs
9000 * we need to fix it. Originally reported by 8999 * we need to fix it. Originally reported by
9001 * Bjorn Helgaas on a 128-cpu setup. 9000 * Bjorn Helgaas on a 128-CPU setup.
9002 */ 9001 */
9003 BUG_ON(busiest_rq == target_rq); 9002 BUG_ON(busiest_rq == target_rq);
9004 9003
@@ -9100,7 +9099,7 @@ static void nohz_balancer_kick(void)
9100 return; 9099 return;
9101 /* 9100 /*
9102 * Use smp_send_reschedule() instead of resched_cpu(). 9101 * Use smp_send_reschedule() instead of resched_cpu().
9103 * This way we generate a sched IPI on the target cpu which 9102 * This way we generate a sched IPI on the target CPU which
9104 * is idle. And the softirq performing nohz idle load balance 9103 * is idle. And the softirq performing nohz idle load balance
9105 * will be run before returning from the IPI. 9104 * will be run before returning from the IPI.
9106 */ 9105 */
@@ -9157,14 +9156,12 @@ unlock:
9157} 9156}
9158 9157
9159/* 9158/*
9160 * This routine will record that the cpu is going idle with tick stopped. 9159 * This routine will record that the CPU is going idle with tick stopped.
9161 * This info will be used in performing idle load balancing in the future. 9160 * This info will be used in performing idle load balancing in the future.
9162 */ 9161 */
9163void nohz_balance_enter_idle(int cpu) 9162void nohz_balance_enter_idle(int cpu)
9164{ 9163{
9165 /* 9164 /* If this CPU is going down, then nothing needs to be done: */
9166 * If this cpu is going down, then nothing needs to be done.
9167 */
9168 if (!cpu_active(cpu)) 9165 if (!cpu_active(cpu))
9169 return; 9166 return;
9170 9167
@@ -9175,9 +9172,7 @@ void nohz_balance_enter_idle(int cpu)
9175 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 9172 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9176 return; 9173 return;
9177 9174
9178 /* 9175 /* If we're a completely isolated CPU, we don't play: */
9179 * If we're a completely isolated CPU, we don't play.
9180 */
9181 if (on_null_domain(cpu_rq(cpu))) 9176 if (on_null_domain(cpu_rq(cpu)))
9182 return; 9177 return;
9183 9178
@@ -9286,7 +9281,7 @@ out:
9286 9281
9287 /* 9282 /*
9288 * next_balance will be updated only when there is a need. 9283 * next_balance will be updated only when there is a need.
9289 * When the cpu is attached to null domain for ex, it will not be 9284 * When the CPU is attached to null domain for ex, it will not be
9290 * updated. 9285 * updated.
9291 */ 9286 */
9292 if (likely(update_next_balance)) { 9287 if (likely(update_next_balance)) {
@@ -9310,7 +9305,7 @@ out:
9310#ifdef CONFIG_NO_HZ_COMMON 9305#ifdef CONFIG_NO_HZ_COMMON
9311/* 9306/*
9312 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9307 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9313 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9308 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
9314 */ 9309 */
9315static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 9310static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9316{ 9311{
@@ -9330,8 +9325,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9330 continue; 9325 continue;
9331 9326
9332 /* 9327 /*
9333 * If this cpu gets work to do, stop the load balancing 9328 * If this CPU gets work to do, stop the load balancing
9334 * work being done for other cpus. Next load 9329 * work being done for other CPUs. Next load
9335 * balancing owner will pick it up. 9330 * balancing owner will pick it up.
9336 */ 9331 */
9337 if (need_resched()) 9332 if (need_resched())
@@ -9373,13 +9368,13 @@ end:
9373 9368
9374/* 9369/*
9375 * Current heuristic for kicking the idle load balancer in the presence 9370 * Current heuristic for kicking the idle load balancer in the presence
9376 * of an idle cpu in the system. 9371 * of an idle CPU in the system.
9377 * - This rq has more than one task. 9372 * - This rq has more than one task.
9378 * - This rq has at least one CFS task and the capacity of the CPU is 9373 * - This rq has at least one CFS task and the capacity of the CPU is
9379 * significantly reduced because of RT tasks or IRQs. 9374 * significantly reduced because of RT tasks or IRQs.
9380 * - At parent of LLC scheduler domain level, this cpu's scheduler group has 9375 * - At parent of LLC scheduler domain level, this CPU's scheduler group has
9381 * multiple busy cpu. 9376 * multiple busy CPUs.
9382 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9377 * - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
9383 * domain span are idle. 9378 * domain span are idle.
9384 */ 9379 */
9385static inline bool nohz_kick_needed(struct rq *rq) 9380static inline bool nohz_kick_needed(struct rq *rq)
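
Condensed into a standalone sketch, the heuristic above reads roughly as below (hypothetical simplification: the snapshot struct, its field names and the 80% capacity threshold are inventions for illustration; the real nohz_kick_needed() works on struct rq and sched_domain state):

	#include <stdbool.h>
	#include <stdio.h>

	struct rq_snapshot {
		int	nr_running;		/* runnable tasks on this rq         */
		int	cfs_nr_running;		/* CFS tasks among them              */
		int	cpu_capacity;		/* capacity left after RT/IRQ load   */
		int	cpu_capacity_orig;	/* full capacity of the CPU          */
		bool	llc_parent_has_busy_sibling;
		bool	lower_numbered_asym_cpu_idle;
	};

	static bool nohz_kick_needed_sketch(const struct rq_snapshot *rq)
	{
		if (rq->nr_running > 1)
			return true;	/* more than one task: spread the work out  */

		if (rq->cfs_nr_running >= 1 &&
		    rq->cpu_capacity * 10 < rq->cpu_capacity_orig * 8)
			return true;	/* CFS work, but RT/IRQs ate the capacity   */

		if (rq->llc_parent_has_busy_sibling)
			return true;	/* multiple busy CPUs above the LLC domain  */

		if (rq->lower_numbered_asym_cpu_idle)
			return true;	/* SD_ASYM_PACKING prefers lower CPU ids    */

		return false;
	}

	int main(void)
	{
		struct rq_snapshot rq = {
			.nr_running = 3, .cfs_nr_running = 3,
			.cpu_capacity = 1024, .cpu_capacity_orig = 1024,
		};

		printf("kick the idle load balancer: %s\n",
		       nohz_kick_needed_sketch(&rq) ? "yes" : "no");
		return 0;
	}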
@@ -9469,10 +9464,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9469 CPU_IDLE : CPU_NOT_IDLE; 9464 CPU_IDLE : CPU_NOT_IDLE;
9470 9465
9471 /* 9466 /*
9472 * If this cpu has a pending nohz_balance_kick, then do the 9467 * If this CPU has a pending nohz_balance_kick, then do the
9473 * balancing on behalf of the other idle cpus whose ticks are 9468 * balancing on behalf of the other idle CPUs whose ticks are
9474 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9469 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9475 * give the idle cpus a chance to load balance. Else we may 9470 * give the idle CPUs a chance to load balance. Else we may
9476 * load balance only within the local sched_domain hierarchy 9471 * load balance only within the local sched_domain hierarchy
9477 * and abort nohz_idle_balance altogether if we pull some load. 9472 * and abort nohz_idle_balance altogether if we pull some load.
9478 */ 9473 */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..343d25f85477 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Generic entry point for the idle threads 2 * Generic entry points for the idle threads
3 */ 3 */
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/sched/idle.h> 5#include <linux/sched/idle.h>
@@ -332,8 +332,8 @@ void cpu_startup_entry(enum cpuhp_state state)
332{ 332{
333 /* 333 /*
334 * This #ifdef needs to die, but it's too late in the cycle to 334 * This #ifdef needs to die, but it's too late in the cycle to
335 * make this generic (arm and sh have never invoked the canary 335 * make this generic (ARM and SH have never invoked the canary
336 * init for the non boot cpus!). Will be fixed in 3.11 336 * init for the non boot CPUs!). Will be fixed in 3.11
337 */ 337 */
338#ifdef CONFIG_X86 338#ifdef CONFIG_X86
339 /* 339 /*
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 48b8a83f5185..ec73680922f8 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -14,7 +14,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
14{ 14{
15 return task_cpu(p); /* IDLE tasks as never migrated */ 15 return task_cpu(p); /* IDLE tasks as never migrated */
16} 16}
17#endif /* CONFIG_SMP */ 17#endif
18 18
19/* 19/*
20 * Idle tasks are unconditionally rescheduled: 20 * Idle tasks are unconditionally rescheduled:
@@ -30,6 +30,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
30 put_prev_task(rq, prev); 30 put_prev_task(rq, prev);
31 update_idle_core(rq); 31 update_idle_core(rq);
32 schedstat_inc(rq->sched_goidle); 32 schedstat_inc(rq->sched_goidle);
33
33 return rq->idle; 34 return rq->idle;
34} 35}
35 36
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 39f340dde1d7..aad5f48a07c6 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -6,13 +6,13 @@
6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker 6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
7 * 7 *
8 */ 8 */
9
10#include <linux/sched/isolation.h> 9#include <linux/sched/isolation.h>
11#include <linux/tick.h> 10#include <linux/tick.h>
12#include <linux/init.h> 11#include <linux/init.h>
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/static_key.h> 13#include <linux/static_key.h>
15#include <linux/ctype.h> 14#include <linux/ctype.h>
15
16#include "sched.h" 16#include "sched.h"
17 17
18DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 18DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a398e7e28a8a 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -32,29 +32,29 @@
32 * Due to a number of reasons the above turns in the mess below: 32 * Due to a number of reasons the above turns in the mess below:
33 * 33 *
34 * - for_each_possible_cpu() is prohibitively expensive on machines with 34 * - for_each_possible_cpu() is prohibitively expensive on machines with
35 * serious number of cpus, therefore we need to take a distributed approach 35 * serious number of CPUs, therefore we need to take a distributed approach
36 * to calculating nr_active. 36 * to calculating nr_active.
37 * 37 *
38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
40 * 40 *
41 * So assuming nr_active := 0 when we start out -- true per definition, we 41 * So assuming nr_active := 0 when we start out -- true per definition, we
42 * can simply take per-cpu deltas and fold those into a global accumulate 42 * can simply take per-CPU deltas and fold those into a global accumulate
43 * to obtain the same result. See calc_load_fold_active(). 43 * to obtain the same result. See calc_load_fold_active().
44 * 44 *
45 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 45 * Furthermore, in order to avoid synchronizing all per-CPU delta folding
46 * across the machine, we assume 10 ticks is sufficient time for every 46 * across the machine, we assume 10 ticks is sufficient time for every
47 * cpu to have completed this task. 47 * CPU to have completed this task.
48 * 48 *
49 * This places an upper-bound on the IRQ-off latency of the machine. Then 49 * This places an upper-bound on the IRQ-off latency of the machine. Then
50	 * again, being late doesn't lose the delta, just wrecks the sample.	50	 * again, being late doesn't lose the delta, just wrecks the sample.
51 * 51 *
52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
53 * this would add another cross-cpu cacheline miss and atomic operation 53 * this would add another cross-CPU cacheline miss and atomic operation
54 * to the wakeup path. Instead we increment on whatever cpu the task ran 54 * to the wakeup path. Instead we increment on whatever CPU the task ran
55 * when it went into uninterruptible state and decrement on whatever cpu 55 * when it went into uninterruptible state and decrement on whatever CPU
56 * did the wakeup. This means that only the sum of nr_uninterruptible over 56 * did the wakeup. This means that only the sum of nr_uninterruptible over
57 * all cpus yields the correct result. 57 * all CPUs yields the correct result.
58 * 58 *
59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
60 */ 60 */
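
The "per-CPU deltas folded into a global accumulate" scheme is simple enough to sketch in userspace (illustrative only; the names mirror the idea rather than the exact kernel helpers, and the kernel uses an atomic for the global counter): every CPU remembers the nr_active it last reported and publishes only the difference, so readers never need for_each_possible_cpu().

	#include <stdio.h>

	#define NR_CPUS	4

	static long calc_load_tasks;		/* the global accumulate             */
	static long last_reported[NR_CPUS];	/* x_i(t_j-1), per CPU               */

	/* Called from each CPU's tick: fold only the delta into the global sum. */
	static void fold_active(int cpu, long nr_active_now)
	{
		long delta = nr_active_now - last_reported[cpu];

		last_reported[cpu] = nr_active_now;
		if (delta)
			calc_load_tasks += delta;	/* atomic_long_add() in the kernel */
	}

	int main(void)
	{
		fold_active(0, 3);		/* CPU0 reports three active tasks   */
		fold_active(1, 1);		/* CPU1 reports one                  */
		fold_active(0, 2);		/* next tick: one CPU0 task went idle */

		printf("global nr_active = %ld\n", calc_load_tasks);	/* 3         */
		return 0;
	}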
@@ -115,11 +115,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
115 * Handle NO_HZ for the global load-average. 115 * Handle NO_HZ for the global load-average.
116 * 116 *
117 * Since the above described distributed algorithm to compute the global 117 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by 118 * load-average relies on per-CPU sampling from the tick, it is affected by
119 * NO_HZ. 119 * NO_HZ.
120 * 120 *
121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon 121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 122 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
123 * when we read the global state. 123 * when we read the global state.
124 * 124 *
125 * Obviously reality has to ruin such a delightfully simple scheme: 125 * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +146,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
146 * busy state. 146 * busy state.
147 * 147 *
148 * This is solved by pushing the window forward, and thus skipping the 148 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which 149 * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
150 * was in effect at the time the window opened). This also solves the issue 150 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ 151 * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
152 * intervals. 152 * intervals.
153 * 153 *
154 * When making the ILB scale, we should try to pull this in as well. 154 * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +299,7 @@ calc_load_n(unsigned long load, unsigned long exp,
299} 299}
300 300
301/* 301/*
302 * NO_HZ can leave us missing all per-cpu ticks calling 302 * NO_HZ can leave us missing all per-CPU ticks calling
303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold 304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +363,7 @@ void calc_global_load(unsigned long ticks)
363 return; 363 return;
364 364
365 /* 365 /*
366 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. 366 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
367 */ 367 */
368 delta = calc_load_nohz_fold(); 368 delta = calc_load_nohz_fold();
369 if (delta) 369 if (delta)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..2c6ae2413fa2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -27,18 +27,18 @@
27 * except MEMBARRIER_CMD_QUERY. 27 * except MEMBARRIER_CMD_QUERY.
28 */ 28 */
29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE 29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ 30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ 31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) 32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
33#else 33#else
34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
35#endif 35#endif
36 36
37#define MEMBARRIER_CMD_BITMASK \ 37#define MEMBARRIER_CMD_BITMASK \
38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ 38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ 39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ 41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) 42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
43 43
44static void ipi_mb(void *info) 44static void ipi_mb(void *info)
@@ -85,6 +85,7 @@ static int membarrier_global_expedited(void)
85 */ 85 */
86 if (cpu == raw_smp_processor_id()) 86 if (cpu == raw_smp_processor_id())
87 continue; 87 continue;
88
88 rcu_read_lock(); 89 rcu_read_lock();
89 p = task_rcu_dereference(&cpu_rq(cpu)->curr); 90 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
90 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & 91 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +189,7 @@ static int membarrier_private_expedited(int flags)
188 * rq->curr modification in scheduler. 189 * rq->curr modification in scheduler.
189 */ 190 */
190 smp_mb(); /* exit from system call is not a mb */ 191 smp_mb(); /* exit from system call is not a mb */
192
191 return 0; 193 return 0;
192} 194}
193 195
@@ -219,6 +221,7 @@ static int membarrier_register_global_expedited(void)
219 } 221 }
220 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, 222 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
221 &mm->membarrier_state); 223 &mm->membarrier_state);
224
222 return 0; 225 return 0;
223} 226}
224 227
@@ -253,6 +256,7 @@ static int membarrier_register_private_expedited(int flags)
253 synchronize_sched(); 256 synchronize_sched();
254 } 257 }
255 atomic_or(state, &mm->membarrier_state); 258 atomic_or(state, &mm->membarrier_state);
259
256 return 0; 260 return 0;
257} 261}
258 262
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c80563b4f6b9..e40498872111 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1453,9 +1453,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1453 return; 1453 return;
1454 1454
1455 /* 1455 /*
1456 * There appears to be other cpus that can accept 1456 * There appear to be other CPUs that can accept
1457 * current and none to run 'p', so lets reschedule 1457 * the current task but none can run 'p', so lets reschedule
1458 * to try and push current away: 1458 * to try and push the current task away:
1459 */ 1459 */
1460 requeue_task_rt(rq, p, 1); 1460 requeue_task_rt(rq, p, 1);
1461 resched_curr(rq); 1461 resched_curr(rq);
@@ -1596,12 +1596,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1596 if (!task_running(rq, p) && 1596 if (!task_running(rq, p) &&
1597 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1597 cpumask_test_cpu(cpu, &p->cpus_allowed))
1598 return 1; 1598 return 1;
1599
1599 return 0; 1600 return 0;
1600} 1601}
1601 1602
1602/* 1603/*
1603 * Return the highest pushable rq's task, which is suitable to be executed 1604 * Return the highest pushable rq's task, which is suitable to be executed
1604 * on the cpu, NULL otherwise 1605 * on the CPU, NULL otherwise
1605 */ 1606 */
1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1607static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1607{ 1608{
@@ -1639,11 +1640,11 @@ static int find_lowest_rq(struct task_struct *task)
1639 return -1; /* No targets found */ 1640 return -1; /* No targets found */
1640 1641
1641 /* 1642 /*
1642 * At this point we have built a mask of cpus representing the 1643 * At this point we have built a mask of CPUs representing the
1643 * lowest priority tasks in the system. Now we want to elect 1644 * lowest priority tasks in the system. Now we want to elect
1644 * the best one based on our affinity and topology. 1645 * the best one based on our affinity and topology.
1645 * 1646 *
1646 * We prioritize the last cpu that the task executed on since 1647 * We prioritize the last CPU that the task executed on since
1647 * it is most likely cache-hot in that location. 1648 * it is most likely cache-hot in that location.
1648 */ 1649 */
1649 if (cpumask_test_cpu(cpu, lowest_mask)) 1650 if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1652,7 @@ static int find_lowest_rq(struct task_struct *task)
1651 1652
1652 /* 1653 /*
1653 * Otherwise, we consult the sched_domains span maps to figure 1654 * Otherwise, we consult the sched_domains span maps to figure
1654 * out which cpu is logically closest to our hot cache data. 1655 * out which CPU is logically closest to our hot cache data.
1655 */ 1656 */
1656 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1657 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1658 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1693,7 @@ static int find_lowest_rq(struct task_struct *task)
1692 cpu = cpumask_any(lowest_mask); 1693 cpu = cpumask_any(lowest_mask);
1693 if (cpu < nr_cpu_ids) 1694 if (cpu < nr_cpu_ids)
1694 return cpu; 1695 return cpu;
1696
1695 return -1; 1697 return -1;
1696} 1698}
1697 1699
@@ -1827,7 +1829,7 @@ retry:
1827 * The task hasn't migrated, and is still the next 1829 * The task hasn't migrated, and is still the next
1828 * eligible task, but we failed to find a run-queue 1830 * eligible task, but we failed to find a run-queue
1829 * to push it to. Do not retry in this case, since 1831 * to push it to. Do not retry in this case, since
1830 * other cpus will pull from us when ready. 1832 * other CPUs will pull from us when ready.
1831 */ 1833 */
1832 goto out; 1834 goto out;
1833 } 1835 }
@@ -1919,7 +1921,7 @@ static int rto_next_cpu(struct root_domain *rd)
1919 * rt_next_cpu() will simply return the first CPU found in 1921 * rt_next_cpu() will simply return the first CPU found in
1920 * the rto_mask. 1922 * the rto_mask.
1921 * 1923 *
1922 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it 1924 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1923 * will return the next CPU found in the rto_mask. 1925 * will return the next CPU found in the rto_mask.
1924 * 1926 *
1925 * If there are no more CPUs left in the rto_mask, then a check is made 1927 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1982,7 @@ static void tell_cpu_to_push(struct rq *rq)
1980 raw_spin_lock(&rq->rd->rto_lock); 1982 raw_spin_lock(&rq->rd->rto_lock);
1981 1983
1982 /* 1984 /*
1983 * The rto_cpu is updated under the lock, if it has a valid cpu 1985 * The rto_cpu is updated under the lock, if it has a valid CPU
1984 * then the IPI is still running and will continue due to the 1986 * then the IPI is still running and will continue due to the
1985 * update to loop_next, and nothing needs to be done here. 1987 * update to loop_next, and nothing needs to be done here.
1986 * Otherwise it is finishing up and an ipi needs to be sent. 1988 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2107,7 @@ static void pull_rt_task(struct rq *this_rq)
2105 2107
2106 /* 2108 /*
2107 * There's a chance that p is higher in priority 2109 * There's a chance that p is higher in priority
2108 * than what's currently running on its cpu. 2110 * than what's currently running on its CPU.
2109	 * This is just that p is waking up and hasn't	2111	 * This is just that p is waking up and hasn't
2110 * had a chance to schedule. We only pull 2112 * had a chance to schedule. We only pull
2111 * p if it is lower in priority than the 2113 * p if it is lower in priority than the
@@ -2693,6 +2695,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
2693 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2695 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2694 } 2696 }
2695 mutex_unlock(&mutex); 2697 mutex_unlock(&mutex);
2698
2696 return ret; 2699 return ret;
2697} 2700}
2698 2701
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc6c8b5a24ad..bd1461ae06e4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2 2/*
3 * Scheduler internal types and methods:
4 */
3#include <linux/sched.h> 5#include <linux/sched.h>
4#include <linux/sched/autogroup.h> 6#include <linux/sched/autogroup.h>
5#include <linux/sched/sysctl.h> 7#include <linux/sched/sysctl.h>
@@ -79,11 +81,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
79 * and does not change the user-interface for setting shares/weights. 81 * and does not change the user-interface for setting shares/weights.
80 * 82 *
81 * We increase resolution only if we have enough bits to allow this increased 83 * We increase resolution only if we have enough bits to allow this increased
82 * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are 84 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
83 * pretty high and the returns do not justify the increased costs. 85 * are pretty high and the returns do not justify the increased costs.
84 * 86 *
85 * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to 87 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
86 * increase coverage and consistency always enable it on 64bit platforms. 88 * increase coverage and consistency always enable it on 64-bit platforms.
87 */ 89 */
88#ifdef CONFIG_64BIT 90#ifdef CONFIG_64BIT
89# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 91# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
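To make the resolution comment above concrete: assuming SCHED_FIXEDPOINT_SHIFT is 10 and the nice-0 weight is 1024 (the usual mainline values), a 64-bit build carries load weights with ten extra fractional bits, as in this sketch of the scale_load()/scale_load_down() pair:

#include <stdint.h>
#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT  10
#define NICE_0_LOAD_SHIFT       (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define NICE_0_LOAD             (1UL << NICE_0_LOAD_SHIFT)

/* Sketch of the 64-bit case: user-visible weights gain 10 extra bits of
 * internal resolution, and are shifted back down before being reported. */
static unsigned long scale_load(unsigned long w)
{
        return w << SCHED_FIXEDPOINT_SHIFT;
}

static unsigned long scale_load_down(unsigned long w)
{
        return w >> SCHED_FIXEDPOINT_SHIFT;
}

int main(void)
{
        unsigned long user_weight = 1024;               /* nice 0 */
        unsigned long internal = scale_load(user_weight);

        printf("internal weight: %lu (NICE_0_LOAD = %lu)\n", internal, NICE_0_LOAD);
        printf("back to user units: %lu\n", scale_load_down(internal));
        return 0;
}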
@@ -111,16 +113,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
111 * 10 -> just above 1us 113 * 10 -> just above 1us
112 * 9 -> just above 0.5us 114 * 9 -> just above 0.5us
113 */ 115 */
114#define DL_SCALE (10) 116#define DL_SCALE 10
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 */
119 117
120/* 118/*
121 * single value that denotes runtime == period, ie unlimited time. 119 * Single value that denotes runtime == period, ie unlimited time.
122 */ 120 */
123#define RUNTIME_INF ((u64)~0ULL) 121#define RUNTIME_INF ((u64)~0ULL)
124 122
125static inline int idle_policy(int policy) 123static inline int idle_policy(int policy)
126{ 124{
@@ -235,9 +233,9 @@ void __dl_clear_params(struct task_struct *p);
235 * control. 233 * control.
236 */ 234 */
237struct dl_bandwidth { 235struct dl_bandwidth {
238 raw_spinlock_t dl_runtime_lock; 236 raw_spinlock_t dl_runtime_lock;
239 u64 dl_runtime; 237 u64 dl_runtime;
240 u64 dl_period; 238 u64 dl_period;
241}; 239};
242 240
243static inline int dl_bandwidth_enabled(void) 241static inline int dl_bandwidth_enabled(void)
@@ -246,8 +244,9 @@ static inline int dl_bandwidth_enabled(void)
246} 244}
247 245
248struct dl_bw { 246struct dl_bw {
249 raw_spinlock_t lock; 247 raw_spinlock_t lock;
250 u64 bw, total_bw; 248 u64 bw;
249 u64 total_bw;
251}; 250};
252 251
253static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 252static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +272,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
273 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 272 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
274} 273}
275 274
276void dl_change_utilization(struct task_struct *p, u64 new_bw); 275extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
277extern void init_dl_bw(struct dl_bw *dl_b); 276extern void init_dl_bw(struct dl_bw *dl_b);
278extern int sched_dl_global_validate(void); 277extern int sched_dl_global_validate(void);
279extern void sched_dl_do_global(void); 278extern void sched_dl_do_global(void);
280extern int sched_dl_overflow(struct task_struct *p, int policy, 279extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
281 const struct sched_attr *attr);
282extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 280extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
283extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 281extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
284extern bool __checkparam_dl(const struct sched_attr *attr); 282extern bool __checkparam_dl(const struct sched_attr *attr);
285extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 283extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
286extern int dl_task_can_attach(struct task_struct *p, 284extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
287 const struct cpumask *cs_cpus_allowed); 285extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
288extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
289 const struct cpumask *trial);
290extern bool dl_cpu_busy(unsigned int cpu); 286extern bool dl_cpu_busy(unsigned int cpu);
291 287
292#ifdef CONFIG_CGROUP_SCHED 288#ifdef CONFIG_CGROUP_SCHED
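The __dl_overflow() condition at the top of this hunk, dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw, is plain fixed-point arithmetic. A self-contained sketch of that admission test, with made-up Q20 bandwidth values and the usual 95%-per-CPU limit assumed:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT        20
#define BW_UNIT         (1 << BW_SHIFT)

/* Sketch of the deadline admission test: replacing a task of bandwidth
 * @old_bw with one of @new_bw must not push the total past bw_limit * cpus. */
static bool dl_overflow(uint64_t bw_limit, int cpus, uint64_t total_bw,
                        uint64_t old_bw, uint64_t new_bw)
{
        return bw_limit * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
        uint64_t limit = (95 * BW_UNIT) / 100;  /* assumed 95% cap per CPU */
        uint64_t total = BW_UNIT;               /* one full CPU already admitted */
        uint64_t new_task = BW_UNIT / 10;       /* new task wants 10% */

        printf("4 CPUs, add 10%%: %s\n",
               dl_overflow(limit, 4, total, 0, new_task) ? "reject" : "admit");
        printf("1 CPU,  add 10%%: %s\n",
               dl_overflow(limit, 1, total, 0, new_task) ? "reject" : "admit");
        return 0;
}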
@@ -300,32 +296,36 @@ extern struct list_head task_groups;
300 296
301struct cfs_bandwidth { 297struct cfs_bandwidth {
302#ifdef CONFIG_CFS_BANDWIDTH 298#ifdef CONFIG_CFS_BANDWIDTH
303 raw_spinlock_t lock; 299 raw_spinlock_t lock;
304 ktime_t period; 300 ktime_t period;
305 u64 quota, runtime; 301 u64 quota;
306 s64 hierarchical_quota; 302 u64 runtime;
307 u64 runtime_expires; 303 s64 hierarchical_quota;
308 304 u64 runtime_expires;
309 int idle, period_active; 305
310 struct hrtimer period_timer, slack_timer; 306 int idle;
311 struct list_head throttled_cfs_rq; 307 int period_active;
312 308 struct hrtimer period_timer;
313 /* statistics */ 309 struct hrtimer slack_timer;
314 int nr_periods, nr_throttled; 310 struct list_head throttled_cfs_rq;
315 u64 throttled_time; 311
312 /* Statistics: */
313 int nr_periods;
314 int nr_throttled;
315 u64 throttled_time;
316#endif 316#endif
317}; 317};
318 318
319/* task group related information */ 319/* Task group related information */
320struct task_group { 320struct task_group {
321 struct cgroup_subsys_state css; 321 struct cgroup_subsys_state css;
322 322
323#ifdef CONFIG_FAIR_GROUP_SCHED 323#ifdef CONFIG_FAIR_GROUP_SCHED
324 /* schedulable entities of this group on each cpu */ 324 /* schedulable entities of this group on each CPU */
325 struct sched_entity **se; 325 struct sched_entity **se;
326 /* runqueue "owned" by this group on each cpu */ 326 /* runqueue "owned" by this group on each CPU */
327 struct cfs_rq **cfs_rq; 327 struct cfs_rq **cfs_rq;
328 unsigned long shares; 328 unsigned long shares;
329 329
330#ifdef CONFIG_SMP 330#ifdef CONFIG_SMP
331 /* 331 /*
@@ -333,29 +333,29 @@ struct task_group {
333 * it in its own cacheline separated from the fields above which 333 * it in its own cacheline separated from the fields above which
334 * will also be accessed at each tick. 334 * will also be accessed at each tick.
335 */ 335 */
336 atomic_long_t load_avg ____cacheline_aligned; 336 atomic_long_t load_avg ____cacheline_aligned;
337#endif 337#endif
338#endif 338#endif
339 339
340#ifdef CONFIG_RT_GROUP_SCHED 340#ifdef CONFIG_RT_GROUP_SCHED
341 struct sched_rt_entity **rt_se; 341 struct sched_rt_entity **rt_se;
342 struct rt_rq **rt_rq; 342 struct rt_rq **rt_rq;
343 343
344 struct rt_bandwidth rt_bandwidth; 344 struct rt_bandwidth rt_bandwidth;
345#endif 345#endif
346 346
347 struct rcu_head rcu; 347 struct rcu_head rcu;
348 struct list_head list; 348 struct list_head list;
349 349
350 struct task_group *parent; 350 struct task_group *parent;
351 struct list_head siblings; 351 struct list_head siblings;
352 struct list_head children; 352 struct list_head children;
353 353
354#ifdef CONFIG_SCHED_AUTOGROUP 354#ifdef CONFIG_SCHED_AUTOGROUP
355 struct autogroup *autogroup; 355 struct autogroup *autogroup;
356#endif 356#endif
357 357
358 struct cfs_bandwidth cfs_bandwidth; 358 struct cfs_bandwidth cfs_bandwidth;
359}; 359};
360 360
361#ifdef CONFIG_FAIR_GROUP_SCHED 361#ifdef CONFIG_FAIR_GROUP_SCHED
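The cfs_bandwidth fields a little earlier in this file (period, quota, runtime, the throttle statistics) amount to a per-period token bucket: quota is granted each period, runtime is what is left of it, and a group that runs out is throttled until the next refill. A toy sketch of that accounting, not the hrtimer-driven kernel logic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cfs_bandwidth {
        uint64_t period_ns;     /* length of an enforcement period        */
        uint64_t quota_ns;      /* runtime granted at the start of each   */
        int64_t  runtime_ns;    /* runtime left in the current period     */
        int      nr_periods;
        int      nr_throttled;
};

static void refill(struct toy_cfs_bandwidth *b)
{
        b->runtime_ns = b->quota_ns;
        b->nr_periods++;
}

/* Charge @delta ns of execution; return true if the group must throttle. */
static bool account(struct toy_cfs_bandwidth *b, uint64_t delta)
{
        b->runtime_ns -= delta;
        if (b->runtime_ns > 0)
                return false;
        b->nr_throttled++;
        return true;
}

int main(void)
{
        struct toy_cfs_bandwidth b = { .period_ns = 100000000, .quota_ns = 20000000 };

        refill(&b);
        printf("throttle after 15ms?      %d\n", account(&b, 15000000));  /* 0 */
        printf("throttle after 10ms more? %d\n", account(&b, 10000000));  /* 1 */
        return 0;
}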
@@ -369,8 +369,8 @@ struct task_group {
369 * (The default weight is 1024 - so there's no practical 369 * (The default weight is 1024 - so there's no practical
370 * limitation from this.) 370 * limitation from this.)
371 */ 371 */
372#define MIN_SHARES (1UL << 1) 372#define MIN_SHARES (1UL << 1)
373#define MAX_SHARES (1UL << 18) 373#define MAX_SHARES (1UL << 18)
374#endif 374#endif
375 375
376typedef int (*tg_visitor)(struct task_group *, void *); 376typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +443,39 @@ struct cfs_bandwidth { };
443 443
444/* CFS-related fields in a runqueue */ 444/* CFS-related fields in a runqueue */
445struct cfs_rq { 445struct cfs_rq {
446 struct load_weight load; 446 struct load_weight load;
447 unsigned long runnable_weight; 447 unsigned long runnable_weight;
448 unsigned int nr_running, h_nr_running; 448 unsigned int nr_running;
449 unsigned int h_nr_running;
449 450
450 u64 exec_clock; 451 u64 exec_clock;
451 u64 min_vruntime; 452 u64 min_vruntime;
452#ifndef CONFIG_64BIT 453#ifndef CONFIG_64BIT
453 u64 min_vruntime_copy; 454 u64 min_vruntime_copy;
454#endif 455#endif
455 456
456 struct rb_root_cached tasks_timeline; 457 struct rb_root_cached tasks_timeline;
457 458
458 /* 459 /*
459 * 'curr' points to currently running entity on this cfs_rq. 460 * 'curr' points to currently running entity on this cfs_rq.
460 * It is set to NULL otherwise (i.e when none are currently running). 461 * It is set to NULL otherwise (i.e when none are currently running).
461 */ 462 */
462 struct sched_entity *curr, *next, *last, *skip; 463 struct sched_entity *curr;
464 struct sched_entity *next;
465 struct sched_entity *last;
466 struct sched_entity *skip;
463 467
464#ifdef CONFIG_SCHED_DEBUG 468#ifdef CONFIG_SCHED_DEBUG
465 unsigned int nr_spread_over; 469 unsigned int nr_spread_over;
466#endif 470#endif
467 471
468#ifdef CONFIG_SMP 472#ifdef CONFIG_SMP
469 /* 473 /*
470 * CFS load tracking 474 * CFS load tracking
471 */ 475 */
472 struct sched_avg avg; 476 struct sched_avg avg;
473#ifndef CONFIG_64BIT 477#ifndef CONFIG_64BIT
474 u64 load_last_update_time_copy; 478 u64 load_last_update_time_copy;
475#endif 479#endif
476 struct { 480 struct {
477 raw_spinlock_t lock ____cacheline_aligned; 481 raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +486,9 @@ struct cfs_rq {
482 } removed; 486 } removed;
483 487
484#ifdef CONFIG_FAIR_GROUP_SCHED 488#ifdef CONFIG_FAIR_GROUP_SCHED
485 unsigned long tg_load_avg_contrib; 489 unsigned long tg_load_avg_contrib;
486 long propagate; 490 long propagate;
487 long prop_runnable_sum; 491 long prop_runnable_sum;
488 492
489 /* 493 /*
490 * h_load = weight * f(tg) 494 * h_load = weight * f(tg)
@@ -492,36 +496,38 @@ struct cfs_rq {
492 * Where f(tg) is the recursive weight fraction assigned to 496 * Where f(tg) is the recursive weight fraction assigned to
493 * this group. 497 * this group.
494 */ 498 */
495 unsigned long h_load; 499 unsigned long h_load;
496 u64 last_h_load_update; 500 u64 last_h_load_update;
497 struct sched_entity *h_load_next; 501 struct sched_entity *h_load_next;
498#endif /* CONFIG_FAIR_GROUP_SCHED */ 502#endif /* CONFIG_FAIR_GROUP_SCHED */
499#endif /* CONFIG_SMP */ 503#endif /* CONFIG_SMP */
500 504
501#ifdef CONFIG_FAIR_GROUP_SCHED 505#ifdef CONFIG_FAIR_GROUP_SCHED
502 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 506 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
503 507
504 /* 508 /*
505 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 509 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
506 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 510 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
507 * (like users, containers etc.) 511 * (like users, containers etc.)
508 * 512 *
509 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 513 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
510 * list is used during load balance. 514 * This list is used during load balance.
511 */ 515 */
512 int on_list; 516 int on_list;
513 struct list_head leaf_cfs_rq_list; 517 struct list_head leaf_cfs_rq_list;
514 struct task_group *tg; /* group that "owns" this runqueue */ 518 struct task_group *tg; /* group that "owns" this runqueue */
515 519
516#ifdef CONFIG_CFS_BANDWIDTH 520#ifdef CONFIG_CFS_BANDWIDTH
517 int runtime_enabled; 521 int runtime_enabled;
518 u64 runtime_expires; 522 u64 runtime_expires;
519 s64 runtime_remaining; 523 s64 runtime_remaining;
520 524
521 u64 throttled_clock, throttled_clock_task; 525 u64 throttled_clock;
522 u64 throttled_clock_task_time; 526 u64 throttled_clock_task;
523 int throttled, throttle_count; 527 u64 throttled_clock_task_time;
524 struct list_head throttled_list; 528 int throttled;
529 int throttle_count;
530 struct list_head throttled_list;
525#endif /* CONFIG_CFS_BANDWIDTH */ 531#endif /* CONFIG_CFS_BANDWIDTH */
526#endif /* CONFIG_FAIR_GROUP_SCHED */ 532#endif /* CONFIG_FAIR_GROUP_SCHED */
527}; 533};
@@ -538,45 +544,45 @@ static inline int rt_bandwidth_enabled(void)
538 544
539/* Real-Time classes' related field in a runqueue: */ 545/* Real-Time classes' related field in a runqueue: */
540struct rt_rq { 546struct rt_rq {
541 struct rt_prio_array active; 547 struct rt_prio_array active;
542 unsigned int rt_nr_running; 548 unsigned int rt_nr_running;
543 unsigned int rr_nr_running; 549 unsigned int rr_nr_running;
544#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 550#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
545 struct { 551 struct {
546 int curr; /* highest queued rt task prio */ 552 int curr; /* highest queued rt task prio */
547#ifdef CONFIG_SMP 553#ifdef CONFIG_SMP
548 int next; /* next highest */ 554 int next; /* next highest */
549#endif 555#endif
550 } highest_prio; 556 } highest_prio;
551#endif 557#endif
552#ifdef CONFIG_SMP 558#ifdef CONFIG_SMP
553 unsigned long rt_nr_migratory; 559 unsigned long rt_nr_migratory;
554 unsigned long rt_nr_total; 560 unsigned long rt_nr_total;
555 int overloaded; 561 int overloaded;
556 struct plist_head pushable_tasks; 562 struct plist_head pushable_tasks;
557#endif /* CONFIG_SMP */ 563#endif /* CONFIG_SMP */
558 int rt_queued; 564 int rt_queued;
559 565
560 int rt_throttled; 566 int rt_throttled;
561 u64 rt_time; 567 u64 rt_time;
562 u64 rt_runtime; 568 u64 rt_runtime;
563 /* Nests inside the rq lock: */ 569 /* Nests inside the rq lock: */
564 raw_spinlock_t rt_runtime_lock; 570 raw_spinlock_t rt_runtime_lock;
565 571
566#ifdef CONFIG_RT_GROUP_SCHED 572#ifdef CONFIG_RT_GROUP_SCHED
567 unsigned long rt_nr_boosted; 573 unsigned long rt_nr_boosted;
568 574
569 struct rq *rq; 575 struct rq *rq;
570 struct task_group *tg; 576 struct task_group *tg;
571#endif 577#endif
572}; 578};
573 579
574/* Deadline class' related fields in a runqueue */ 580/* Deadline class' related fields in a runqueue */
575struct dl_rq { 581struct dl_rq {
576 /* runqueue is an rbtree, ordered by deadline */ 582 /* runqueue is an rbtree, ordered by deadline */
577 struct rb_root_cached root; 583 struct rb_root_cached root;
578 584
579 unsigned long dl_nr_running; 585 unsigned long dl_nr_running;
580 586
581#ifdef CONFIG_SMP 587#ifdef CONFIG_SMP
582 /* 588 /*
@@ -586,28 +592,28 @@ struct dl_rq {
586 * should migrate somewhere else. 592 * should migrate somewhere else.
587 */ 593 */
588 struct { 594 struct {
589 u64 curr; 595 u64 curr;
590 u64 next; 596 u64 next;
591 } earliest_dl; 597 } earliest_dl;
592 598
593 unsigned long dl_nr_migratory; 599 unsigned long dl_nr_migratory;
594 int overloaded; 600 int overloaded;
595 601
596 /* 602 /*
597 * Tasks on this rq that can be pushed away. They are kept in 603 * Tasks on this rq that can be pushed away. They are kept in
598 * an rb-tree, ordered by tasks' deadlines, with caching 604 * an rb-tree, ordered by tasks' deadlines, with caching
599 * of the leftmost (earliest deadline) element. 605 * of the leftmost (earliest deadline) element.
600 */ 606 */
601 struct rb_root_cached pushable_dl_tasks_root; 607 struct rb_root_cached pushable_dl_tasks_root;
602#else 608#else
603 struct dl_bw dl_bw; 609 struct dl_bw dl_bw;
604#endif 610#endif
605 /* 611 /*
606 * "Active utilization" for this runqueue: increased when a 612 * "Active utilization" for this runqueue: increased when a
607 * task wakes up (becomes TASK_RUNNING) and decreased when a 613 * task wakes up (becomes TASK_RUNNING) and decreased when a
608 * task blocks 614 * task blocks
609 */ 615 */
610 u64 running_bw; 616 u64 running_bw;
611 617
612 /* 618 /*
613 * Utilization of the tasks "assigned" to this runqueue (including 619 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +624,14 @@ struct dl_rq {
618 * This is needed to compute the "inactive utilization" for the 624 * This is needed to compute the "inactive utilization" for the
619 * runqueue (inactive utilization = this_bw - running_bw). 625 * runqueue (inactive utilization = this_bw - running_bw).
620 */ 626 */
621 u64 this_bw; 627 u64 this_bw;
622 u64 extra_bw; 628 u64 extra_bw;
623 629
624 /* 630 /*
625 * Inverse of the fraction of CPU utilization that can be reclaimed 631 * Inverse of the fraction of CPU utilization that can be reclaimed
626 * by the GRUB algorithm. 632 * by the GRUB algorithm.
627 */ 633 */
628 u64 bw_ratio; 634 u64 bw_ratio;
629}; 635};
630 636
631#ifdef CONFIG_SMP 637#ifdef CONFIG_SMP
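With the Q20 bandwidth representation used throughout this file, the "inactive utilization" described above is simply this_bw - running_bw. A small worked example with made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1 << BW_SHIFT)

int main(void)
{
        /* Three DL tasks assigned to this runqueue at roughly 10% each... */
        uint64_t this_bw     = 3 * (BW_UNIT / 10);
        /* ...but only two of them are currently TASK_RUNNING. */
        uint64_t running_bw  = 2 * (BW_UNIT / 10);
        uint64_t inactive_bw = this_bw - running_bw;

        printf("inactive utilization: %.2f%% of a CPU\n",
               100.0 * (double)inactive_bw / BW_UNIT);
        return 0;
}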
@@ -638,51 +644,51 @@ static inline bool sched_asym_prefer(int a, int b)
638/* 644/*
639 * We add the notion of a root-domain which will be used to define per-domain 645 * We add the notion of a root-domain which will be used to define per-domain
640 * variables. Each exclusive cpuset essentially defines an island domain by 646 * variables. Each exclusive cpuset essentially defines an island domain by
641 * fully partitioning the member cpus from any other cpuset. Whenever a new 647 * fully partitioning the member CPUs from any other cpuset. Whenever a new
642 * exclusive cpuset is created, we also create and attach a new root-domain 648 * exclusive cpuset is created, we also create and attach a new root-domain
643 * object. 649 * object.
644 * 650 *
645 */ 651 */
646struct root_domain { 652struct root_domain {
647 atomic_t refcount; 653 atomic_t refcount;
648 atomic_t rto_count; 654 atomic_t rto_count;
649 struct rcu_head rcu; 655 struct rcu_head rcu;
650 cpumask_var_t span; 656 cpumask_var_t span;
651 cpumask_var_t online; 657 cpumask_var_t online;
652 658
653 /* Indicate more than one runnable task for any CPU */ 659 /* Indicate more than one runnable task for any CPU */
654 bool overload; 660 bool overload;
655 661
656 /* 662 /*
657 * The bit corresponding to a CPU gets set here if such CPU has more 663 * The bit corresponding to a CPU gets set here if such CPU has more
658 * than one runnable -deadline task (as it is below for RT tasks). 664 * than one runnable -deadline task (as it is below for RT tasks).
659 */ 665 */
660 cpumask_var_t dlo_mask; 666 cpumask_var_t dlo_mask;
661 atomic_t dlo_count; 667 atomic_t dlo_count;
662 struct dl_bw dl_bw; 668 struct dl_bw dl_bw;
663 struct cpudl cpudl; 669 struct cpudl cpudl;
664 670
665#ifdef HAVE_RT_PUSH_IPI 671#ifdef HAVE_RT_PUSH_IPI
666 /* 672 /*
667 * For IPI pull requests, loop across the rto_mask. 673 * For IPI pull requests, loop across the rto_mask.
668 */ 674 */
669 struct irq_work rto_push_work; 675 struct irq_work rto_push_work;
670 raw_spinlock_t rto_lock; 676 raw_spinlock_t rto_lock;
671 /* These are only updated and read within rto_lock */ 677 /* These are only updated and read within rto_lock */
672 int rto_loop; 678 int rto_loop;
673 int rto_cpu; 679 int rto_cpu;
674 /* These atomics are updated outside of a lock */ 680 /* These atomics are updated outside of a lock */
675 atomic_t rto_loop_next; 681 atomic_t rto_loop_next;
676 atomic_t rto_loop_start; 682 atomic_t rto_loop_start;
677#endif 683#endif
678 /* 684 /*
679 * The "RT overload" flag: it gets set if a CPU has more than 685 * The "RT overload" flag: it gets set if a CPU has more than
680 * one runnable RT task. 686 * one runnable RT task.
681 */ 687 */
682 cpumask_var_t rto_mask; 688 cpumask_var_t rto_mask;
683 struct cpupri cpupri; 689 struct cpupri cpupri;
684 690
685 unsigned long max_cpu_capacity; 691 unsigned long max_cpu_capacity;
686}; 692};
687 693
688extern struct root_domain def_root_domain; 694extern struct root_domain def_root_domain;
@@ -708,39 +714,39 @@ extern void rto_push_irq_work_func(struct irq_work *work);
708 */ 714 */
709struct rq { 715struct rq {
710 /* runqueue lock: */ 716 /* runqueue lock: */
711 raw_spinlock_t lock; 717 raw_spinlock_t lock;
712 718
713 /* 719 /*
714 * nr_running and cpu_load should be in the same cacheline because 720 * nr_running and cpu_load should be in the same cacheline because
715 * remote CPUs use both these fields when doing load calculation. 721 * remote CPUs use both these fields when doing load calculation.
716 */ 722 */
717 unsigned int nr_running; 723 unsigned int nr_running;
718#ifdef CONFIG_NUMA_BALANCING 724#ifdef CONFIG_NUMA_BALANCING
719 unsigned int nr_numa_running; 725 unsigned int nr_numa_running;
720 unsigned int nr_preferred_running; 726 unsigned int nr_preferred_running;
721#endif 727#endif
722 #define CPU_LOAD_IDX_MAX 5 728 #define CPU_LOAD_IDX_MAX 5
723 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 729 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
724#ifdef CONFIG_NO_HZ_COMMON 730#ifdef CONFIG_NO_HZ_COMMON
725#ifdef CONFIG_SMP 731#ifdef CONFIG_SMP
726 unsigned long last_load_update_tick; 732 unsigned long last_load_update_tick;
727#endif /* CONFIG_SMP */ 733#endif /* CONFIG_SMP */
728 unsigned long nohz_flags; 734 unsigned long nohz_flags;
729#endif /* CONFIG_NO_HZ_COMMON */ 735#endif /* CONFIG_NO_HZ_COMMON */
730 736
731 /* capture load from *all* tasks on this cpu: */ 737 /* capture load from *all* tasks on this CPU: */
732 struct load_weight load; 738 struct load_weight load;
733 unsigned long nr_load_updates; 739 unsigned long nr_load_updates;
734 u64 nr_switches; 740 u64 nr_switches;
735 741
736 struct cfs_rq cfs; 742 struct cfs_rq cfs;
737 struct rt_rq rt; 743 struct rt_rq rt;
738 struct dl_rq dl; 744 struct dl_rq dl;
739 745
740#ifdef CONFIG_FAIR_GROUP_SCHED 746#ifdef CONFIG_FAIR_GROUP_SCHED
741 /* list of leaf cfs_rq on this cpu: */ 747 /* list of leaf cfs_rq on this CPU: */
742 struct list_head leaf_cfs_rq_list; 748 struct list_head leaf_cfs_rq_list;
743 struct list_head *tmp_alone_branch; 749 struct list_head *tmp_alone_branch;
744#endif /* CONFIG_FAIR_GROUP_SCHED */ 750#endif /* CONFIG_FAIR_GROUP_SCHED */
745 751
746 /* 752 /*
@@ -749,94 +755,98 @@ struct rq {
749 * one CPU and if it got migrated afterwards it may decrease 755 * one CPU and if it got migrated afterwards it may decrease
750 * it on another CPU. Always updated under the runqueue lock: 756 * it on another CPU. Always updated under the runqueue lock:
751 */ 757 */
752 unsigned long nr_uninterruptible; 758 unsigned long nr_uninterruptible;
753 759
754 struct task_struct *curr, *idle, *stop; 760 struct task_struct *curr;
755 unsigned long next_balance; 761 struct task_struct *idle;
756 struct mm_struct *prev_mm; 762 struct task_struct *stop;
763 unsigned long next_balance;
764 struct mm_struct *prev_mm;
757 765
758 unsigned int clock_update_flags; 766 unsigned int clock_update_flags;
759 u64 clock; 767 u64 clock;
760 u64 clock_task; 768 u64 clock_task;
761 769
762 atomic_t nr_iowait; 770 atomic_t nr_iowait;
763 771
764#ifdef CONFIG_SMP 772#ifdef CONFIG_SMP
765 struct root_domain *rd; 773 struct root_domain *rd;
766 struct sched_domain *sd; 774 struct sched_domain *sd;
775
776 unsigned long cpu_capacity;
777 unsigned long cpu_capacity_orig;
767 778
768 unsigned long cpu_capacity; 779 struct callback_head *balance_callback;
769 unsigned long cpu_capacity_orig;
770 780
771 struct callback_head *balance_callback; 781 unsigned char idle_balance;
772 782
773 unsigned char idle_balance;
774 /* For active balancing */ 783 /* For active balancing */
775 int active_balance; 784 int active_balance;
776 int push_cpu; 785 int push_cpu;
777 struct cpu_stop_work active_balance_work; 786 struct cpu_stop_work active_balance_work;
778 /* cpu of this runqueue: */ 787
779 int cpu; 788 /* CPU of this runqueue: */
780 int online; 789 int cpu;
790 int online;
781 791
782 struct list_head cfs_tasks; 792 struct list_head cfs_tasks;
783 793
784 u64 rt_avg; 794 u64 rt_avg;
785 u64 age_stamp; 795 u64 age_stamp;
786 u64 idle_stamp; 796 u64 idle_stamp;
787 u64 avg_idle; 797 u64 avg_idle;
788 798
789 /* This is used to determine avg_idle's max value */ 799 /* This is used to determine avg_idle's max value */
790 u64 max_idle_balance_cost; 800 u64 max_idle_balance_cost;
791#endif 801#endif
792 802
793#ifdef CONFIG_IRQ_TIME_ACCOUNTING 803#ifdef CONFIG_IRQ_TIME_ACCOUNTING
794 u64 prev_irq_time; 804 u64 prev_irq_time;
795#endif 805#endif
796#ifdef CONFIG_PARAVIRT 806#ifdef CONFIG_PARAVIRT
797 u64 prev_steal_time; 807 u64 prev_steal_time;
798#endif 808#endif
799#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 809#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
800 u64 prev_steal_time_rq; 810 u64 prev_steal_time_rq;
801#endif 811#endif
802 812
803 /* calc_load related fields */ 813 /* calc_load related fields */
804 unsigned long calc_load_update; 814 unsigned long calc_load_update;
805 long calc_load_active; 815 long calc_load_active;
806 816
807#ifdef CONFIG_SCHED_HRTICK 817#ifdef CONFIG_SCHED_HRTICK
808#ifdef CONFIG_SMP 818#ifdef CONFIG_SMP
809 int hrtick_csd_pending; 819 int hrtick_csd_pending;
810 call_single_data_t hrtick_csd; 820 call_single_data_t hrtick_csd;
811#endif 821#endif
812 struct hrtimer hrtick_timer; 822 struct hrtimer hrtick_timer;
813#endif 823#endif
814 824
815#ifdef CONFIG_SCHEDSTATS 825#ifdef CONFIG_SCHEDSTATS
816 /* latency stats */ 826 /* latency stats */
817 struct sched_info rq_sched_info; 827 struct sched_info rq_sched_info;
818 unsigned long long rq_cpu_time; 828 unsigned long long rq_cpu_time;
819 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 829 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
820 830
821 /* sys_sched_yield() stats */ 831 /* sys_sched_yield() stats */
822 unsigned int yld_count; 832 unsigned int yld_count;
823 833
824 /* schedule() stats */ 834 /* schedule() stats */
825 unsigned int sched_count; 835 unsigned int sched_count;
826 unsigned int sched_goidle; 836 unsigned int sched_goidle;
827 837
828 /* try_to_wake_up() stats */ 838 /* try_to_wake_up() stats */
829 unsigned int ttwu_count; 839 unsigned int ttwu_count;
830 unsigned int ttwu_local; 840 unsigned int ttwu_local;
831#endif 841#endif
832 842
833#ifdef CONFIG_SMP 843#ifdef CONFIG_SMP
834 struct llist_head wake_list; 844 struct llist_head wake_list;
835#endif 845#endif
836 846
837#ifdef CONFIG_CPU_IDLE 847#ifdef CONFIG_CPU_IDLE
838 /* Must be inspected within a rcu lock section */ 848 /* Must be inspected within a rcu lock section */
839 struct cpuidle_state *idle_state; 849 struct cpuidle_state *idle_state;
840#endif 850#endif
841}; 851};
842 852
@@ -902,9 +912,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
902 * one position though, because the next rq_unpin_lock() will shift it 912 * one position though, because the next rq_unpin_lock() will shift it
903 * back. 913 * back.
904 */ 914 */
905#define RQCF_REQ_SKIP 0x01 915#define RQCF_REQ_SKIP 0x01
906#define RQCF_ACT_SKIP 0x02 916#define RQCF_ACT_SKIP 0x02
907#define RQCF_UPDATED 0x04 917#define RQCF_UPDATED 0x04
908 918
909static inline void assert_clock_updated(struct rq *rq) 919static inline void assert_clock_updated(struct rq *rq)
910{ 920{
@@ -1057,12 +1067,12 @@ extern void sched_ttwu_pending(void);
1057 1067
1058/** 1068/**
1059 * highest_flag_domain - Return highest sched_domain containing flag. 1069 * highest_flag_domain - Return highest sched_domain containing flag.
1060 * @cpu: The cpu whose highest level of sched domain is to 1070 * @cpu: The CPU whose highest level of sched domain is to
1061 * be returned. 1071 * be returned.
1062 * @flag: The flag to check for the highest sched_domain 1072 * @flag: The flag to check for the highest sched_domain
1063 * for the given cpu. 1073 * for the given CPU.
1064 * 1074 *
1065 * Returns the highest sched_domain of a cpu which contains the given flag. 1075 * Returns the highest sched_domain of a CPU which contains the given flag.
1066 */ 1076 */
1067static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1077static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1068{ 1078{
@@ -1097,30 +1107,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
1097DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1107DECLARE_PER_CPU(struct sched_domain *, sd_asym);
1098 1108
1099struct sched_group_capacity { 1109struct sched_group_capacity {
1100 atomic_t ref; 1110 atomic_t ref;
1101 /* 1111 /*
1102 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1112 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1103 * for a single CPU. 1113 * for a single CPU.
1104 */ 1114 */
1105 unsigned long capacity; 1115 unsigned long capacity;
1106 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1116 unsigned long min_capacity; /* Min per-CPU capacity in group */
1107 unsigned long next_update; 1117 unsigned long next_update;
1108 int imbalance; /* XXX unrelated to capacity but shared group state */ 1118 int imbalance; /* XXX unrelated to capacity but shared group state */
1109 1119
1110#ifdef CONFIG_SCHED_DEBUG 1120#ifdef CONFIG_SCHED_DEBUG
1111 int id; 1121 int id;
1112#endif 1122#endif
1113 1123
1114 unsigned long cpumask[0]; /* balance mask */ 1124 unsigned long cpumask[0]; /* Balance mask */
1115}; 1125};
1116 1126
1117struct sched_group { 1127struct sched_group {
1118 struct sched_group *next; /* Must be a circular list */ 1128 struct sched_group *next; /* Must be a circular list */
1119 atomic_t ref; 1129 atomic_t ref;
1120 1130
1121 unsigned int group_weight; 1131 unsigned int group_weight;
1122 struct sched_group_capacity *sgc; 1132 struct sched_group_capacity *sgc;
1123 int asym_prefer_cpu; /* cpu of highest priority in group */ 1133 int asym_prefer_cpu; /* CPU of highest priority in group */
1124 1134
1125 /* 1135 /*
1126 * The CPUs this group covers. 1136 * The CPUs this group covers.
@@ -1129,7 +1139,7 @@ struct sched_group {
1129 * by attaching extra space to the end of the structure, 1139 * by attaching extra space to the end of the structure,
1130 * depending on how many CPUs the kernel has booted up with) 1140 * depending on how many CPUs the kernel has booted up with)
1131 */ 1141 */
1132 unsigned long cpumask[0]; 1142 unsigned long cpumask[0];
1133}; 1143};
1134 1144
1135static inline struct cpumask *sched_group_span(struct sched_group *sg) 1145static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1146,8 +1156,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1146} 1156}
1147 1157
1148/** 1158/**
1149 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 1159 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1150 * @group: The group whose first cpu is to be returned. 1160 * @group: The group whose first CPU is to be returned.
1151 */ 1161 */
1152static inline unsigned int group_first_cpu(struct sched_group *group) 1162static inline unsigned int group_first_cpu(struct sched_group *group)
1153{ 1163{
@@ -1357,9 +1367,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1357/* 1367/*
1358 * wake flags 1368 * wake flags
1359 */ 1369 */
1360#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1370#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1361#define WF_FORK 0x02 /* child wakeup after fork */ 1371#define WF_FORK 0x02 /* Child wakeup after fork */
1362#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 1372#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
1363 1373
1364/* 1374/*
1365 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1375 * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1370,11 +1380,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1370 * slice expiry etc. 1380 * slice expiry etc.
1371 */ 1381 */
1372 1382
1373#define WEIGHT_IDLEPRIO 3 1383#define WEIGHT_IDLEPRIO 3
1374#define WMULT_IDLEPRIO 1431655765 1384#define WMULT_IDLEPRIO 1431655765
1375 1385
1376extern const int sched_prio_to_weight[40]; 1386extern const int sched_prio_to_weight[40];
1377extern const u32 sched_prio_to_wmult[40]; 1387extern const u32 sched_prio_to_wmult[40];
1378 1388
1379/* 1389/*
1380 * {de,en}queue flags: 1390 * {de,en}queue flags:
@@ -1396,9 +1406,9 @@ extern const u32 sched_prio_to_wmult[40];
1396 */ 1406 */
1397 1407
1398#define DEQUEUE_SLEEP 0x01 1408#define DEQUEUE_SLEEP 0x01
1399#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1409#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1400#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1410#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1401#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ 1411#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
1402 1412
1403#define ENQUEUE_WAKEUP 0x01 1413#define ENQUEUE_WAKEUP 0x01
1404#define ENQUEUE_RESTORE 0x02 1414#define ENQUEUE_RESTORE 0x02
@@ -1420,10 +1430,10 @@ struct sched_class {
1420 1430
1421 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1431 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1422 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1432 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1423 void (*yield_task) (struct rq *rq); 1433 void (*yield_task) (struct rq *rq);
1424 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1434 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1425 1435
1426 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1436 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1427 1437
1428 /* 1438 /*
1429 * It is the responsibility of the pick_next_task() method that will 1439 * It is the responsibility of the pick_next_task() method that will
@@ -1433,16 +1443,16 @@ struct sched_class {
1433 * May return RETRY_TASK when it finds a higher prio class has runnable 1443 * May return RETRY_TASK when it finds a higher prio class has runnable
1434 * tasks. 1444 * tasks.
1435 */ 1445 */
1436 struct task_struct * (*pick_next_task) (struct rq *rq, 1446 struct task_struct * (*pick_next_task)(struct rq *rq,
1437 struct task_struct *prev, 1447 struct task_struct *prev,
1438 struct rq_flags *rf); 1448 struct rq_flags *rf);
1439 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1449 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1440 1450
1441#ifdef CONFIG_SMP 1451#ifdef CONFIG_SMP
1442 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1452 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1443 void (*migrate_task_rq)(struct task_struct *p); 1453 void (*migrate_task_rq)(struct task_struct *p);
1444 1454
1445 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1455 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1446 1456
1447 void (*set_cpus_allowed)(struct task_struct *p, 1457 void (*set_cpus_allowed)(struct task_struct *p,
1448 const struct cpumask *newmask); 1458 const struct cpumask *newmask);
@@ -1451,31 +1461,31 @@ struct sched_class {
1451 void (*rq_offline)(struct rq *rq); 1461 void (*rq_offline)(struct rq *rq);
1452#endif 1462#endif
1453 1463
1454 void (*set_curr_task) (struct rq *rq); 1464 void (*set_curr_task)(struct rq *rq);
1455 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1465 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1456 void (*task_fork) (struct task_struct *p); 1466 void (*task_fork)(struct task_struct *p);
1457 void (*task_dead) (struct task_struct *p); 1467 void (*task_dead)(struct task_struct *p);
1458 1468
1459 /* 1469 /*
1460 * The switched_from() call is allowed to drop rq->lock, therefore we 1470 * The switched_from() call is allowed to drop rq->lock, therefore we
1461 * cannot assume the switched_from/switched_to pair is serliazed by 1471 * cannot assume the switched_from/switched_to pair is serliazed by
1462 * rq->lock. They are however serialized by p->pi_lock. 1472 * rq->lock. They are however serialized by p->pi_lock.
1463 */ 1473 */
1464 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1474 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1465 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1475 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1466 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1476 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1467 int oldprio); 1477 int oldprio);
1468 1478
1469 unsigned int (*get_rr_interval) (struct rq *rq, 1479 unsigned int (*get_rr_interval)(struct rq *rq,
1470 struct task_struct *task); 1480 struct task_struct *task);
1471 1481
1472 void (*update_curr) (struct rq *rq); 1482 void (*update_curr)(struct rq *rq);
1473 1483
1474#define TASK_SET_GROUP 0 1484#define TASK_SET_GROUP 0
1475#define TASK_MOVE_GROUP 1 1485#define TASK_MOVE_GROUP 1
1476 1486
1477#ifdef CONFIG_FAIR_GROUP_SCHED 1487#ifdef CONFIG_FAIR_GROUP_SCHED
1478 void (*task_change_group) (struct task_struct *p, int type); 1488 void (*task_change_group)(struct task_struct *p, int type);
1479#endif 1489#endif
1480}; 1490};
1481 1491
@@ -1524,6 +1534,7 @@ static inline void idle_set_state(struct rq *rq,
1524static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1534static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1525{ 1535{
1526 SCHED_WARN_ON(!rcu_read_lock_held()); 1536 SCHED_WARN_ON(!rcu_read_lock_held());
1537
1527 return rq->idle_state; 1538 return rq->idle_state;
1528} 1539}
1529#else 1540#else
@@ -1562,9 +1573,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1562extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1573extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1563extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1574extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1564 1575
1565#define BW_SHIFT 20 1576#define BW_SHIFT 20
1566#define BW_UNIT (1 << BW_SHIFT) 1577#define BW_UNIT (1 << BW_SHIFT)
1567#define RATIO_SHIFT 8 1578#define RATIO_SHIFT 8
1568unsigned long to_ratio(u64 period, u64 runtime); 1579unsigned long to_ratio(u64 period, u64 runtime);
1569 1580
1570extern void init_entity_runnable_average(struct sched_entity *se); 1581extern void init_entity_runnable_average(struct sched_entity *se);
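to_ratio(), declared next to these constants, converts a runtime/period pair into the same Q20 (BW_SHIFT = 20) bandwidth representation. A plausible userspace sketch of that conversion, not necessarily the exact kernel implementation:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT        20
#define BW_UNIT         (1 << BW_SHIFT)
#define RUNTIME_INF     ((uint64_t)~0ULL)

/* runtime/period expressed as a Q20 fraction of one CPU */
static unsigned long to_ratio(uint64_t period, uint64_t runtime)
{
        if (runtime == RUNTIME_INF)
                return BW_UNIT;
        if (period == 0)
                return 0;
        return (unsigned long)((runtime << BW_SHIFT) / period);
}

int main(void)
{
        /* 10 ms of runtime every 100 ms -> 10% of a CPU */
        unsigned long bw = to_ratio(100000000ULL, 10000000ULL);

        printf("bw = %lu (%.1f%% of BW_UNIT)\n", bw, 100.0 * bw / BW_UNIT);
        return 0;
}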
@@ -1814,8 +1825,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1814/* 1825/*
1815 * Unfair double_lock_balance: Optimizes throughput at the expense of 1826 * Unfair double_lock_balance: Optimizes throughput at the expense of
1816 * latency by eliminating extra atomic operations when the locks are 1827 * latency by eliminating extra atomic operations when the locks are
1817 * already in proper order on entry. This favors lower cpu-ids and will 1828 * already in proper order on entry. This favors lower CPU-ids and will
1818 * grant the double lock to lower cpus over higher ids under contention, 1829 * grant the double lock to lower CPUs over higher ids under contention,
1819 * regardless of entry order into the function. 1830 * regardless of entry order into the function.
1820 */ 1831 */
1821static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1832static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
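The "favor lower CPU-ids" rule above is the classic deadlock-avoidance trick of always taking two locks in one global order. A generic pthread sketch of the same idea, ordering by mutex address instead of CPU id:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Always lock the mutex with the lower address first, so two threads
 * locking the same pair can never end up waiting on each other. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t rq1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t rq2 = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&rq1, &rq2);        /* same order no matter who calls */
        double_unlock(&rq1, &rq2);
        printf("locked and unlocked both runqueues\n");
        return 0;
}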
@@ -1847,7 +1858,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1847static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1858static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1848{ 1859{
1849 if (unlikely(!irqs_disabled())) { 1860 if (unlikely(!irqs_disabled())) {
1850 /* printk() doesn't work good under rq->lock */ 1861 /* printk() doesn't work well under rq->lock */
1851 raw_spin_unlock(&this_rq->lock); 1862 raw_spin_unlock(&this_rq->lock);
1852 BUG_ON(1); 1863 BUG_ON(1);
1853 } 1864 }
@@ -2106,15 +2117,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2106#endif /* CONFIG_CPU_FREQ */ 2117#endif /* CONFIG_CPU_FREQ */
2107 2118
2108#ifdef arch_scale_freq_capacity 2119#ifdef arch_scale_freq_capacity
2109#ifndef arch_scale_freq_invariant 2120# ifndef arch_scale_freq_invariant
2110#define arch_scale_freq_invariant() (true) 2121# define arch_scale_freq_invariant() true
2111#endif 2122# endif
2112#else /* arch_scale_freq_capacity */ 2123#else
2113#define arch_scale_freq_invariant() (false) 2124# define arch_scale_freq_invariant() false
2114#endif 2125#endif
2115 2126
2116#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2127#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2117
2118static inline unsigned long cpu_util_dl(struct rq *rq) 2128static inline unsigned long cpu_util_dl(struct rq *rq)
2119{ 2129{
2120 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2130 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2124,5 +2134,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
2124{ 2134{
2125 return rq->cfs.avg.util_avg; 2135 return rq->cfs.avg.util_avg;
2126} 2136}
2127
2128#endif 2137#endif
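cpu_util_dl() above rescales the Q20 running_bw into capacity units. Assuming SCHED_CAPACITY_SCALE is 1024 (the usual mainline value), the arithmetic works out as follows:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT                20
#define BW_UNIT                 (1 << BW_SHIFT)
#define SCHED_CAPACITY_SCALE    1024

/* Q20 bandwidth -> capacity units, as in the helper above */
static unsigned long cpu_util_dl(uint64_t running_bw)
{
        return (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

int main(void)
{
        uint64_t running_bw = BW_UNIT / 4;      /* DL tasks using 25% of the CPU */

        /* 25% of SCHED_CAPACITY_SCALE == 256 */
        printf("cpu_util_dl = %lu\n", cpu_util_dl(running_bw));
        return 0;
}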
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..968c1fe3099a 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -78,8 +78,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
78 * This itererator needs some explanation. 78 * This itererator needs some explanation.
79 * It returns 1 for the header position. 79 * It returns 1 for the header position.
80 * This means 2 is cpu 0. 80 * This means 2 is cpu 0.
81 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 81 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
82 * to use cpumask_* to iterate over the cpus. 82 * to use cpumask_* to iterate over the CPUs.
83 */ 83 */
84static void *schedstat_start(struct seq_file *file, loff_t *offset) 84static void *schedstat_start(struct seq_file *file, loff_t *offset)
85{ 85{
@@ -99,12 +99,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
99 99
100 if (n < nr_cpu_ids) 100 if (n < nr_cpu_ids)
101 return (void *)(unsigned long)(n + 2); 101 return (void *)(unsigned long)(n + 2);
102
102 return NULL; 103 return NULL;
103} 104}
104 105
105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) 106static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
106{ 107{
107 (*offset)++; 108 (*offset)++;
109
108 return schedstat_start(file, offset); 110 return schedstat_start(file, offset);
109} 111}
110 112
@@ -134,6 +136,7 @@ static const struct file_operations proc_schedstat_operations = {
134static int __init proc_schedstat_init(void) 136static int __init proc_schedstat_init(void)
135{ 137{
136 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 138 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
139
137 return 0; 140 return 0;
138} 141}
139subsys_initcall(proc_schedstat_init); 142subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
30 if (rq) 30 if (rq)
31 rq->rq_sched_info.run_delay += delta; 31 rq->rq_sched_info.run_delay += delta;
32} 32}
33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
34#define __schedstat_inc(var) do { var++; } while (0) 34#define __schedstat_inc(var) do { var++; } while (0)
35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
36#define __schedstat_add(var, amt) do { var += (amt); } while (0) 36#define __schedstat_add(var, amt) do { var += (amt); } while (0)
37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) 37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
38#define __schedstat_set(var, val) do { var = (val); } while (0) 38#define __schedstat_set(var, val) do { var = (val); } while (0)
39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
40#define schedstat_val(var) (var) 40#define schedstat_val(var) (var)
41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) 41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
42 42
43#else /* !CONFIG_SCHEDSTATS */ 43#else /* !CONFIG_SCHEDSTATS: */
44static inline void 44static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
45rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 45static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
46{} 46static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
47static inline void 47# define schedstat_enabled() 0
48rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) 48# define __schedstat_inc(var) do { } while (0)
49{} 49# define schedstat_inc(var) do { } while (0)
50static inline void 50# define __schedstat_add(var, amt) do { } while (0)
51rq_sched_info_depart(struct rq *rq, unsigned long long delta) 51# define schedstat_add(var, amt) do { } while (0)
52{} 52# define __schedstat_set(var, val) do { } while (0)
53#define schedstat_enabled() 0 53# define schedstat_set(var, val) do { } while (0)
54#define __schedstat_inc(var) do { } while (0) 54# define schedstat_val(var) 0
55#define schedstat_inc(var) do { } while (0) 55# define schedstat_val_or_zero(var) 0
56#define __schedstat_add(var, amt) do { } while (0)
57#define schedstat_add(var, amt) do { } while (0)
58#define __schedstat_set(var, val) do { } while (0)
59#define schedstat_set(var, val) do { } while (0)
60#define schedstat_val(var) 0
61#define schedstat_val_or_zero(var) 0
62#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
63 57
64#ifdef CONFIG_SCHED_INFO 58#ifdef CONFIG_SCHED_INFO
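The schedstat_*() macros above expand to a guarded increment when CONFIG_SCHEDSTATS=y and to nothing otherwise. A userspace imitation of that pattern, with an ordinary boolean standing in for the sched_schedstats static key:

#include <stdbool.h>
#include <stdio.h>

static bool sched_schedstats = true;    /* stand-in for the static key */

#define schedstat_enabled()     (sched_schedstats)
#define __schedstat_inc(var)    do { (var)++; } while (0)
#define schedstat_inc(var)      do { if (schedstat_enabled()) { (var)++; } } while (0)
#define schedstat_add(var, amt) do { if (schedstat_enabled()) { (var) += (amt); } } while (0)

int main(void)
{
        unsigned int yld_count = 0;
        unsigned long long rq_cpu_time = 0;

        schedstat_inc(yld_count);
        schedstat_add(rq_cpu_time, 1500);

        sched_schedstats = false;       /* statistics disabled at runtime */
        schedstat_inc(yld_count);       /* no effect now */

        printf("yld_count=%u rq_cpu_time=%llu\n", yld_count, rq_cpu_time);
        return 0;
}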
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
69 63
70/* 64/*
71 * We are interested in knowing how long it was from the *first* time a 65 * We are interested in knowing how long it was from the *first* time a
72 * task was queued to the time that it finally hit a cpu, we call this routine 66 * task was queued to the time that it finally hit a CPU, we call this routine
73 * from dequeue_task() to account for possible rq->clock skew across cpus. The 67 * from dequeue_task() to account for possible rq->clock skew across CPUs. The
74 * delta taken on each cpu would annul the skew. 68 * delta taken on each CPU would annul the skew.
75 */ 69 */
76static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) 70static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
77{ 71{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
87} 81}
88 82
89/* 83/*
90 * Called when a task finally hits the cpu. We can now calculate how 84 * Called when a task finally hits the CPU. We can now calculate how
91 * long it was waiting to run. We also note when it began so that we 85 * long it was waiting to run. We also note when it began so that we
92 * can keep stats on how long its timeslice is. 86 * can keep stats on how long its timeslice is.
93 */ 87 */
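Taken together, these helpers implement a simple delay-accounting scheme: stamp the task when it first becomes runnable, and when it finally gets a CPU add now - last_queued to its cumulative run delay. A self-contained sketch of that bookkeeping (field and function names invented for the example):

#include <stdint.h>
#include <stdio.h>

struct toy_sched_info {
        uint64_t last_queued;   /* when the task last became runnable       */
        uint64_t run_delay;     /* total time spent waiting on a runqueue   */
        uint64_t pcount;        /* number of times it was given a CPU       */
};

static void info_queued(struct toy_sched_info *si, uint64_t now)
{
        if (!si->last_queued)           /* only the *first* queueing counts */
                si->last_queued = now;
}

static void info_arrive(struct toy_sched_info *si, uint64_t now)
{
        if (si->last_queued) {
                si->run_delay += now - si->last_queued;
                si->last_queued = 0;
        }
        si->pcount++;
}

int main(void)
{
        struct toy_sched_info si = { 0 };

        info_queued(&si, 1000);         /* becomes runnable at t=1000 */
        info_queued(&si, 1500);         /* re-queued; first timestamp is kept */
        info_arrive(&si, 2000);         /* gets a CPU at t=2000 */

        printf("run_delay=%llu pcount=%llu\n",
               (unsigned long long)si.run_delay,
               (unsigned long long)si.pcount);
        return 0;
}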
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
112 */ 106 */
113static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 107static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
114{ 108{
115 if (unlikely(sched_info_on())) 109 if (unlikely(sched_info_on())) {
116 if (!t->sched_info.last_queued) 110 if (!t->sched_info.last_queued)
117 t->sched_info.last_queued = rq_clock(rq); 111 t->sched_info.last_queued = rq_clock(rq);
112 }
118} 113}
119 114
120/* 115/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
127 */ 122 */
128static inline void sched_info_depart(struct rq *rq, struct task_struct *t) 123static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
129{ 124{
130 unsigned long long delta = rq_clock(rq) - 125 unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
131 t->sched_info.last_arrival;
132 126
133 rq_sched_info_depart(rq, delta); 127 rq_sched_info_depart(rq, delta);
134 128
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
142 * the idle task.) We are only called when prev != next. 136 * the idle task.) We are only called when prev != next.
143 */ 137 */
144static inline void 138static inline void
145__sched_info_switch(struct rq *rq, 139__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
146 struct task_struct *prev, struct task_struct *next)
147{ 140{
148 /* 141 /*
149 * prev now departs the cpu. It's not interesting to record 142 * prev now departs the CPU. It's not interesting to record
150 * stats about how efficient we were at scheduling the idle 143 * stats about how efficient we were at scheduling the idle
151 * process, however. 144 * process, however.
152 */ 145 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
156 if (next != rq->idle) 149 if (next != rq->idle)
157 sched_info_arrive(rq, next); 150 sched_info_arrive(rq, next);
158} 151}
152
159static inline void 153static inline void
160sched_info_switch(struct rq *rq, 154sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
161 struct task_struct *prev, struct task_struct *next)
162{ 155{
163 if (unlikely(sched_info_on())) 156 if (unlikely(sched_info_on()))
164 __sched_info_switch(rq, prev, next); 157 __sched_info_switch(rq, prev, next);
165} 158}
166#else 159
167#define sched_info_queued(rq, t) do { } while (0) 160#else /* !CONFIG_SCHED_INFO: */
168#define sched_info_reset_dequeued(t) do { } while (0) 161# define sched_info_queued(rq, t) do { } while (0)
169#define sched_info_dequeued(rq, t) do { } while (0) 162# define sched_info_reset_dequeued(t) do { } while (0)
170#define sched_info_depart(rq, t) do { } while (0) 163# define sched_info_dequeued(rq, t) do { } while (0)
171#define sched_info_arrive(rq, next) do { } while (0) 164# define sched_info_depart(rq, t) do { } while (0)
172#define sched_info_switch(rq, t, next) do { } while (0) 165# define sched_info_arrive(rq, next) do { } while (0)
166# define sched_info_switch(rq, t, next) do { } while (0)
173#endif /* CONFIG_SCHED_INFO */ 167#endif /* CONFIG_SCHED_INFO */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index ea8d2b6a1239..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/* 2/*
5 * stop-task scheduling class. 3 * stop-task scheduling class.
6 * 4 *
@@ -9,6 +7,7 @@
9 * 7 *
10 * See kernel/stop_machine.c 8 * See kernel/stop_machine.c
11 */ 9 */
10#include "sched.h"
12 11
13#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
14static int 13static int
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b88ab4e0207f 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,4 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2/*
3 * <linux/swait.h> (simple wait queues ) implementation:
4 */
2#include <linux/sched/signal.h> 5#include <linux/sched/signal.h>
3#include <linux/swait.h> 6#include <linux/swait.h>
4 7
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..219eee70e457 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -41,8 +41,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
41 if (!(sd->flags & SD_LOAD_BALANCE)) { 41 if (!(sd->flags & SD_LOAD_BALANCE)) {
42 printk("does not load-balance\n"); 42 printk("does not load-balance\n");
43 if (sd->parent) 43 if (sd->parent)
44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
45 " has parent");
46 return -1; 45 return -1;
47 } 46 }
48 47
@@ -50,12 +49,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
50 cpumask_pr_args(sched_domain_span(sd)), sd->name); 49 cpumask_pr_args(sched_domain_span(sd)), sd->name);
51 50
52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 51 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
53 printk(KERN_ERR "ERROR: domain->span does not contain " 52 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
54 "CPU%d\n", cpu);
55 } 53 }
56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) { 54 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
57 printk(KERN_ERR "ERROR: domain->groups does not contain" 55 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58 " CPU%d\n", cpu);
59 } 56 }
60 57
61 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 58 printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +112,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
115 112
116 if (sd->parent && 113 if (sd->parent &&
117 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 114 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
118 printk(KERN_ERR "ERROR: parent span is not a superset " 115 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
119 "of domain->span\n");
120 return 0; 116 return 0;
121} 117}
122 118
@@ -595,7 +591,7 @@ int group_balance_cpu(struct sched_group *sg)
595 * are not. 591 * are not.
596 * 592 *
597 * This leads to a few particularly weird cases where the sched_domain's are 593 * This leads to a few particularly weird cases where the sched_domain's are
598 * not of the same number for each cpu. Consider: 594 * not of the same number for each CPU. Consider:
599 * 595 *
600 * NUMA-2 0-3 0-3 596 * NUMA-2 0-3 0-3
601 * groups: {0-2},{1-3} {1-3},{0-2} 597 * groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +776,7 @@ fail:
780 * ^ ^ ^ ^ 776 * ^ ^ ^ ^
781 * `-' `-' 777 * `-' `-'
782 * 778 *
783 * The sched_domains are per-cpu and have a two way link (parent & child) and 779 * The sched_domains are per-CPU and have a two way link (parent & child) and
784 * denote the ever growing mask of CPUs belonging to that level of topology. 780 * denote the ever growing mask of CPUs belonging to that level of topology.
785 * 781 *
786 * Each sched_domain has a circular (double) linked list of sched_group's, each 782 * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1017,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1021 d->rd = alloc_rootdomain(); 1017 d->rd = alloc_rootdomain();
1022 if (!d->rd) 1018 if (!d->rd)
1023 return sa_sd; 1019 return sa_sd;
1020
1024 return sa_rootdomain; 1021 return sa_rootdomain;
1025} 1022}
1026 1023
@@ -1047,12 +1044,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
1047} 1044}
1048 1045
1049#ifdef CONFIG_NUMA 1046#ifdef CONFIG_NUMA
1050static int sched_domains_numa_levels;
1051enum numa_topology_type sched_numa_topology_type; 1047enum numa_topology_type sched_numa_topology_type;
1052static int *sched_domains_numa_distance; 1048
1053int sched_max_numa_distance; 1049static int sched_domains_numa_levels;
1054static struct cpumask ***sched_domains_numa_masks; 1050static int sched_domains_curr_level;
1055static int sched_domains_curr_level; 1051
1052int sched_max_numa_distance;
1053static int *sched_domains_numa_distance;
1054static struct cpumask ***sched_domains_numa_masks;
1056#endif 1055#endif
1057 1056
1058/* 1057/*
@@ -1074,11 +1073,11 @@ static int sched_domains_curr_level;
1074 * SD_ASYM_PACKING - describes SMT quirks 1073 * SD_ASYM_PACKING - describes SMT quirks
1075 */ 1074 */
1076#define TOPOLOGY_SD_FLAGS \ 1075#define TOPOLOGY_SD_FLAGS \
1077 (SD_SHARE_CPUCAPACITY | \ 1076 (SD_SHARE_CPUCAPACITY | \
1078 SD_SHARE_PKG_RESOURCES | \ 1077 SD_SHARE_PKG_RESOURCES | \
1079 SD_NUMA | \ 1078 SD_NUMA | \
1080 SD_ASYM_PACKING | \ 1079 SD_ASYM_PACKING | \
1081 SD_ASYM_CPUCAPACITY | \ 1080 SD_ASYM_CPUCAPACITY | \
1082 SD_SHARE_POWERDOMAIN) 1081 SD_SHARE_POWERDOMAIN)
1083 1082
1084static struct sched_domain * 1083static struct sched_domain *
@@ -1628,7 +1627,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
1628 pr_err(" the %s domain not a subset of the %s domain\n", 1627 pr_err(" the %s domain not a subset of the %s domain\n",
1629 child->name, sd->name); 1628 child->name, sd->name);
1630#endif 1629#endif
1631 /* Fixup, ensure @sd has at least @child cpus. */ 1630 /* Fixup, ensure @sd has at least @child CPUs. */
1632 cpumask_or(sched_domain_span(sd), 1631 cpumask_or(sched_domain_span(sd),
1633 sched_domain_span(sd), 1632 sched_domain_span(sd),
1634 sched_domain_span(child)); 1633 sched_domain_span(child));
@@ -1720,6 +1719,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1720 ret = 0; 1719 ret = 0;
1721error: 1720error:
1722 __free_domain_allocs(&d, alloc_state, cpu_map); 1721 __free_domain_allocs(&d, alloc_state, cpu_map);
1722
1723 return ret; 1723 return ret;
1724} 1724}
1725 1725
@@ -1824,6 +1824,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1824 return 1; 1824 return 1;
1825 1825
1826 tmp = SD_ATTR_INIT; 1826 tmp = SD_ATTR_INIT;
1827
1827 return !memcmp(cur ? (cur + idx_cur) : &tmp, 1828 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1828 new ? (new + idx_new) : &tmp, 1829 new ? (new + idx_new) : &tmp,
1829 sizeof(struct sched_domain_attr)); 1830 sizeof(struct sched_domain_attr));
@@ -1929,4 +1930,3 @@ match2:
1929 1930
1930 mutex_unlock(&sched_domains_mutex); 1931 mutex_unlock(&sched_domains_mutex);
1931} 1932}
1932
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..7b2a142ae629 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -107,6 +107,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
107 break; 107 break;
108 } 108 }
109 } 109 }
110
110 return nr_exclusive; 111 return nr_exclusive;
111} 112}
112 113
@@ -317,6 +318,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
317 spin_unlock(&wq->lock); 318 spin_unlock(&wq->lock);
318 schedule(); 319 schedule();
319 spin_lock(&wq->lock); 320 spin_lock(&wq->lock);
321
320 return 0; 322 return 0;
321} 323}
322EXPORT_SYMBOL(do_wait_intr); 324EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +335,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
333 spin_unlock_irq(&wq->lock); 335 spin_unlock_irq(&wq->lock);
334 schedule(); 336 schedule();
335 spin_lock_irq(&wq->lock); 337 spin_lock_irq(&wq->lock);
338
336 return 0; 339 return 0;
337} 340}
338EXPORT_SYMBOL(do_wait_intr_irq); 341EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +381,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
378 381
379 if (ret) 382 if (ret)
380 list_del_init(&wq_entry->entry); 383 list_del_init(&wq_entry->entry);
384
381 return ret; 385 return ret;
382} 386}
383EXPORT_SYMBOL(autoremove_wake_function); 387EXPORT_SYMBOL(autoremove_wake_function);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..5293c59163a6 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -29,8 +29,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
29 wait_bit->key.bit_nr != key->bit_nr || 29 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags)) 30 test_bit(key->bit_nr, key->flags))
31 return 0; 31 return 0;
32 else 32
33 return autoremove_wake_function(wq_entry, mode, sync, key); 33 return autoremove_wake_function(wq_entry, mode, sync, key);
34} 34}
35EXPORT_SYMBOL(wake_bit_function); 35EXPORT_SYMBOL(wake_bit_function);
36 36
@@ -50,7 +50,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) 50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51 ret = (*action)(&wbq_entry->key, mode); 51 ret = (*action)(&wbq_entry->key, mode);
52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); 52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
53
53 finish_wait(wq_head, &wbq_entry->wq_entry); 54 finish_wait(wq_head, &wbq_entry->wq_entry);
55
54 return ret; 56 return ret;
55} 57}
56EXPORT_SYMBOL(__wait_on_bit); 58EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +75,7 @@ int __sched out_of_line_wait_on_bit_timeout(
73 DEFINE_WAIT_BIT(wq_entry, word, bit); 75 DEFINE_WAIT_BIT(wq_entry, word, bit);
74 76
75 wq_entry.key.timeout = jiffies + timeout; 77 wq_entry.key.timeout = jiffies + timeout;
78
76 return __wait_on_bit(wq_head, &wq_entry, action, mode); 79 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77} 80}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); 81EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +123,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 123void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{ 124{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 125 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
126
123 if (waitqueue_active(wq_head)) 127 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key); 128 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125} 129}
@@ -157,6 +161,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{ 161{
158 if (BITS_PER_LONG == 64) { 162 if (BITS_PER_LONG == 64) {
159 unsigned long q = (unsigned long)p; 163 unsigned long q = (unsigned long)p;
164
160 return bit_waitqueue((void *)(q & ~1), q & 1); 165 return bit_waitqueue((void *)(q & ~1), q & 1);
161 } 166 }
162 return bit_waitqueue(p, 0); 167 return bit_waitqueue(p, 0);
@@ -173,6 +178,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
173 wait_bit->key.bit_nr != key->bit_nr || 178 wait_bit->key.bit_nr != key->bit_nr ||
174 atomic_read(val) != 0) 179 atomic_read(val) != 0)
175 return 0; 180 return 0;
181
176 return autoremove_wake_function(wq_entry, mode, sync, key); 182 return autoremove_wake_function(wq_entry, mode, sync, key);
177} 183}
178 184
@@ -196,6 +202,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
196 ret = (*action)(val, mode); 202 ret = (*action)(val, mode);
197 } while (!ret && atomic_read(val) != 0); 203 } while (!ret && atomic_read(val) != 0);
198 finish_wait(wq_head, &wbq_entry->wq_entry); 204 finish_wait(wq_head, &wbq_entry->wq_entry);
205
199 return ret; 206 return ret;
200} 207}
201 208
@@ -226,6 +233,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
226 schedule(); 233 schedule();
227 if (signal_pending_state(mode, current)) 234 if (signal_pending_state(mode, current))
228 return -EINTR; 235 return -EINTR;
236
229 return 0; 237 return 0;
230} 238}
231EXPORT_SYMBOL(atomic_t_wait); 239EXPORT_SYMBOL(atomic_t_wait);
@@ -250,6 +258,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
250 schedule(); 258 schedule();
251 if (signal_pending_state(mode, current)) 259 if (signal_pending_state(mode, current))
252 return -EINTR; 260 return -EINTR;
261
253 return 0; 262 return 0;
254} 263}
255EXPORT_SYMBOL(bit_wait); 264EXPORT_SYMBOL(bit_wait);
@@ -259,6 +268,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
259 io_schedule(); 268 io_schedule();
260 if (signal_pending_state(mode, current)) 269 if (signal_pending_state(mode, current))
261 return -EINTR; 270 return -EINTR;
271
262 return 0; 272 return 0;
263} 273}
264EXPORT_SYMBOL(bit_wait_io); 274EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +276,13 @@ EXPORT_SYMBOL(bit_wait_io);
266__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) 276__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
267{ 277{
268 unsigned long now = READ_ONCE(jiffies); 278 unsigned long now = READ_ONCE(jiffies);
279
269 if (time_after_eq(now, word->timeout)) 280 if (time_after_eq(now, word->timeout))
270 return -EAGAIN; 281 return -EAGAIN;
271 schedule_timeout(word->timeout - now); 282 schedule_timeout(word->timeout - now);
272 if (signal_pending_state(mode, current)) 283 if (signal_pending_state(mode, current))
273 return -EINTR; 284 return -EINTR;
285
274 return 0; 286 return 0;
275} 287}
276EXPORT_SYMBOL_GPL(bit_wait_timeout); 288EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +290,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
278__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 290__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
279{ 291{
280 unsigned long now = READ_ONCE(jiffies); 292 unsigned long now = READ_ONCE(jiffies);
293
281 if (time_after_eq(now, word->timeout)) 294 if (time_after_eq(now, word->timeout))
282 return -EAGAIN; 295 return -EAGAIN;
283 io_schedule_timeout(word->timeout - now); 296 io_schedule_timeout(word->timeout - now);
284 if (signal_pending_state(mode, current)) 297 if (signal_pending_state(mode, current))
285 return -EINTR; 298 return -EINTR;
299
286 return 0; 300 return 0;
287} 301}
288EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 302EXPORT_SYMBOL_GPL(bit_wait_io_timeout);