aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-03-06 11:14:05 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2019-03-06 11:14:05 -0500
commit45802da05e666a81b421422d3e302930c0e24e77 (patch)
treefeca43796693395bb2912c59768dc809022e7583
parent203b6609e0ede49eb0b97008b1150c69e9d2ffd3 (diff)
parentad01423aedaa7c6dd62d560b73a3cb39e6da3901 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - refcount conversions - Solve the rq->leaf_cfs_rq_list can of worms for real. - improve power-aware scheduling - add sysctl knob for Energy Aware Scheduling - documentation updates - misc other changes" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits) kthread: Do not use TIMER_IRQSAFE kthread: Convert worker lock to raw spinlock sched/fair: Use non-atomic cpumask_{set,clear}_cpu() sched/fair: Remove unused 'sd' parameter from select_idle_smt() sched/wait: Use freezable_schedule() when possible sched/fair: Prune, fix and simplify the nohz_balancer_kick() comment block sched/fair: Explain LLC nohz kick condition sched/fair: Simplify nohz_balancer_kick() sched/topology: Fix percpu data types in struct sd_data & struct s_data sched/fair: Simplify post_init_entity_util_avg() by calling it with a task_struct pointer argument sched/fair: Fix O(nr_cgroups) in the load balancing path sched/fair: Optimize update_blocked_averages() sched/fair: Fix insertion in rq->leaf_cfs_rq_list sched/fair: Add tmp_alone_branch assertion sched/core: Use READ_ONCE()/WRITE_ONCE() in move_queued_task()/task_rq_lock() sched/debug: Initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK sched/pelt: Skip updating util_est when utilization is higher than CPU's capacity sched/fair: Update scale invariance of PELT sched/fair: Move the rq_of() helper function sched/core: Convert task_struct.stack_refcount to refcount_t ...
-rw-r--r--Documentation/power/energy-model.txt144
-rw-r--r--Documentation/scheduler/sched-energy.txt425
-rw-r--r--Documentation/sysctl/kernel.txt12
-rw-r--r--MAINTAINERS9
-rw-r--r--fs/exec.c4
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--include/linux/init_task.h1
-rw-r--r--include/linux/kthread.h9
-rw-r--r--include/linux/sched.h33
-rw-r--r--include/linux/sched/signal.h5
-rw-r--r--include/linux/sched/sysctl.h7
-rw-r--r--include/linux/sched/task.h4
-rw-r--r--include/linux/sched/task_stack.h2
-rw-r--r--include/linux/sched/topology.h8
-rw-r--r--include/linux/wait.h6
-rw-r--r--init/init_task.c6
-rw-r--r--kernel/fork.c24
-rw-r--r--kernel/kthread.c43
-rw-r--r--kernel/sched/core.c12
-rw-r--r--kernel/sched/deadline.c6
-rw-r--r--kernel/sched/debug.c4
-rw-r--r--kernel/sched/fair.c458
-rw-r--r--kernel/sched/isolation.c2
-rw-r--r--kernel/sched/pelt.c45
-rw-r--r--kernel/sched/pelt.h114
-rw-r--r--kernel/sched/rt.c6
-rw-r--r--kernel/sched/sched.h54
-rw-r--r--kernel/sched/topology.c33
-rw-r--r--kernel/sysctl.c11
29 files changed, 1165 insertions, 324 deletions
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
new file mode 100644
index 000000000000..a2b0ae4c76bd
--- /dev/null
+++ b/Documentation/power/energy-model.txt
@@ -0,0 +1,144 @@
1 ====================
2 Energy Model of CPUs
3 ====================
4
51. Overview
6-----------
7
8The Energy Model (EM) framework serves as an interface between drivers knowing
9the power consumed by CPUs at various performance levels, and the kernel
10subsystems willing to use that information to make energy-aware decisions.
11
12The source of the information about the power consumed by CPUs can vary greatly
13from one platform to another. These power costs can be estimated using
14devicetree data in some cases. In others, the firmware will know better.
15Alternatively, userspace might be best positioned. And so on. In order to avoid
16each and every client subsystem having to re-implement support for each and every
17possible source of information on its own, the EM framework intervenes as an
18abstraction layer which standardizes the format of power cost tables in the
19kernel, hence enabling to avoid redundant work.
20
21The figure below depicts an example of drivers (Arm-specific here, but the
22approach is applicable to any architecture) providing power costs to the EM
23framework, and interested clients reading the data from it.
24
25 +---------------+ +-----------------+ +---------------+
26 | Thermal (IPA) | | Scheduler (EAS) | | Other |
27 +---------------+ +-----------------+ +---------------+
28 | | em_pd_energy() |
29 | | em_cpu_get() |
30 +---------+ | +---------+
31 | | |
32 v v v
33 +---------------------+
34 | Energy Model |
35 | Framework |
36 +---------------------+
37 ^ ^ ^
38 | | | em_register_perf_domain()
39 +----------+ | +---------+
40 | | |
41 +---------------+ +---------------+ +--------------+
42 | cpufreq-dt | | arm_scmi | | Other |
43 +---------------+ +---------------+ +--------------+
44 ^ ^ ^
45 | | |
46 +--------------+ +---------------+ +--------------+
47 | Device Tree | | Firmware | | ? |
48 +--------------+ +---------------+ +--------------+
49
50The EM framework manages power cost tables per 'performance domain' in the
51system. A performance domain is a group of CPUs whose performance is scaled
52together. Performance domains generally have a 1-to-1 mapping with CPUFreq
53policies. All CPUs in a performance domain are required to have the same
54micro-architecture. CPUs in different performance domains can have different
55micro-architectures.
56
57
582. Core APIs
59------------
60
61 2.1 Config options
62
63CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
64
65
66 2.2 Registration of performance domains
67
68Drivers are expected to register performance domains into the EM framework by
69calling the following API:
70
71 int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
72 struct em_data_callback *cb);
73
74Drivers must specify the CPUs of the performance domains using the cpumask
75argument, and provide a callback function returning <frequency, power> tuples
76for each capacity state. The callback function provided by the driver is free
77to fetch data from any relevant location (DT, firmware, ...), and by any means
78deemed necessary. See Section 3. for an example of driver implementing this
79callback, and kernel/power/energy_model.c for further documentation on this
80API.
81
82
83 2.3 Accessing performance domains
84
85Subsystems interested in the energy model of a CPU can retrieve it using the
86em_cpu_get() API. The energy model tables are allocated once upon creation of
87the performance domains, and kept in memory untouched.
88
89The energy consumed by a performance domain can be estimated using the
90em_pd_energy() API. The estimation is performed assuming that the schedutil
91CPUfreq governor is in use.
92
93More details about the above APIs can be found in include/linux/energy_model.h.
94
95
963. Example driver
97-----------------
98
99This section provides a simple example of a CPUFreq driver registering a
100performance domain in the Energy Model framework using the (fake) 'foo'
101protocol. The driver implements an est_power() function to be provided to the
102EM framework.
103
104 -> drivers/cpufreq/foo_cpufreq.c
105
10601 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
10702 {
10803 long freq, power;
10904
11005 /* Use the 'foo' protocol to ceil the frequency */
11106 freq = foo_get_freq_ceil(cpu, *KHz);
11207	 if (freq < 0)
11308 return freq;
11409
11510 /* Estimate the power cost for the CPU at the relevant freq. */
11611 power = foo_estimate_power(cpu, freq);
11712	 if (power < 0)
11813 return power;
11914
12015 /* Return the values to the EM framework */
12116 *mW = power;
12217 *KHz = freq;
12318
12419 return 0;
12520 }
12621
12722 static int foo_cpufreq_init(struct cpufreq_policy *policy)
12823 {
12924 struct em_data_callback em_cb = EM_DATA_CB(est_power);
13025 int nr_opp, ret;
13126
13227 /* Do the actual CPUFreq init work ... */
13328 ret = do_foo_cpufreq_init(policy);
13429 if (ret)
13530 return ret;
13631
13732 /* Find the number of OPPs for this policy */
13833 nr_opp = foo_get_nr_opp(policy);
13934
14035 /* And register the new performance domain */
14136 em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
14237
14338 return 0;
14439 }
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
new file mode 100644
index 000000000000..197d81f4b836
--- /dev/null
+++ b/Documentation/scheduler/sched-energy.txt
@@ -0,0 +1,425 @@
1 =======================
2 Energy Aware Scheduling
3 =======================
4
51. Introduction
6---------------
7
8Energy Aware Scheduling (or EAS) gives the scheduler the ability to predict
9the impact of its decisions on the energy consumed by CPUs. EAS relies on an
10Energy Model (EM) of the CPUs to select an energy efficient CPU for each task,
11with a minimal impact on throughput. This document aims at providing an
12introduction on how EAS works, what are the main design decisions behind it, and
13details what is needed to get it to run.
14
15Before going any further, please note that at the time of writing:
16
17 /!\ EAS does not support platforms with symmetric CPU topologies /!\
18
19EAS operates only on heterogeneous CPU topologies (such as Arm big.LITTLE)
20because this is where the potential for saving energy through scheduling is
21the highest.
22
23The actual EM used by EAS is _not_ maintained by the scheduler, but by a
24dedicated framework. For details about this framework and what it provides,
25please refer to its documentation (see Documentation/power/energy-model.txt).
26
27
282. Background and Terminology
29-----------------------------
30
31To make it clear from the start:
32 - energy = [joule] (resource like a battery on powered devices)
33 - power = energy/time = [joule/second] = [watt]
34
35The goal of EAS is to minimize energy, while still getting the job done. That
36is, we want to maximize:
37
38 performance [inst/s]
39 --------------------
40 power [W]
41
42which is equivalent to minimizing:
43
44 energy [J]
45 -----------
46 instruction
47
48while still getting 'good' performance. It is essentially an alternative
49optimization objective to the current performance-only objective for the
50scheduler. This alternative considers two objectives: energy-efficiency and
51performance.
52
53The idea behind introducing an EM is to allow the scheduler to evaluate the
54implications of its decisions rather than blindly applying energy-saving
55techniques that may have positive effects only on some platforms. At the same
56time, the EM must be as simple as possible to minimize the scheduler latency
57impact.
58
59In short, EAS changes the way CFS tasks are assigned to CPUs. When it is time
60for the scheduler to decide where a task should run (during wake-up), the EM
61is used to break the tie between several good CPU candidates and pick the one
62that is predicted to yield the best energy consumption without harming the
63system's throughput. The predictions made by EAS rely on specific elements of
64knowledge about the platform's topology, which include the 'capacity' of CPUs,
65and their respective energy costs.
66
67
683. Topology information
69-----------------------
70
71EAS (as well as the rest of the scheduler) uses the notion of 'capacity' to
72differentiate CPUs with different computing throughput. The 'capacity' of a CPU
73represents the amount of work it can absorb when running at its highest
74frequency compared to the most capable CPU of the system. Capacity values are
75normalized in a 1024 range, and are comparable with the utilization signals of
76tasks and CPUs computed by the Per-Entity Load Tracking (PELT) mechanism. Thanks
77to capacity and utilization values, EAS is able to estimate how big/busy a
78task/CPU is, and to take this into consideration when evaluating performance vs
79energy trade-offs. The capacity of CPUs is provided via arch-specific code
80through the arch_scale_cpu_capacity() callback.
81
82The rest of platform knowledge used by EAS is directly read from the Energy
83Model (EM) framework. The EM of a platform is composed of a power cost table
84per 'performance domain' in the system (see Documentation/power/energy-model.txt
85for further details about performance domains).
86
87The scheduler manages references to the EM objects in the topology code when the
88scheduling domains are built, or re-built. For each root domain (rd), the
89scheduler maintains a singly linked list of all performance domains intersecting
90the current rd->span. Each node in the list contains a pointer to a struct
91em_perf_domain as provided by the EM framework.
92
93The lists are attached to the root domains in order to cope with exclusive
94cpuset configurations. Since the boundaries of exclusive cpusets do not
95necessarily match those of performance domains, the lists of different root
96domains can contain duplicate elements.
97
98Example 1.
99 Let us consider a platform with 12 CPUs, split in 3 performance domains
100 (pd0, pd4 and pd8), organized as follows:
101
102 CPUs: 0 1 2 3 4 5 6 7 8 9 10 11
103 PDs: |--pd0--|--pd4--|---pd8---|
104 RDs: |----rd1----|-----rd2-----|
105
106 Now, consider that userspace decided to split the system with two
107 exclusive cpusets, hence creating two independent root domains, each
108 containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the
109 above figure. Since pd4 intersects with both rd1 and rd2, it will be
110 present in the linked list '->pd' attached to each of them:
111 * rd1->pd: pd0 -> pd4
112 * rd2->pd: pd4 -> pd8
113
114 Please note that the scheduler will create two duplicate list nodes for
115 pd4 (one for each list). However, both just hold a pointer to the same
116 shared data structure of the EM framework.
117
118Since the access to these lists can happen concurrently with hotplug and other
119things, they are protected by RCU, like the rest of topology structures
120manipulated by the scheduler.
121
122EAS also maintains a static key (sched_energy_present) which is enabled when at
123least one root domain meets all conditions for EAS to start. Those conditions
124are summarized in Section 6.
125
126
1274. Energy-Aware task placement
128------------------------------
129
130EAS overrides the CFS task wake-up balancing code. It uses the EM of the
131platform and the PELT signals to choose an energy-efficient target CPU during
132wake-up balance. When EAS is enabled, select_task_rq_fair() calls
133find_energy_efficient_cpu() to do the placement decision. This function looks
134for the CPU with the highest spare capacity (CPU capacity - CPU utilization) in
135each performance domain since it is the one which will allow us to keep the
136frequency the lowest. Then, the function checks if placing the task there could
137save energy compared to leaving it on prev_cpu, i.e. the CPU where the task ran
138in its previous activation.
139
140find_energy_efficient_cpu() uses compute_energy() to estimate what will be the
141energy consumed by the system if the waking task was migrated. compute_energy()
142looks at the current utilization landscape of the CPUs and adjusts it to
143'simulate' the task migration. The EM framework provides the em_pd_energy() API
144which computes the expected energy consumption of each performance domain for
145the given utilization landscape.
146
147An example of energy-optimized task placement decision is detailed below.
148
149Example 2.
150 Let us consider a (fake) platform with 2 independent performance domains
151 composed of two CPUs each. CPU0 and CPU1 are little CPUs; CPU2 and CPU3
152 are big.
153
154 The scheduler must decide where to place a task P whose util_avg = 200
155 and prev_cpu = 0.
156
157 The current utilization landscape of the CPUs is depicted on the graph
158 below. CPUs 0-3 have a util_avg of 400, 100, 600 and 500 respectively
159 Each performance domain has three Operating Performance Points (OPPs).
160 The CPU capacity and power cost associated with each OPP is listed in
161 the Energy Model table. The util_avg of P is shown on the figures
162 below as 'PP'.
163
164 CPU util.
165 1024 - - - - - - - Energy Model
166 +-----------+-------------+
167 | Little | Big |
168 768 ============= +-----+-----+------+------+
169 | Cap | Pwr | Cap | Pwr |
170 +-----+-----+------+------+
171 512 =========== - ##- - - - - | 170 | 50 | 512 | 400 |
172 ## ## | 341 | 150 | 768 | 800 |
173 341 -PP - - - - ## ## | 512 | 300 | 1024 | 1700 |
174 PP ## ## +-----+-----+------+------+
175 170 -## - - - - ## ##
176 ## ## ## ##
177 ------------ -------------
178 CPU0 CPU1 CPU2 CPU3
179
180 Current OPP: ===== Other OPP: - - - util_avg (100 each): ##
181
182
183 find_energy_efficient_cpu() will first look for the CPUs with the
184 maximum spare capacity in the two performance domains. In this example,
185 CPU1 and CPU3. Then it will estimate the energy of the system if P was
186 placed on either of them, and check if that would save some energy
187 compared to leaving P on CPU0. EAS assumes that OPPs follow utilization
188 (which is coherent with the behaviour of the schedutil CPUFreq
189 governor, see Section 6. for more details on this topic).
190
191 Case 1. P is migrated to CPU1
192 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
193
194 1024 - - - - - - -
195
196 Energy calculation:
197 768 ============= * CPU0: 200 / 341 * 150 = 88
198 * CPU1: 300 / 341 * 150 = 131
199 * CPU2: 600 / 768 * 800 = 625
200 512 - - - - - - - ##- - - - - * CPU3: 500 / 768 * 800 = 520
201 ## ## => total_energy = 1364
202 341 =========== ## ##
203 PP ## ##
204 170 -## - - PP- ## ##
205 ## ## ## ##
206 ------------ -------------
207 CPU0 CPU1 CPU2 CPU3
208
209
210 Case 2. P is migrated to CPU3
211 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
212
213 1024 - - - - - - -
214
215 Energy calculation:
216 768 ============= * CPU0: 200 / 341 * 150 = 88
217 * CPU1: 100 / 341 * 150 = 43
218 PP * CPU2: 600 / 768 * 800 = 625
219 512 - - - - - - - ##- - -PP - * CPU3: 700 / 768 * 800 = 729
220 ## ## => total_energy = 1485
221 341 =========== ## ##
222 ## ##
223 170 -## - - - - ## ##
224 ## ## ## ##
225 ------------ -------------
226 CPU0 CPU1 CPU2 CPU3
227
228
229 Case 3. P stays on prev_cpu / CPU 0
230 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
231
232 1024 - - - - - - -
233
234 Energy calculation:
235 768 ============= * CPU0: 400 / 512 * 300 = 234
236 * CPU1: 100 / 512 * 300 = 58
237 * CPU2: 600 / 768 * 800 = 625
238 512 =========== - ##- - - - - * CPU3: 500 / 768 * 800 = 520
239 ## ## => total_energy = 1437
240 341 -PP - - - - ## ##
241 PP ## ##
242 170 -## - - - - ## ##
243 ## ## ## ##
244 ------------ -------------
245 CPU0 CPU1 CPU2 CPU3
246
247
248 From these calculations, Case 1 has the lowest total energy. So CPU 1
249 is the best candidate from an energy-efficiency standpoint.
250
251Big CPUs are generally more power hungry than the little ones and are thus used
252mainly when a task doesn't fit the littles. However, little CPUs aren't always
253necessarily more energy-efficient than big CPUs. For some systems, the high OPPs
254of the little CPUs can be less energy-efficient than the lowest OPPs of the
255bigs, for example. So, if the little CPUs happen to have enough utilization at
256a specific point in time, a small task waking up at that moment could be better
257of executing on the big side in order to save energy, even though it would fit
258on the little side.
259
260And even in the case where all OPPs of the big CPUs are less energy-efficient
261than those of the little, using the big CPUs for a small task might still, under
262specific conditions, save energy. Indeed, placing a task on a little CPU can
263result in raising the OPP of the entire performance domain, and that will
264increase the cost of the tasks already running there. If the waking task is
265placed on a big CPU, its own execution cost might be higher than if it was
266running on a little, but it won't impact the other tasks of the little CPUs
267which will keep running at a lower OPP. So, when considering the total energy
268consumed by CPUs, the extra cost of running that one task on a big core can be
269smaller than the cost of raising the OPP on the little CPUs for all the other
270tasks.
271
272The examples above would be nearly impossible to get right in a generic way, and
273for all platforms, without knowing the cost of running at different OPPs on all
274CPUs of the system. Thanks to its EM-based design, EAS should cope with them
275correctly without too many troubles. However, in order to ensure a minimal
276impact on throughput for high-utilization scenarios, EAS also implements another
277mechanism called 'over-utilization'.
278
279
2805. Over-utilization
281-------------------
282
283From a general standpoint, the use-cases where EAS can help the most are those
284involving a light/medium CPU utilization. Whenever long CPU-bound tasks are
285being run, they will require all of the available CPU capacity, and there isn't
286much that can be done by the scheduler to save energy without severely harming
287throughput. In order to avoid hurting performance with EAS, CPUs are flagged as
288'over-utilized' as soon as they are used at more than 80% of their compute
289capacity. As long as no CPUs are over-utilized in a root domain, load balancing
290is disabled and EAS overrides the wake-up balancing code. EAS is likely to load
291the most energy efficient CPUs of the system more than the others if that can be
292done without harming throughput. So, the load-balancer is disabled to prevent
293it from breaking the energy-efficient task placement found by EAS. It is safe to
294do so when the system isn't overutilized since being below the 80% tipping point
295implies that:
296
297 a. there is some idle time on all CPUs, so the utilization signals used by
298 EAS are likely to accurately represent the 'size' of the various tasks
299 in the system;
300 b. all tasks should already be provided with enough CPU capacity,
301 regardless of their nice values;
302 c. since there is spare capacity all tasks must be blocking/sleeping
303 regularly and balancing at wake-up is sufficient.
304
305As soon as one CPU goes above the 80% tipping point, at least one of the three
306assumptions above becomes incorrect. In this scenario, the 'overutilized' flag
307is raised for the entire root domain, EAS is disabled, and the load-balancer is
308re-enabled. By doing so, the scheduler falls back onto load-based algorithms for
309wake-up and load balance under CPU-bound conditions. This provides a better
310respect of the nice values of tasks.
311
312Since the notion of overutilization largely relies on detecting whether or not
313there is some idle time in the system, the CPU capacity 'stolen' by higher
314(than CFS) scheduling classes (as well as IRQ) must be taken into account. As
315such, the detection of overutilization accounts for the capacity used not only
316by CFS tasks, but also by the other scheduling classes and IRQ.
317
318
3196. Dependencies and requirements for EAS
320----------------------------------------
321
322Energy Aware Scheduling depends on the CPUs of the system having specific
323hardware properties and on other features of the kernel being enabled. This
324section lists these dependencies and provides hints as to how they can be met.
325
326
327 6.1 - Asymmetric CPU topology
328
329As mentioned in the introduction, EAS is only supported on platforms with
330asymmetric CPU topologies for now. This requirement is checked at run-time by
331looking for the presence of the SD_ASYM_CPUCAPACITY flag when the scheduling
332domains are built.
333
334The flag is set/cleared automatically by the scheduler topology code whenever
335there are CPUs with different capacities in a root domain. The capacities of
336CPUs are provided by arch-specific code through the arch_scale_cpu_capacity()
337callback. As an example, arm and arm64 share an implementation of this callback
338which uses a combination of CPUFreq data and device-tree bindings to compute the
339capacity of CPUs (see drivers/base/arch_topology.c for more details).
340
341So, in order to use EAS on your platform your architecture must implement the
342arch_scale_cpu_capacity() callback, and some of the CPUs must have a lower
343capacity than others.
344
345Please note that EAS is not fundamentally incompatible with SMP, but no
346significant savings on SMP platforms have been observed yet. This restriction
347could be amended in the future if proven otherwise.
348
349
350 6.2 - Energy Model presence
351
352EAS uses the EM of a platform to estimate the impact of scheduling decisions on
353energy. So, your platform must provide power cost tables to the EM framework in
354order to make EAS start. To do so, please refer to documentation of the
355independent EM framework in Documentation/power/energy-model.txt.
356
357Please also note that the scheduling domains need to be re-built after the
358EM has been registered in order to start EAS.
359
360
361 6.3 - Energy Model complexity
362
363The task wake-up path is very latency-sensitive. When the EM of a platform is
364too complex (too many CPUs, too many performance domains, too many performance
365states, ...), the cost of using it in the wake-up path can become prohibitive.
366The energy-aware wake-up algorithm has a complexity of:
367
368 C = Nd * (Nc + Ns)
369
370with: Nd the number of performance domains; Nc the number of CPUs; and Ns the
371total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8).
372
373A complexity check is performed at the root domain level, when scheduling
374domains are built. EAS will not start on a root domain if its C happens to be
375higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the
376time of writing).
377
378If you really want to use EAS but the complexity of your platform's Energy
379Model is too high to be used with a single root domain, you're left with only
380two possible options:
381
382 1. split your system into separate, smaller, root domains using exclusive
383 cpusets and enable EAS locally on each of them. This option has the
384 benefit to work out of the box but the drawback of preventing load
385 balance between root domains, which can result in an unbalanced system
386 overall;
387 2. submit patches to reduce the complexity of the EAS wake-up algorithm,
388 hence enabling it to cope with larger EMs in reasonable time.
389
390
391 6.4 - Schedutil governor
392
393EAS tries to predict at which OPP will the CPUs be running in the close future
394in order to estimate their energy consumption. To do so, it is assumed that OPPs
395of CPUs follow their utilization.
396
397Although it is very difficult to provide hard guarantees regarding the accuracy
398of this assumption in practice (because the hardware might not do what it is
399told to do, for example), schedutil as opposed to other CPUFreq governors at
400least _requests_ frequencies calculated using the utilization signals.
401Consequently, the only sane governor to use together with EAS is schedutil,
402because it is the only one providing some degree of consistency between
403frequency requests and energy predictions.
404
405Using EAS with any other governor than schedutil is not supported.
406
407
408 6.5 Scale-invariant utilization signals
409
410In order to make accurate predictions across CPUs and for all performance
411states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can
412be obtained using the architecture-defined arch_scale{cpu,freq}_capacity()
413callbacks.
414
415Using EAS on a platform that doesn't implement these two callbacks is not
416supported.
417
418
419 6.6 Multithreading (SMT)
420
421EAS in its current form is SMT unaware and is not able to leverage
422multithreaded hardware to save energy. EAS considers threads as independent
423CPUs, which can actually be counter-productive for both performance and energy.
424
425EAS on SMT is not supported.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index c0527d8a468a..379063e58326 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -79,6 +79,7 @@ show up in /proc/sys/kernel:
79- reboot-cmd [ SPARC only ] 79- reboot-cmd [ SPARC only ]
80- rtsig-max 80- rtsig-max
81- rtsig-nr 81- rtsig-nr
82- sched_energy_aware
82- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst 83- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst
83- sem 84- sem
84- sem_next_id [ sysv ipc ] 85- sem_next_id [ sysv ipc ]
@@ -890,6 +891,17 @@ rtsig-nr shows the number of RT signals currently queued.
890 891
891============================================================== 892==============================================================
892 893
894sched_energy_aware:
895
896Enables/disables Energy Aware Scheduling (EAS). EAS starts
897automatically on platforms where it can run (that is,
898platforms with asymmetric CPU topologies and having an Energy
899Model available). If your platform happens to meet the
900requirements for EAS but you do not want to use it, change
901this value to 0.
902
903==============================================================
904
893sched_schedstats: 905sched_schedstats:
894 906
895Enables/disables scheduler statistics. Enabling this feature 907Enables/disables scheduler statistics. Enabling this feature
diff --git a/MAINTAINERS b/MAINTAINERS
index 5e5529b9ffc8..366362b16f34 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12280,14 +12280,6 @@ S: Maintained
12280F: drivers/net/ppp/pptp.c 12280F: drivers/net/ppp/pptp.c
12281W: http://sourceforge.net/projects/accel-pptp 12281W: http://sourceforge.net/projects/accel-pptp
12282 12282
12283PREEMPTIBLE KERNEL
12284M: Robert Love <rml@tech9.net>
12285L: kpreempt-tech@lists.sourceforge.net
12286W: https://www.kernel.org/pub/linux/kernel/people/rml/preempt-kernel
12287S: Supported
12288F: Documentation/preempt-locking.txt
12289F: include/linux/preempt.h
12290
12291PRINTK 12283PRINTK
12292M: Petr Mladek <pmladek@suse.com> 12284M: Petr Mladek <pmladek@suse.com>
12293M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> 12285M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
@@ -13525,6 +13517,7 @@ F: kernel/sched/
13525F: include/linux/sched.h 13517F: include/linux/sched.h
13526F: include/uapi/linux/sched.h 13518F: include/uapi/linux/sched.h
13527F: include/linux/wait.h 13519F: include/linux/wait.h
13520F: include/linux/preempt.h
13528 13521
13529SCR24X CHIP CARD INTERFACE DRIVER 13522SCR24X CHIP CARD INTERFACE DRIVER
13530M: Lubomir Rintel <lkundrak@v3.sk> 13523M: Lubomir Rintel <lkundrak@v3.sk>
diff --git a/fs/exec.c b/fs/exec.c
index bcf383730bea..74f3672146a7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1189,7 +1189,7 @@ no_thread_group:
1189 flush_itimer_signals(); 1189 flush_itimer_signals();
1190#endif 1190#endif
1191 1191
1192 if (atomic_read(&oldsighand->count) != 1) { 1192 if (refcount_read(&oldsighand->count) != 1) {
1193 struct sighand_struct *newsighand; 1193 struct sighand_struct *newsighand;
1194 /* 1194 /*
1195 * This ->sighand is shared with the CLONE_SIGHAND 1195 * This ->sighand is shared with the CLONE_SIGHAND
@@ -1199,7 +1199,7 @@ no_thread_group:
1199 if (!newsighand) 1199 if (!newsighand)
1200 return -ENOMEM; 1200 return -ENOMEM;
1201 1201
1202 atomic_set(&newsighand->count, 1); 1202 refcount_set(&newsighand->count, 1);
1203 memcpy(newsighand->action, oldsighand->action, 1203 memcpy(newsighand->action, oldsighand->action,
1204 sizeof(newsighand->action)); 1204 sizeof(newsighand->action));
1205 1205
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 0b63d68dedb2..f912872fbf91 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
64 else 64 else
65 bytes += kobjsize(current->files); 65 bytes += kobjsize(current->files);
66 66
67 if (current->sighand && atomic_read(&current->sighand->count) > 1) 67 if (current->sighand && refcount_read(&current->sighand->count) > 1)
68 sbytes += kobjsize(current->sighand); 68 sbytes += kobjsize(current->sighand);
69 else 69 else
70 bytes += kobjsize(current->sighand); 70 bytes += kobjsize(current->sighand);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a7083a45a26c..6049baa5b8bc 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -13,6 +13,7 @@
13#include <linux/securebits.h> 13#include <linux/securebits.h>
14#include <linux/seqlock.h> 14#include <linux/seqlock.h>
15#include <linux/rbtree.h> 15#include <linux/rbtree.h>
16#include <linux/refcount.h>
16#include <linux/sched/autogroup.h> 17#include <linux/sched/autogroup.h>
17#include <net/net_namespace.h> 18#include <net/net_namespace.h>
18#include <linux/sched/rt.h> 19#include <linux/sched/rt.h>
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 1577a2d56e9d..2c89e60bc752 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -86,7 +86,7 @@ enum {
86 86
87struct kthread_worker { 87struct kthread_worker {
88 unsigned int flags; 88 unsigned int flags;
89 spinlock_t lock; 89 raw_spinlock_t lock;
90 struct list_head work_list; 90 struct list_head work_list;
91 struct list_head delayed_work_list; 91 struct list_head delayed_work_list;
92 struct task_struct *task; 92 struct task_struct *task;
@@ -107,7 +107,7 @@ struct kthread_delayed_work {
107}; 107};
108 108
109#define KTHREAD_WORKER_INIT(worker) { \ 109#define KTHREAD_WORKER_INIT(worker) { \
110 .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \ 110 .lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock), \
111 .work_list = LIST_HEAD_INIT((worker).work_list), \ 111 .work_list = LIST_HEAD_INIT((worker).work_list), \
112 .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ 112 .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\
113 } 113 }
@@ -165,9 +165,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
165#define kthread_init_delayed_work(dwork, fn) \ 165#define kthread_init_delayed_work(dwork, fn) \
166 do { \ 166 do { \
167 kthread_init_work(&(dwork)->work, (fn)); \ 167 kthread_init_work(&(dwork)->work, (fn)); \
168 __init_timer(&(dwork)->timer, \ 168 timer_setup(&(dwork)->timer, \
169 kthread_delayed_work_timer_fn, \ 169 kthread_delayed_work_timer_fn, 0); \
170 TIMER_IRQSAFE); \
171 } while (0) 170 } while (0)
172 171
173int kthread_worker_fn(void *worker_ptr); 172int kthread_worker_fn(void *worker_ptr);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89ddece0b003..903ef29b62c3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -21,6 +21,7 @@
21#include <linux/seccomp.h> 21#include <linux/seccomp.h>
22#include <linux/nodemask.h> 22#include <linux/nodemask.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/refcount.h>
24#include <linux/resource.h> 25#include <linux/resource.h>
25#include <linux/latencytop.h> 26#include <linux/latencytop.h>
26#include <linux/sched/prio.h> 27#include <linux/sched/prio.h>
@@ -356,12 +357,6 @@ struct util_est {
356 * For cfs_rq, it is the aggregated load_avg of all runnable and 357 * For cfs_rq, it is the aggregated load_avg of all runnable and
357 * blocked sched_entities. 358 * blocked sched_entities.
358 * 359 *
359 * load_avg may also take frequency scaling into account:
360 *
361 * load_avg = runnable% * scale_load_down(load) * freq%
362 *
363 * where freq% is the CPU frequency normalized to the highest frequency.
364 *
365 * [util_avg definition] 360 * [util_avg definition]
366 * 361 *
367 * util_avg = running% * SCHED_CAPACITY_SCALE 362 * util_avg = running% * SCHED_CAPACITY_SCALE
@@ -370,17 +365,14 @@ struct util_est {
370 * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable 365 * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
371 * and blocked sched_entities. 366 * and blocked sched_entities.
372 * 367 *
373 * util_avg may also factor frequency scaling and CPU capacity scaling: 368 * load_avg and util_avg don't direcly factor frequency scaling and CPU
374 * 369 * capacity scaling. The scaling is done through the rq_clock_pelt that
375 * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% 370 * is used for computing those signals (see update_rq_clock_pelt())
376 *
377 * where freq% is the same as above, and capacity% is the CPU capacity
378 * normalized to the greatest capacity (due to uarch differences, etc).
379 * 371 *
380 * N.B., the above ratios (runnable%, running%, freq%, and capacity%) 372 * N.B., the above ratios (runnable% and running%) themselves are in the
381 * themselves are in the range of [0, 1]. To do fixed point arithmetics, 373 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
382 * we therefore scale them to as large a range as necessary. This is for 374 * to as large a range as necessary. This is for example reflected by
383 * example reflected by util_avg's SCHED_CAPACITY_SCALE. 375 * util_avg's SCHED_CAPACITY_SCALE.
384 * 376 *
385 * [Overflow issue] 377 * [Overflow issue]
386 * 378 *
@@ -607,7 +599,7 @@ struct task_struct {
607 randomized_struct_fields_start 599 randomized_struct_fields_start
608 600
609 void *stack; 601 void *stack;
610 atomic_t usage; 602 refcount_t usage;
611 /* Per task flags (PF_*), defined further below: */ 603 /* Per task flags (PF_*), defined further below: */
612 unsigned int flags; 604 unsigned int flags;
613 unsigned int ptrace; 605 unsigned int ptrace;
@@ -1187,7 +1179,7 @@ struct task_struct {
1187#endif 1179#endif
1188#ifdef CONFIG_THREAD_INFO_IN_TASK 1180#ifdef CONFIG_THREAD_INFO_IN_TASK
1189 /* A live task holds one reference: */ 1181 /* A live task holds one reference: */
1190 atomic_t stack_refcount; 1182 refcount_t stack_refcount;
1191#endif 1183#endif
1192#ifdef CONFIG_LIVEPATCH 1184#ifdef CONFIG_LIVEPATCH
1193 int patch_state; 1185 int patch_state;
@@ -1403,7 +1395,6 @@ extern struct pid *cad_pid;
1403#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1395#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
1404#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1396#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
1405#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1397#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1406#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1407#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1398#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
1408#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ 1399#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
1409 1400
@@ -1753,9 +1744,9 @@ static __always_inline bool need_resched(void)
1753static inline unsigned int task_cpu(const struct task_struct *p) 1744static inline unsigned int task_cpu(const struct task_struct *p)
1754{ 1745{
1755#ifdef CONFIG_THREAD_INFO_IN_TASK 1746#ifdef CONFIG_THREAD_INFO_IN_TASK
1756 return p->cpu; 1747 return READ_ONCE(p->cpu);
1757#else 1748#else
1758 return task_thread_info(p)->cpu; 1749 return READ_ONCE(task_thread_info(p)->cpu);
1759#endif 1750#endif
1760} 1751}
1761 1752
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 13789d10a50e..ae5655197698 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -8,13 +8,14 @@
8#include <linux/sched/jobctl.h> 8#include <linux/sched/jobctl.h>
9#include <linux/sched/task.h> 9#include <linux/sched/task.h>
10#include <linux/cred.h> 10#include <linux/cred.h>
11#include <linux/refcount.h>
11 12
12/* 13/*
13 * Types defining task->signal and task->sighand and APIs using them: 14 * Types defining task->signal and task->sighand and APIs using them:
14 */ 15 */
15 16
16struct sighand_struct { 17struct sighand_struct {
17 atomic_t count; 18 refcount_t count;
18 struct k_sigaction action[_NSIG]; 19 struct k_sigaction action[_NSIG];
19 spinlock_t siglock; 20 spinlock_t siglock;
20 wait_queue_head_t signalfd_wqh; 21 wait_queue_head_t signalfd_wqh;
@@ -82,7 +83,7 @@ struct multiprocess_signals {
82 * the locking of signal_struct. 83 * the locking of signal_struct.
83 */ 84 */
84struct signal_struct { 85struct signal_struct {
85 atomic_t sigcnt; 86 refcount_t sigcnt;
86 atomic_t live; 87 atomic_t live;
87 int nr_threads; 88 int nr_threads;
88 struct list_head thread_head; 89 struct list_head thread_head;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index a9c32daeb9d8..99ce6d728df7 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write,
83 void __user *buffer, size_t *lenp, 83 void __user *buffer, size_t *lenp,
84 loff_t *ppos); 84 loff_t *ppos);
85 85
86#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
87extern unsigned int sysctl_sched_energy_aware;
88extern int sched_energy_aware_handler(struct ctl_table *table, int write,
89 void __user *buffer, size_t *lenp,
90 loff_t *ppos);
91#endif
92
86#endif /* _LINUX_SCHED_SYSCTL_H */ 93#endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 44c6f15800ff..2e97a2227045 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -88,13 +88,13 @@ extern void sched_exec(void);
88#define sched_exec() {} 88#define sched_exec() {}
89#endif 89#endif
90 90
91#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 91#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0)
92 92
93extern void __put_task_struct(struct task_struct *t); 93extern void __put_task_struct(struct task_struct *t);
94 94
95static inline void put_task_struct(struct task_struct *t) 95static inline void put_task_struct(struct task_struct *t)
96{ 96{
97 if (atomic_dec_and_test(&t->usage)) 97 if (refcount_dec_and_test(&t->usage))
98 __put_task_struct(t); 98 __put_task_struct(t);
99} 99}
100 100
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 6a841929073f..2413427e439c 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -61,7 +61,7 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
61#ifdef CONFIG_THREAD_INFO_IN_TASK 61#ifdef CONFIG_THREAD_INFO_IN_TASK
62static inline void *try_get_task_stack(struct task_struct *tsk) 62static inline void *try_get_task_stack(struct task_struct *tsk)
63{ 63{
64 return atomic_inc_not_zero(&tsk->stack_refcount) ? 64 return refcount_inc_not_zero(&tsk->stack_refcount) ?
65 task_stack_page(tsk) : NULL; 65 task_stack_page(tsk) : NULL;
66} 66}
67 67
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c31d3a47a47c..57c7ed3fe465 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -176,10 +176,10 @@ typedef int (*sched_domain_flags_f)(void);
176#define SDTL_OVERLAP 0x01 176#define SDTL_OVERLAP 0x01
177 177
178struct sd_data { 178struct sd_data {
179 struct sched_domain **__percpu sd; 179 struct sched_domain *__percpu *sd;
180 struct sched_domain_shared **__percpu sds; 180 struct sched_domain_shared *__percpu *sds;
181 struct sched_group **__percpu sg; 181 struct sched_group *__percpu *sg;
182 struct sched_group_capacity **__percpu sgc; 182 struct sched_group_capacity *__percpu *sgc;
183}; 183};
184 184
185struct sched_domain_topology_level { 185struct sched_domain_topology_level {
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ed7c122cb31f..5f3efabc36f4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -308,7 +308,7 @@ do { \
308 308
309#define __wait_event_freezable(wq_head, condition) \ 309#define __wait_event_freezable(wq_head, condition) \
310 ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ 310 ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \
311 schedule(); try_to_freeze()) 311 freezable_schedule())
312 312
313/** 313/**
314 * wait_event_freezable - sleep (or freeze) until a condition gets true 314 * wait_event_freezable - sleep (or freeze) until a condition gets true
@@ -367,7 +367,7 @@ do { \
367#define __wait_event_freezable_timeout(wq_head, condition, timeout) \ 367#define __wait_event_freezable_timeout(wq_head, condition, timeout) \
368 ___wait_event(wq_head, ___wait_cond_timeout(condition), \ 368 ___wait_event(wq_head, ___wait_cond_timeout(condition), \
369 TASK_INTERRUPTIBLE, 0, timeout, \ 369 TASK_INTERRUPTIBLE, 0, timeout, \
370 __ret = schedule_timeout(__ret); try_to_freeze()) 370 __ret = freezable_schedule_timeout(__ret))
371 371
372/* 372/*
373 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid 373 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
@@ -588,7 +588,7 @@ do { \
588 588
589#define __wait_event_freezable_exclusive(wq, condition) \ 589#define __wait_event_freezable_exclusive(wq, condition) \
590 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ 590 ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \
591 schedule(); try_to_freeze()) 591 freezable_schedule())
592 592
593#define wait_event_freezable_exclusive(wq, condition) \ 593#define wait_event_freezable_exclusive(wq, condition) \
594({ \ 594({ \
diff --git a/init/init_task.c b/init/init_task.c
index 5aebe3be4d7c..46dbf546264d 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -44,7 +44,7 @@ static struct signal_struct init_signals = {
44}; 44};
45 45
46static struct sighand_struct init_sighand = { 46static struct sighand_struct init_sighand = {
47 .count = ATOMIC_INIT(1), 47 .count = REFCOUNT_INIT(1),
48 .action = { { { .sa_handler = SIG_DFL, } }, }, 48 .action = { { { .sa_handler = SIG_DFL, } }, },
49 .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), 49 .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock),
50 .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), 50 .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
@@ -61,11 +61,11 @@ struct task_struct init_task
61= { 61= {
62#ifdef CONFIG_THREAD_INFO_IN_TASK 62#ifdef CONFIG_THREAD_INFO_IN_TASK
63 .thread_info = INIT_THREAD_INFO(init_task), 63 .thread_info = INIT_THREAD_INFO(init_task),
64 .stack_refcount = ATOMIC_INIT(1), 64 .stack_refcount = REFCOUNT_INIT(1),
65#endif 65#endif
66 .state = 0, 66 .state = 0,
67 .stack = init_stack, 67 .stack = init_stack,
68 .usage = ATOMIC_INIT(2), 68 .usage = REFCOUNT_INIT(2),
69 .flags = PF_KTHREAD, 69 .flags = PF_KTHREAD,
70 .prio = MAX_PRIO - 20, 70 .prio = MAX_PRIO - 20,
71 .static_prio = MAX_PRIO - 20, 71 .static_prio = MAX_PRIO - 20,
diff --git a/kernel/fork.c b/kernel/fork.c
index b69248e6f0e0..77059b211608 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -429,7 +429,7 @@ static void release_task_stack(struct task_struct *tsk)
429#ifdef CONFIG_THREAD_INFO_IN_TASK 429#ifdef CONFIG_THREAD_INFO_IN_TASK
430void put_task_stack(struct task_struct *tsk) 430void put_task_stack(struct task_struct *tsk)
431{ 431{
432 if (atomic_dec_and_test(&tsk->stack_refcount)) 432 if (refcount_dec_and_test(&tsk->stack_refcount))
433 release_task_stack(tsk); 433 release_task_stack(tsk);
434} 434}
435#endif 435#endif
@@ -447,7 +447,7 @@ void free_task(struct task_struct *tsk)
447 * If the task had a separate stack allocation, it should be gone 447 * If the task had a separate stack allocation, it should be gone
448 * by now. 448 * by now.
449 */ 449 */
450 WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); 450 WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
451#endif 451#endif
452 rt_mutex_debug_task_free(tsk); 452 rt_mutex_debug_task_free(tsk);
453 ftrace_graph_exit_task(tsk); 453 ftrace_graph_exit_task(tsk);
@@ -710,14 +710,14 @@ static inline void free_signal_struct(struct signal_struct *sig)
710 710
711static inline void put_signal_struct(struct signal_struct *sig) 711static inline void put_signal_struct(struct signal_struct *sig)
712{ 712{
713 if (atomic_dec_and_test(&sig->sigcnt)) 713 if (refcount_dec_and_test(&sig->sigcnt))
714 free_signal_struct(sig); 714 free_signal_struct(sig);
715} 715}
716 716
717void __put_task_struct(struct task_struct *tsk) 717void __put_task_struct(struct task_struct *tsk)
718{ 718{
719 WARN_ON(!tsk->exit_state); 719 WARN_ON(!tsk->exit_state);
720 WARN_ON(atomic_read(&tsk->usage)); 720 WARN_ON(refcount_read(&tsk->usage));
721 WARN_ON(tsk == current); 721 WARN_ON(tsk == current);
722 722
723 cgroup_free(tsk); 723 cgroup_free(tsk);
@@ -867,7 +867,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
867 tsk->stack_vm_area = stack_vm_area; 867 tsk->stack_vm_area = stack_vm_area;
868#endif 868#endif
869#ifdef CONFIG_THREAD_INFO_IN_TASK 869#ifdef CONFIG_THREAD_INFO_IN_TASK
870 atomic_set(&tsk->stack_refcount, 1); 870 refcount_set(&tsk->stack_refcount, 1);
871#endif 871#endif
872 872
873 if (err) 873 if (err)
@@ -896,7 +896,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
896 * One for us, one for whoever does the "release_task()" (usually 896 * One for us, one for whoever does the "release_task()" (usually
897 * parent) 897 * parent)
898 */ 898 */
899 atomic_set(&tsk->usage, 2); 899 refcount_set(&tsk->usage, 2);
900#ifdef CONFIG_BLK_DEV_IO_TRACE 900#ifdef CONFIG_BLK_DEV_IO_TRACE
901 tsk->btrace_seq = 0; 901 tsk->btrace_seq = 0;
902#endif 902#endif
@@ -1463,7 +1463,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1463 struct sighand_struct *sig; 1463 struct sighand_struct *sig;
1464 1464
1465 if (clone_flags & CLONE_SIGHAND) { 1465 if (clone_flags & CLONE_SIGHAND) {
1466 atomic_inc(&current->sighand->count); 1466 refcount_inc(&current->sighand->count);
1467 return 0; 1467 return 0;
1468 } 1468 }
1469 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); 1469 sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
@@ -1471,7 +1471,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1471 if (!sig) 1471 if (!sig)
1472 return -ENOMEM; 1472 return -ENOMEM;
1473 1473
1474 atomic_set(&sig->count, 1); 1474 refcount_set(&sig->count, 1);
1475 spin_lock_irq(&current->sighand->siglock); 1475 spin_lock_irq(&current->sighand->siglock);
1476 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 1476 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1477 spin_unlock_irq(&current->sighand->siglock); 1477 spin_unlock_irq(&current->sighand->siglock);
@@ -1480,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1480 1480
1481void __cleanup_sighand(struct sighand_struct *sighand) 1481void __cleanup_sighand(struct sighand_struct *sighand)
1482{ 1482{
1483 if (atomic_dec_and_test(&sighand->count)) { 1483 if (refcount_dec_and_test(&sighand->count)) {
1484 signalfd_cleanup(sighand); 1484 signalfd_cleanup(sighand);
1485 /* 1485 /*
1486 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it 1486 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
@@ -1527,7 +1527,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1527 1527
1528 sig->nr_threads = 1; 1528 sig->nr_threads = 1;
1529 atomic_set(&sig->live, 1); 1529 atomic_set(&sig->live, 1);
1530 atomic_set(&sig->sigcnt, 1); 1530 refcount_set(&sig->sigcnt, 1);
1531 1531
1532 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ 1532 /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
1533 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); 1533 sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -2082,7 +2082,7 @@ static __latent_entropy struct task_struct *copy_process(
2082 } else { 2082 } else {
2083 current->signal->nr_threads++; 2083 current->signal->nr_threads++;
2084 atomic_inc(&current->signal->live); 2084 atomic_inc(&current->signal->live);
2085 atomic_inc(&current->signal->sigcnt); 2085 refcount_inc(&current->signal->sigcnt);
2086 task_join_group_stop(p); 2086 task_join_group_stop(p);
2087 list_add_tail_rcu(&p->thread_group, 2087 list_add_tail_rcu(&p->thread_group,
2088 &p->group_leader->thread_group); 2088 &p->group_leader->thread_group);
@@ -2439,7 +2439,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
2439 return -EINVAL; 2439 return -EINVAL;
2440 } 2440 }
2441 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { 2441 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
2442 if (atomic_read(&current->sighand->count) > 1) 2442 if (refcount_read(&current->sighand->count) > 1)
2443 return -EINVAL; 2443 return -EINVAL;
2444 } 2444 }
2445 if (unshare_flags & CLONE_VM) { 2445 if (unshare_flags & CLONE_VM) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 65234c89d85b..9cf20cc5ebe3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -605,7 +605,7 @@ void __kthread_init_worker(struct kthread_worker *worker,
605 struct lock_class_key *key) 605 struct lock_class_key *key)
606{ 606{
607 memset(worker, 0, sizeof(struct kthread_worker)); 607 memset(worker, 0, sizeof(struct kthread_worker));
608 spin_lock_init(&worker->lock); 608 raw_spin_lock_init(&worker->lock);
609 lockdep_set_class_and_name(&worker->lock, key, name); 609 lockdep_set_class_and_name(&worker->lock, key, name);
610 INIT_LIST_HEAD(&worker->work_list); 610 INIT_LIST_HEAD(&worker->work_list);
611 INIT_LIST_HEAD(&worker->delayed_work_list); 611 INIT_LIST_HEAD(&worker->delayed_work_list);
@@ -647,21 +647,21 @@ repeat:
647 647
648 if (kthread_should_stop()) { 648 if (kthread_should_stop()) {
649 __set_current_state(TASK_RUNNING); 649 __set_current_state(TASK_RUNNING);
650 spin_lock_irq(&worker->lock); 650 raw_spin_lock_irq(&worker->lock);
651 worker->task = NULL; 651 worker->task = NULL;
652 spin_unlock_irq(&worker->lock); 652 raw_spin_unlock_irq(&worker->lock);
653 return 0; 653 return 0;
654 } 654 }
655 655
656 work = NULL; 656 work = NULL;
657 spin_lock_irq(&worker->lock); 657 raw_spin_lock_irq(&worker->lock);
658 if (!list_empty(&worker->work_list)) { 658 if (!list_empty(&worker->work_list)) {
659 work = list_first_entry(&worker->work_list, 659 work = list_first_entry(&worker->work_list,
660 struct kthread_work, node); 660 struct kthread_work, node);
661 list_del_init(&work->node); 661 list_del_init(&work->node);
662 } 662 }
663 worker->current_work = work; 663 worker->current_work = work;
664 spin_unlock_irq(&worker->lock); 664 raw_spin_unlock_irq(&worker->lock);
665 665
666 if (work) { 666 if (work) {
667 __set_current_state(TASK_RUNNING); 667 __set_current_state(TASK_RUNNING);
@@ -818,12 +818,12 @@ bool kthread_queue_work(struct kthread_worker *worker,
818 bool ret = false; 818 bool ret = false;
819 unsigned long flags; 819 unsigned long flags;
820 820
821 spin_lock_irqsave(&worker->lock, flags); 821 raw_spin_lock_irqsave(&worker->lock, flags);
822 if (!queuing_blocked(worker, work)) { 822 if (!queuing_blocked(worker, work)) {
823 kthread_insert_work(worker, work, &worker->work_list); 823 kthread_insert_work(worker, work, &worker->work_list);
824 ret = true; 824 ret = true;
825 } 825 }
826 spin_unlock_irqrestore(&worker->lock, flags); 826 raw_spin_unlock_irqrestore(&worker->lock, flags);
827 return ret; 827 return ret;
828} 828}
829EXPORT_SYMBOL_GPL(kthread_queue_work); 829EXPORT_SYMBOL_GPL(kthread_queue_work);
@@ -841,6 +841,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
841 struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); 841 struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
842 struct kthread_work *work = &dwork->work; 842 struct kthread_work *work = &dwork->work;
843 struct kthread_worker *worker = work->worker; 843 struct kthread_worker *worker = work->worker;
844 unsigned long flags;
844 845
845 /* 846 /*
846 * This might happen when a pending work is reinitialized. 847 * This might happen when a pending work is reinitialized.
@@ -849,7 +850,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
849 if (WARN_ON_ONCE(!worker)) 850 if (WARN_ON_ONCE(!worker))
850 return; 851 return;
851 852
852 spin_lock(&worker->lock); 853 raw_spin_lock_irqsave(&worker->lock, flags);
853 /* Work must not be used with >1 worker, see kthread_queue_work(). */ 854 /* Work must not be used with >1 worker, see kthread_queue_work(). */
854 WARN_ON_ONCE(work->worker != worker); 855 WARN_ON_ONCE(work->worker != worker);
855 856
@@ -858,7 +859,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
858 list_del_init(&work->node); 859 list_del_init(&work->node);
859 kthread_insert_work(worker, work, &worker->work_list); 860 kthread_insert_work(worker, work, &worker->work_list);
860 861
861 spin_unlock(&worker->lock); 862 raw_spin_unlock_irqrestore(&worker->lock, flags);
862} 863}
863EXPORT_SYMBOL(kthread_delayed_work_timer_fn); 864EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
864 865
@@ -914,14 +915,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker,
914 unsigned long flags; 915 unsigned long flags;
915 bool ret = false; 916 bool ret = false;
916 917
917 spin_lock_irqsave(&worker->lock, flags); 918 raw_spin_lock_irqsave(&worker->lock, flags);
918 919
919 if (!queuing_blocked(worker, work)) { 920 if (!queuing_blocked(worker, work)) {
920 __kthread_queue_delayed_work(worker, dwork, delay); 921 __kthread_queue_delayed_work(worker, dwork, delay);
921 ret = true; 922 ret = true;
922 } 923 }
923 924
924 spin_unlock_irqrestore(&worker->lock, flags); 925 raw_spin_unlock_irqrestore(&worker->lock, flags);
925 return ret; 926 return ret;
926} 927}
927EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); 928EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@ -957,7 +958,7 @@ void kthread_flush_work(struct kthread_work *work)
957 if (!worker) 958 if (!worker)
958 return; 959 return;
959 960
960 spin_lock_irq(&worker->lock); 961 raw_spin_lock_irq(&worker->lock);
961 /* Work must not be used with >1 worker, see kthread_queue_work(). */ 962 /* Work must not be used with >1 worker, see kthread_queue_work(). */
962 WARN_ON_ONCE(work->worker != worker); 963 WARN_ON_ONCE(work->worker != worker);
963 964
@@ -969,7 +970,7 @@ void kthread_flush_work(struct kthread_work *work)
969 else 970 else
970 noop = true; 971 noop = true;
971 972
972 spin_unlock_irq(&worker->lock); 973 raw_spin_unlock_irq(&worker->lock);
973 974
974 if (!noop) 975 if (!noop)
975 wait_for_completion(&fwork.done); 976 wait_for_completion(&fwork.done);
@@ -1002,9 +1003,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
1002 * any queuing is blocked by setting the canceling counter. 1003 * any queuing is blocked by setting the canceling counter.
1003 */ 1004 */
1004 work->canceling++; 1005 work->canceling++;
1005 spin_unlock_irqrestore(&worker->lock, *flags); 1006 raw_spin_unlock_irqrestore(&worker->lock, *flags);
1006 del_timer_sync(&dwork->timer); 1007 del_timer_sync(&dwork->timer);
1007 spin_lock_irqsave(&worker->lock, *flags); 1008 raw_spin_lock_irqsave(&worker->lock, *flags);
1008 work->canceling--; 1009 work->canceling--;
1009 } 1010 }
1010 1011
@@ -1051,7 +1052,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
1051 unsigned long flags; 1052 unsigned long flags;
1052 int ret = false; 1053 int ret = false;
1053 1054
1054 spin_lock_irqsave(&worker->lock, flags); 1055 raw_spin_lock_irqsave(&worker->lock, flags);
1055 1056
1056 /* Do not bother with canceling when never queued. */ 1057 /* Do not bother with canceling when never queued. */
1057 if (!work->worker) 1058 if (!work->worker)
@@ -1068,7 +1069,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
1068fast_queue: 1069fast_queue:
1069 __kthread_queue_delayed_work(worker, dwork, delay); 1070 __kthread_queue_delayed_work(worker, dwork, delay);
1070out: 1071out:
1071 spin_unlock_irqrestore(&worker->lock, flags); 1072 raw_spin_unlock_irqrestore(&worker->lock, flags);
1072 return ret; 1073 return ret;
1073} 1074}
1074EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); 1075EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@ -1082,7 +1083,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
1082 if (!worker) 1083 if (!worker)
1083 goto out; 1084 goto out;
1084 1085
1085 spin_lock_irqsave(&worker->lock, flags); 1086 raw_spin_lock_irqsave(&worker->lock, flags);
1086 /* Work must not be used with >1 worker, see kthread_queue_work(). */ 1087 /* Work must not be used with >1 worker, see kthread_queue_work(). */
1087 WARN_ON_ONCE(work->worker != worker); 1088 WARN_ON_ONCE(work->worker != worker);
1088 1089
@@ -1096,13 +1097,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
1096 * In the meantime, block any queuing by setting the canceling counter. 1097 * In the meantime, block any queuing by setting the canceling counter.
1097 */ 1098 */
1098 work->canceling++; 1099 work->canceling++;
1099 spin_unlock_irqrestore(&worker->lock, flags); 1100 raw_spin_unlock_irqrestore(&worker->lock, flags);
1100 kthread_flush_work(work); 1101 kthread_flush_work(work);
1101 spin_lock_irqsave(&worker->lock, flags); 1102 raw_spin_lock_irqsave(&worker->lock, flags);
1102 work->canceling--; 1103 work->canceling--;
1103 1104
1104out_fast: 1105out_fast:
1105 spin_unlock_irqrestore(&worker->lock, flags); 1106 raw_spin_unlock_irqrestore(&worker->lock, flags);
1106out: 1107out:
1107 return ret; 1108 return ret;
1108} 1109}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0002995570db..f3901b84d217 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
107 * [L] ->on_rq 107 * [L] ->on_rq
108 * RELEASE (rq->lock) 108 * RELEASE (rq->lock)
109 * 109 *
110 * If we observe the old CPU in task_rq_lock, the acquire of 110 * If we observe the old CPU in task_rq_lock(), the acquire of
111 * the old rq->lock will fully serialize against the stores. 111 * the old rq->lock will fully serialize against the stores.
112 * 112 *
113 * If we observe the new CPU in task_rq_lock, the acquire will 113 * If we observe the new CPU in task_rq_lock(), the address
114 * pair with the WMB to ensure we must then also see migrating. 114 * dependency headed by '[L] rq = task_rq()' and the acquire
115 * will pair with the WMB to ensure we then also see migrating.
115 */ 116 */
116 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { 117 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
117 rq_pin_lock(rq, rf); 118 rq_pin_lock(rq, rf);
@@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
180 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) 181 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
181 update_irq_load_avg(rq, irq_delta + steal); 182 update_irq_load_avg(rq, irq_delta + steal);
182#endif 183#endif
184 update_rq_clock_pelt(rq, delta);
183} 185}
184 186
185void update_rq_clock(struct rq *rq) 187void update_rq_clock(struct rq *rq)
@@ -956,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
956{ 958{
957 lockdep_assert_held(&rq->lock); 959 lockdep_assert_held(&rq->lock);
958 960
959 p->on_rq = TASK_ON_RQ_MIGRATING; 961 WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
960 dequeue_task(rq, p, DEQUEUE_NOCLOCK); 962 dequeue_task(rq, p, DEQUEUE_NOCLOCK);
961 set_task_cpu(p, new_cpu); 963 set_task_cpu(p, new_cpu);
962 rq_unlock(rq, rf); 964 rq_unlock(rq, rf);
@@ -2459,7 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
2459#endif 2461#endif
2460 rq = __task_rq_lock(p, &rf); 2462 rq = __task_rq_lock(p, &rf);
2461 update_rq_clock(rq); 2463 update_rq_clock(rq);
2462 post_init_entity_util_avg(&p->se); 2464 post_init_entity_util_avg(p);
2463 2465
2464 activate_task(rq, p, ENQUEUE_NOCLOCK); 2466 activate_task(rq, p, ENQUEUE_NOCLOCK);
2465 p->on_rq = TASK_ON_RQ_QUEUED; 2467 p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb8b7b5d745d..6a73e41a2016 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1767 deadline_queue_push_tasks(rq); 1767 deadline_queue_push_tasks(rq);
1768 1768
1769 if (rq->curr->sched_class != &dl_sched_class) 1769 if (rq->curr->sched_class != &dl_sched_class)
1770 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); 1770 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1771 1771
1772 return p; 1772 return p;
1773} 1773}
@@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1776{ 1776{
1777 update_curr_dl(rq); 1777 update_curr_dl(rq);
1778 1778
1779 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); 1779 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1780 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) 1780 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1781 enqueue_pushable_dl_task(rq, p); 1781 enqueue_pushable_dl_task(rq, p);
1782} 1782}
@@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1793{ 1793{
1794 update_curr_dl(rq); 1794 update_curr_dl(rq);
1795 1795
1796 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); 1796 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1797 /* 1797 /*
1798 * Even when we have runtime, update_curr_dl() might have resulted in us 1798 * Even when we have runtime, update_curr_dl() might have resulted in us
1799 * not being the leftmost task anymore. In that case NEED_RESCHED will 1799 * not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de3de997e245..8039d62ae36e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)
315{ 315{
316 static struct ctl_table *cpu_entries; 316 static struct ctl_table *cpu_entries;
317 static struct ctl_table **cpu_idx; 317 static struct ctl_table **cpu_idx;
318 static bool init_done = false;
318 char buf[32]; 319 char buf[32];
319 int i; 320 int i;
320 321
@@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)
344 if (!cpumask_available(sd_sysctl_cpus)) { 345 if (!cpumask_available(sd_sysctl_cpus)) {
345 if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) 346 if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
346 return; 347 return;
348 }
347 349
350 if (!init_done) {
351 init_done = true;
348 /* init to possible to not have holes in @cpu_entries */ 352 /* init to possible to not have holes in @cpu_entries */
349 cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); 353 cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
350 } 354 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 310d0637fe4b..8213ff6e365d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
248 */ 248 */
249 249
250#ifdef CONFIG_FAIR_GROUP_SCHED 250#ifdef CONFIG_FAIR_GROUP_SCHED
251
252/* cpu runqueue to which this cfs_rq is attached */
253static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
254{
255 return cfs_rq->rq;
256}
257
258static inline struct task_struct *task_of(struct sched_entity *se) 251static inline struct task_struct *task_of(struct sched_entity *se)
259{ 252{
260 SCHED_WARN_ON(!entity_is_task(se)); 253 SCHED_WARN_ON(!entity_is_task(se));
@@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
282 return grp->my_q; 275 return grp->my_q;
283} 276}
284 277
285static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 278static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
286{ 279{
287 if (!cfs_rq->on_list) { 280 struct rq *rq = rq_of(cfs_rq);
288 struct rq *rq = rq_of(cfs_rq); 281 int cpu = cpu_of(rq);
289 int cpu = cpu_of(rq); 282
283 if (cfs_rq->on_list)
284 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
285
286 cfs_rq->on_list = 1;
287
288 /*
289 * Ensure we either appear before our parent (if already
290 * enqueued) or force our parent to appear after us when it is
291 * enqueued. The fact that we always enqueue bottom-up
292 * reduces this to two cases and a special case for the root
293 * cfs_rq. Furthermore, it also means that we will always reset
294 * tmp_alone_branch either when the branch is connected
295 * to a tree or when we reach the top of the tree
296 */
297 if (cfs_rq->tg->parent &&
298 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
290 /* 299 /*
291 * Ensure we either appear before our parent (if already 300 * If parent is already on the list, we add the child
292 * enqueued) or force our parent to appear after us when it is 301 * just before. Thanks to circular linked property of
293 * enqueued. The fact that we always enqueue bottom-up 302 * the list, this means to put the child at the tail
294 * reduces this to two cases and a special case for the root 303 * of the list that starts by parent.
295 * cfs_rq. Furthermore, it also means that we will always reset
296 * tmp_alone_branch either when the branch is connected
297 * to a tree or when we reach the beg of the tree
298 */ 304 */
299 if (cfs_rq->tg->parent && 305 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
300 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { 306 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
301 /* 307 /*
302 * If parent is already on the list, we add the child 308 * The branch is now connected to its tree so we can
303 * just before. Thanks to circular linked property of 309 * reset tmp_alone_branch to the beginning of the
304 * the list, this means to put the child at the tail 310 * list.
305 * of the list that starts by parent. 311 */
306 */ 312 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
307 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, 313 return true;
308 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); 314 }
309 /*
310 * The branch is now connected to its tree so we can
311 * reset tmp_alone_branch to the beginning of the
312 * list.
313 */
314 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
315 } else if (!cfs_rq->tg->parent) {
316 /*
317 * cfs rq without parent should be put
318 * at the tail of the list.
319 */
320 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
321 &rq->leaf_cfs_rq_list);
322 /*
323 * We have reach the beg of a tree so we can reset
324 * tmp_alone_branch to the beginning of the list.
325 */
326 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
327 } else {
328 /*
329 * The parent has not already been added so we want to
330 * make sure that it will be put after us.
331 * tmp_alone_branch points to the beg of the branch
332 * where we will add parent.
333 */
334 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
335 rq->tmp_alone_branch);
336 /*
337 * update tmp_alone_branch to points to the new beg
338 * of the branch
339 */
340 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
341 }
342 315
343 cfs_rq->on_list = 1; 316 if (!cfs_rq->tg->parent) {
317 /*
318 * cfs rq without parent should be put
319 * at the tail of the list.
320 */
321 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
322 &rq->leaf_cfs_rq_list);
323 /*
324 * We have reach the top of a tree so we can reset
325 * tmp_alone_branch to the beginning of the list.
326 */
327 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
328 return true;
344 } 329 }
330
331 /*
332 * The parent has not already been added so we want to
333 * make sure that it will be put after us.
334 * tmp_alone_branch points to the begin of the branch
335 * where we will add parent.
336 */
337 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
338 /*
339 * update tmp_alone_branch to points to the new begin
340 * of the branch
341 */
342 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
343 return false;
345} 344}
346 345
347static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) 346static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
348{ 347{
349 if (cfs_rq->on_list) { 348 if (cfs_rq->on_list) {
349 struct rq *rq = rq_of(cfs_rq);
350
351 /*
352 * With cfs_rq being unthrottled/throttled during an enqueue,
353 * it can happen the tmp_alone_branch points the a leaf that
354 * we finally want to del. In this case, tmp_alone_branch moves
355 * to the prev element but it will point to rq->leaf_cfs_rq_list
356 * at the end of the enqueue.
357 */
358 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
359 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
360
350 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 361 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
351 cfs_rq->on_list = 0; 362 cfs_rq->on_list = 0;
352 } 363 }
353} 364}
354 365
355/* Iterate through all leaf cfs_rq's on a runqueue: */ 366static inline void assert_list_leaf_cfs_rq(struct rq *rq)
356#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 367{
357 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 368 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
369}
370
371/* Iterate thr' all leaf cfs_rq's on a runqueue */
372#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
373 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
374 leaf_cfs_rq_list)
358 375
359/* Do the two (enqueued) entities belong to the same group ? */ 376/* Do the two (enqueued) entities belong to the same group ? */
360static inline struct cfs_rq * 377static inline struct cfs_rq *
@@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
410 return container_of(se, struct task_struct, se); 427 return container_of(se, struct task_struct, se);
411} 428}
412 429
413static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
414{
415 return container_of(cfs_rq, struct rq, cfs);
416}
417
418
419#define for_each_sched_entity(se) \ 430#define for_each_sched_entity(se) \
420 for (; se; se = NULL) 431 for (; se; se = NULL)
421 432
@@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
438 return NULL; 449 return NULL;
439} 450}
440 451
441static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 452static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
442{ 453{
454 return true;
443} 455}
444 456
445static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) 457static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
446{ 458{
447} 459}
448 460
449#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 461static inline void assert_list_leaf_cfs_rq(struct rq *rq)
450 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 462{
463}
464
465#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
466 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
451 467
452static inline struct sched_entity *parent_entity(struct sched_entity *se) 468static inline struct sched_entity *parent_entity(struct sched_entity *se)
453{ 469{
@@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
686 return calc_delta_fair(sched_slice(cfs_rq, se), se); 702 return calc_delta_fair(sched_slice(cfs_rq, se), se);
687} 703}
688 704
689#ifdef CONFIG_SMP
690#include "pelt.h" 705#include "pelt.h"
691#include "sched-pelt.h" 706#ifdef CONFIG_SMP
692 707
693static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 708static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
694static unsigned long task_h_load(struct task_struct *p); 709static unsigned long task_h_load(struct task_struct *p);
@@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
744 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) 759 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
745 * if util_avg > util_avg_cap. 760 * if util_avg > util_avg_cap.
746 */ 761 */
747void post_init_entity_util_avg(struct sched_entity *se) 762void post_init_entity_util_avg(struct task_struct *p)
748{ 763{
764 struct sched_entity *se = &p->se;
749 struct cfs_rq *cfs_rq = cfs_rq_of(se); 765 struct cfs_rq *cfs_rq = cfs_rq_of(se);
750 struct sched_avg *sa = &se->avg; 766 struct sched_avg *sa = &se->avg;
751 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); 767 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
@@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
763 } 779 }
764 } 780 }
765 781
766 if (entity_is_task(se)) { 782 if (p->sched_class != &fair_sched_class) {
767 struct task_struct *p = task_of(se); 783 /*
768 if (p->sched_class != &fair_sched_class) { 784 * For !fair tasks do:
769 /* 785 *
770 * For !fair tasks do: 786 update_cfs_rq_load_avg(now, cfs_rq);
771 * 787 attach_entity_load_avg(cfs_rq, se, 0);
772 update_cfs_rq_load_avg(now, cfs_rq); 788 switched_from_fair(rq, p);
773 attach_entity_load_avg(cfs_rq, se, 0); 789 *
774 switched_from_fair(rq, p); 790 * such that the next switched_to_fair() has the
775 * 791 * expected state.
776 * such that the next switched_to_fair() has the 792 */
777 * expected state. 793 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
778 */ 794 return;
779 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
780 return;
781 }
782 } 795 }
783 796
784 attach_entity_cfs_rq(se); 797 attach_entity_cfs_rq(se);
@@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
788void init_entity_runnable_average(struct sched_entity *se) 801void init_entity_runnable_average(struct sched_entity *se)
789{ 802{
790} 803}
791void post_init_entity_util_avg(struct sched_entity *se) 804void post_init_entity_util_avg(struct task_struct *p)
792{ 805{
793} 806}
794static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 807static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
1035unsigned int sysctl_numa_balancing_scan_delay = 1000; 1048unsigned int sysctl_numa_balancing_scan_delay = 1000;
1036 1049
1037struct numa_group { 1050struct numa_group {
1038 atomic_t refcount; 1051 refcount_t refcount;
1039 1052
1040 spinlock_t lock; /* nr_tasks, tasks */ 1053 spinlock_t lock; /* nr_tasks, tasks */
1041 int nr_tasks; 1054 int nr_tasks;
@@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p)
1104 unsigned long shared = group_faults_shared(ng); 1117 unsigned long shared = group_faults_shared(ng);
1105 unsigned long private = group_faults_priv(ng); 1118 unsigned long private = group_faults_priv(ng);
1106 1119
1107 period *= atomic_read(&ng->refcount); 1120 period *= refcount_read(&ng->refcount);
1108 period *= shared + 1; 1121 period *= shared + 1;
1109 period /= private + shared + 1; 1122 period /= private + shared + 1;
1110 } 1123 }
@@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p)
1127 unsigned long private = group_faults_priv(ng); 1140 unsigned long private = group_faults_priv(ng);
1128 unsigned long period = smax; 1141 unsigned long period = smax;
1129 1142
1130 period *= atomic_read(&ng->refcount); 1143 period *= refcount_read(&ng->refcount);
1131 period *= shared + 1; 1144 period *= shared + 1;
1132 period /= private + shared + 1; 1145 period /= private + shared + 1;
1133 1146
@@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p)
2203 2216
2204static inline int get_numa_group(struct numa_group *grp) 2217static inline int get_numa_group(struct numa_group *grp)
2205{ 2218{
2206 return atomic_inc_not_zero(&grp->refcount); 2219 return refcount_inc_not_zero(&grp->refcount);
2207} 2220}
2208 2221
2209static inline void put_numa_group(struct numa_group *grp) 2222static inline void put_numa_group(struct numa_group *grp)
2210{ 2223{
2211 if (atomic_dec_and_test(&grp->refcount)) 2224 if (refcount_dec_and_test(&grp->refcount))
2212 kfree_rcu(grp, rcu); 2225 kfree_rcu(grp, rcu);
2213} 2226}
2214 2227
@@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2229 if (!grp) 2242 if (!grp)
2230 return; 2243 return;
2231 2244
2232 atomic_set(&grp->refcount, 1); 2245 refcount_set(&grp->refcount, 1);
2233 grp->active_nodes = 1; 2246 grp->active_nodes = 1;
2234 grp->max_faults_cpu = 0; 2247 grp->max_faults_cpu = 0;
2235 spin_lock_init(&grp->lock); 2248 spin_lock_init(&grp->lock);
@@ -3122,7 +3135,7 @@ void set_task_rq_fair(struct sched_entity *se,
3122 p_last_update_time = prev->avg.last_update_time; 3135 p_last_update_time = prev->avg.last_update_time;
3123 n_last_update_time = next->avg.last_update_time; 3136 n_last_update_time = next->avg.last_update_time;
3124#endif 3137#endif
3125 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); 3138 __update_load_avg_blocked_se(p_last_update_time, se);
3126 se->avg.last_update_time = n_last_update_time; 3139 se->avg.last_update_time = n_last_update_time;
3127} 3140}
3128 3141
@@ -3257,11 +3270,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3257 3270
3258 /* 3271 /*
3259 * runnable_sum can't be lower than running_sum 3272 * runnable_sum can't be lower than running_sum
3260 * As running sum is scale with CPU capacity wehreas the runnable sum 3273 * Rescale running sum to be in the same range as runnable sum
3261 * is not we rescale running_sum 1st 3274 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
3275 * runnable_sum is in [0 : LOAD_AVG_MAX]
3262 */ 3276 */
3263 running_sum = se->avg.util_sum / 3277 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3264 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3265 runnable_sum = max(runnable_sum, running_sum); 3278 runnable_sum = max(runnable_sum, running_sum);
3266 3279
3267 load_sum = (s64)se_weight(se) * runnable_sum; 3280 load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3364,7 +3377,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
3364 3377
3365/** 3378/**
3366 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages 3379 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3367 * @now: current time, as per cfs_rq_clock_task() 3380 * @now: current time, as per cfs_rq_clock_pelt()
3368 * @cfs_rq: cfs_rq to update 3381 * @cfs_rq: cfs_rq to update
3369 * 3382 *
3370 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) 3383 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3409,7 +3422,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3409 decayed = 1; 3422 decayed = 1;
3410 } 3423 }
3411 3424
3412 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); 3425 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3413 3426
3414#ifndef CONFIG_64BIT 3427#ifndef CONFIG_64BIT
3415 smp_wmb(); 3428 smp_wmb();
@@ -3499,9 +3512,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3499/* Update task and its cfs_rq load average */ 3512/* Update task and its cfs_rq load average */
3500static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 3513static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3501{ 3514{
3502 u64 now = cfs_rq_clock_task(cfs_rq); 3515 u64 now = cfs_rq_clock_pelt(cfs_rq);
3503 struct rq *rq = rq_of(cfs_rq);
3504 int cpu = cpu_of(rq);
3505 int decayed; 3516 int decayed;
3506 3517
3507 /* 3518 /*
@@ -3509,7 +3520,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3509 * track group sched_entity load average for task_h_load calc in migration 3520 * track group sched_entity load average for task_h_load calc in migration
3510 */ 3521 */
3511 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) 3522 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3512 __update_load_avg_se(now, cpu, cfs_rq, se); 3523 __update_load_avg_se(now, cfs_rq, se);
3513 3524
3514 decayed = update_cfs_rq_load_avg(now, cfs_rq); 3525 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3515 decayed |= propagate_entity_load_avg(se); 3526 decayed |= propagate_entity_load_avg(se);
@@ -3561,7 +3572,7 @@ void sync_entity_load_avg(struct sched_entity *se)
3561 u64 last_update_time; 3572 u64 last_update_time;
3562 3573
3563 last_update_time = cfs_rq_last_update_time(cfs_rq); 3574 last_update_time = cfs_rq_last_update_time(cfs_rq);
3564 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); 3575 __update_load_avg_blocked_se(last_update_time, se);
3565} 3576}
3566 3577
3567/* 3578/*
@@ -3577,10 +3588,6 @@ void remove_entity_load_avg(struct sched_entity *se)
3577 * tasks cannot exit without having gone through wake_up_new_task() -> 3588 * tasks cannot exit without having gone through wake_up_new_task() ->
3578 * post_init_entity_util_avg() which will have added things to the 3589 * post_init_entity_util_avg() which will have added things to the
3579 * cfs_rq, so we can remove unconditionally. 3590 * cfs_rq, so we can remove unconditionally.
3580 *
3581 * Similarly for groups, they will have passed through
3582 * post_init_entity_util_avg() before unregister_sched_fair_group()
3583 * calls this.
3584 */ 3591 */
3585 3592
3586 sync_entity_load_avg(se); 3593 sync_entity_load_avg(se);
@@ -3654,6 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3654{ 3661{
3655 long last_ewma_diff; 3662 long last_ewma_diff;
3656 struct util_est ue; 3663 struct util_est ue;
3664 int cpu;
3657 3665
3658 if (!sched_feat(UTIL_EST)) 3666 if (!sched_feat(UTIL_EST))
3659 return; 3667 return;
@@ -3688,6 +3696,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3688 return; 3696 return;
3689 3697
3690 /* 3698 /*
3699 * To avoid overestimation of actual task utilization, skip updates if
3700 * we cannot grant there is idle time in this CPU.
3701 */
3702 cpu = cpu_of(rq_of(cfs_rq));
3703 if (task_util(p) > capacity_orig_of(cpu))
3704 return;
3705
3706 /*
3691 * Update Task's estimated utilization 3707 * Update Task's estimated utilization
3692 * 3708 *
3693 * When *p completes an activation we can consolidate another sample 3709 * When *p completes an activation we can consolidate another sample
@@ -4429,6 +4445,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
4429 /* adjust cfs_rq_clock_task() */ 4445 /* adjust cfs_rq_clock_task() */
4430 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - 4446 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4431 cfs_rq->throttled_clock_task; 4447 cfs_rq->throttled_clock_task;
4448
4449 /* Add cfs_rq with already running entity in the list */
4450 if (cfs_rq->nr_running >= 1)
4451 list_add_leaf_cfs_rq(cfs_rq);
4432 } 4452 }
4433 4453
4434 return 0; 4454 return 0;
@@ -4440,8 +4460,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
4440 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 4460 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4441 4461
4442 /* group is entering throttled state, stop time */ 4462 /* group is entering throttled state, stop time */
4443 if (!cfs_rq->throttle_count) 4463 if (!cfs_rq->throttle_count) {
4444 cfs_rq->throttled_clock_task = rq_clock_task(rq); 4464 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4465 list_del_leaf_cfs_rq(cfs_rq);
4466 }
4445 cfs_rq->throttle_count++; 4467 cfs_rq->throttle_count++;
4446 4468
4447 return 0; 4469 return 0;
@@ -4544,6 +4566,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4544 break; 4566 break;
4545 } 4567 }
4546 4568
4569 assert_list_leaf_cfs_rq(rq);
4570
4547 if (!se) 4571 if (!se)
4548 add_nr_running(rq, task_delta); 4572 add_nr_running(rq, task_delta);
4549 4573
@@ -4565,7 +4589,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4565 struct rq *rq = rq_of(cfs_rq); 4589 struct rq *rq = rq_of(cfs_rq);
4566 struct rq_flags rf; 4590 struct rq_flags rf;
4567 4591
4568 rq_lock(rq, &rf); 4592 rq_lock_irqsave(rq, &rf);
4569 if (!cfs_rq_throttled(cfs_rq)) 4593 if (!cfs_rq_throttled(cfs_rq))
4570 goto next; 4594 goto next;
4571 4595
@@ -4582,7 +4606,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4582 unthrottle_cfs_rq(cfs_rq); 4606 unthrottle_cfs_rq(cfs_rq);
4583 4607
4584next: 4608next:
4585 rq_unlock(rq, &rf); 4609 rq_unlock_irqrestore(rq, &rf);
4586 4610
4587 if (!remaining) 4611 if (!remaining)
4588 break; 4612 break;
@@ -4598,7 +4622,7 @@ next:
4598 * period the timer is deactivated until scheduling resumes; cfs_b->idle is 4622 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4599 * used to track this state. 4623 * used to track this state.
4600 */ 4624 */
4601static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) 4625static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4602{ 4626{
4603 u64 runtime, runtime_expires; 4627 u64 runtime, runtime_expires;
4604 int throttled; 4628 int throttled;
@@ -4640,11 +4664,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4640 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { 4664 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4641 runtime = cfs_b->runtime; 4665 runtime = cfs_b->runtime;
4642 cfs_b->distribute_running = 1; 4666 cfs_b->distribute_running = 1;
4643 raw_spin_unlock(&cfs_b->lock); 4667 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4644 /* we can't nest cfs_b->lock while distributing bandwidth */ 4668 /* we can't nest cfs_b->lock while distributing bandwidth */
4645 runtime = distribute_cfs_runtime(cfs_b, runtime, 4669 runtime = distribute_cfs_runtime(cfs_b, runtime,
4646 runtime_expires); 4670 runtime_expires);
4647 raw_spin_lock(&cfs_b->lock); 4671 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4648 4672
4649 cfs_b->distribute_running = 0; 4673 cfs_b->distribute_running = 0;
4650 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 4674 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4753,17 +4777,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4753static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) 4777static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4754{ 4778{
4755 u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); 4779 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4780 unsigned long flags;
4756 u64 expires; 4781 u64 expires;
4757 4782
4758 /* confirm we're still not at a refresh boundary */ 4783 /* confirm we're still not at a refresh boundary */
4759 raw_spin_lock(&cfs_b->lock); 4784 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4760 if (cfs_b->distribute_running) { 4785 if (cfs_b->distribute_running) {
4761 raw_spin_unlock(&cfs_b->lock); 4786 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4762 return; 4787 return;
4763 } 4788 }
4764 4789
4765 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { 4790 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4766 raw_spin_unlock(&cfs_b->lock); 4791 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4767 return; 4792 return;
4768 } 4793 }
4769 4794
@@ -4774,18 +4799,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4774 if (runtime) 4799 if (runtime)
4775 cfs_b->distribute_running = 1; 4800 cfs_b->distribute_running = 1;
4776 4801
4777 raw_spin_unlock(&cfs_b->lock); 4802 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4778 4803
4779 if (!runtime) 4804 if (!runtime)
4780 return; 4805 return;
4781 4806
4782 runtime = distribute_cfs_runtime(cfs_b, runtime, expires); 4807 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4783 4808
4784 raw_spin_lock(&cfs_b->lock); 4809 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4785 if (expires == cfs_b->runtime_expires) 4810 if (expires == cfs_b->runtime_expires)
4786 lsub_positive(&cfs_b->runtime, runtime); 4811 lsub_positive(&cfs_b->runtime, runtime);
4787 cfs_b->distribute_running = 0; 4812 cfs_b->distribute_running = 0;
4788 raw_spin_unlock(&cfs_b->lock); 4813 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4789} 4814}
4790 4815
4791/* 4816/*
@@ -4863,20 +4888,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4863{ 4888{
4864 struct cfs_bandwidth *cfs_b = 4889 struct cfs_bandwidth *cfs_b =
4865 container_of(timer, struct cfs_bandwidth, period_timer); 4890 container_of(timer, struct cfs_bandwidth, period_timer);
4891 unsigned long flags;
4866 int overrun; 4892 int overrun;
4867 int idle = 0; 4893 int idle = 0;
4868 4894
4869 raw_spin_lock(&cfs_b->lock); 4895 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4870 for (;;) { 4896 for (;;) {
4871 overrun = hrtimer_forward_now(timer, cfs_b->period); 4897 overrun = hrtimer_forward_now(timer, cfs_b->period);
4872 if (!overrun) 4898 if (!overrun)
4873 break; 4899 break;
4874 4900
4875 idle = do_sched_cfs_period_timer(cfs_b, overrun); 4901 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4876 } 4902 }
4877 if (idle) 4903 if (idle)
4878 cfs_b->period_active = 0; 4904 cfs_b->period_active = 0;
4879 raw_spin_unlock(&cfs_b->lock); 4905 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4880 4906
4881 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 4907 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4882} 4908}
@@ -4986,6 +5012,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4986} 5012}
4987 5013
4988#else /* CONFIG_CFS_BANDWIDTH */ 5014#else /* CONFIG_CFS_BANDWIDTH */
5015
5016static inline bool cfs_bandwidth_used(void)
5017{
5018 return false;
5019}
5020
4989static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) 5021static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4990{ 5022{
4991 return rq_clock_task(rq_of(cfs_rq)); 5023 return rq_clock_task(rq_of(cfs_rq));
@@ -5177,6 +5209,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5177 5209
5178 } 5210 }
5179 5211
5212 if (cfs_bandwidth_used()) {
5213 /*
5214 * When bandwidth control is enabled; the cfs_rq_throttled()
5215 * breaks in the above iteration can result in incomplete
5216 * leaf list maintenance, resulting in triggering the assertion
5217 * below.
5218 */
5219 for_each_sched_entity(se) {
5220 cfs_rq = cfs_rq_of(se);
5221
5222 if (list_add_leaf_cfs_rq(cfs_rq))
5223 break;
5224 }
5225 }
5226
5227 assert_list_leaf_cfs_rq(rq);
5228
5180 hrtick_update(rq); 5229 hrtick_update(rq);
5181} 5230}
5182 5231
@@ -5556,11 +5605,6 @@ static unsigned long capacity_of(int cpu)
5556 return cpu_rq(cpu)->cpu_capacity; 5605 return cpu_rq(cpu)->cpu_capacity;
5557} 5606}
5558 5607
5559static unsigned long capacity_orig_of(int cpu)
5560{
5561 return cpu_rq(cpu)->cpu_capacity_orig;
5562}
5563
5564static unsigned long cpu_avg_load_per_task(int cpu) 5608static unsigned long cpu_avg_load_per_task(int cpu)
5565{ 5609{
5566 struct rq *rq = cpu_rq(cpu); 5610 struct rq *rq = cpu_rq(cpu);
@@ -6053,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
6053 bool idle = true; 6097 bool idle = true;
6054 6098
6055 for_each_cpu(cpu, cpu_smt_mask(core)) { 6099 for_each_cpu(cpu, cpu_smt_mask(core)) {
6056 cpumask_clear_cpu(cpu, cpus); 6100 __cpumask_clear_cpu(cpu, cpus);
6057 if (!available_idle_cpu(cpu)) 6101 if (!available_idle_cpu(cpu))
6058 idle = false; 6102 idle = false;
6059 } 6103 }
@@ -6073,7 +6117,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
6073/* 6117/*
6074 * Scan the local SMT mask for idle CPUs. 6118 * Scan the local SMT mask for idle CPUs.
6075 */ 6119 */
6076static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 6120static int select_idle_smt(struct task_struct *p, int target)
6077{ 6121{
6078 int cpu; 6122 int cpu;
6079 6123
@@ -6097,7 +6141,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
6097 return -1; 6141 return -1;
6098} 6142}
6099 6143
6100static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 6144static inline int select_idle_smt(struct task_struct *p, int target)
6101{ 6145{
6102 return -1; 6146 return -1;
6103} 6147}
@@ -6202,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6202 if ((unsigned)i < nr_cpumask_bits) 6246 if ((unsigned)i < nr_cpumask_bits)
6203 return i; 6247 return i;
6204 6248
6205 i = select_idle_smt(p, sd, target); 6249 i = select_idle_smt(p, target);
6206 if ((unsigned)i < nr_cpumask_bits) 6250 if ((unsigned)i < nr_cpumask_bits)
6207 return i; 6251 return i;
6208 6252
@@ -6608,7 +6652,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6608 if (sd_flag & SD_BALANCE_WAKE) { 6652 if (sd_flag & SD_BALANCE_WAKE) {
6609 record_wakee(p); 6653 record_wakee(p);
6610 6654
6611 if (static_branch_unlikely(&sched_energy_present)) { 6655 if (sched_energy_enabled()) {
6612 new_cpu = find_energy_efficient_cpu(p, prev_cpu); 6656 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6613 if (new_cpu >= 0) 6657 if (new_cpu >= 0)
6614 return new_cpu; 6658 return new_cpu;
@@ -7027,6 +7071,12 @@ idle:
7027 if (new_tasks > 0) 7071 if (new_tasks > 0)
7028 goto again; 7072 goto again;
7029 7073
7074 /*
7075 * rq is about to be idle, check if we need to update the
7076 * lost_idle_time of clock_pelt
7077 */
7078 update_idle_rq_clock_pelt(rq);
7079
7030 return NULL; 7080 return NULL;
7031} 7081}
7032 7082
@@ -7647,10 +7697,27 @@ static inline bool others_have_blocked(struct rq *rq)
7647 7697
7648#ifdef CONFIG_FAIR_GROUP_SCHED 7698#ifdef CONFIG_FAIR_GROUP_SCHED
7649 7699
7700static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7701{
7702 if (cfs_rq->load.weight)
7703 return false;
7704
7705 if (cfs_rq->avg.load_sum)
7706 return false;
7707
7708 if (cfs_rq->avg.util_sum)
7709 return false;
7710
7711 if (cfs_rq->avg.runnable_load_sum)
7712 return false;
7713
7714 return true;
7715}
7716
7650static void update_blocked_averages(int cpu) 7717static void update_blocked_averages(int cpu)
7651{ 7718{
7652 struct rq *rq = cpu_rq(cpu); 7719 struct rq *rq = cpu_rq(cpu);
7653 struct cfs_rq *cfs_rq; 7720 struct cfs_rq *cfs_rq, *pos;
7654 const struct sched_class *curr_class; 7721 const struct sched_class *curr_class;
7655 struct rq_flags rf; 7722 struct rq_flags rf;
7656 bool done = true; 7723 bool done = true;
@@ -7662,14 +7729,10 @@ static void update_blocked_averages(int cpu)
7662 * Iterates the task_group tree in a bottom up fashion, see 7729 * Iterates the task_group tree in a bottom up fashion, see
7663 * list_add_leaf_cfs_rq() for details. 7730 * list_add_leaf_cfs_rq() for details.
7664 */ 7731 */
7665 for_each_leaf_cfs_rq(rq, cfs_rq) { 7732 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7666 struct sched_entity *se; 7733 struct sched_entity *se;
7667 7734
7668 /* throttled entities do not contribute to load */ 7735 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
7669 if (throttled_hierarchy(cfs_rq))
7670 continue;
7671
7672 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
7673 update_tg_load_avg(cfs_rq, 0); 7736 update_tg_load_avg(cfs_rq, 0);
7674 7737
7675 /* Propagate pending load changes to the parent, if any: */ 7738 /* Propagate pending load changes to the parent, if any: */
@@ -7677,14 +7740,21 @@ static void update_blocked_averages(int cpu)
7677 if (se && !skip_blocked_update(se)) 7740 if (se && !skip_blocked_update(se))
7678 update_load_avg(cfs_rq_of(se), se, 0); 7741 update_load_avg(cfs_rq_of(se), se, 0);
7679 7742
7743 /*
7744 * There can be a lot of idle CPU cgroups. Don't let fully
7745 * decayed cfs_rqs linger on the list.
7746 */
7747 if (cfs_rq_is_decayed(cfs_rq))
7748 list_del_leaf_cfs_rq(cfs_rq);
7749
7680 /* Don't need periodic decay once load/util_avg are null */ 7750 /* Don't need periodic decay once load/util_avg are null */
7681 if (cfs_rq_has_blocked(cfs_rq)) 7751 if (cfs_rq_has_blocked(cfs_rq))
7682 done = false; 7752 done = false;
7683 } 7753 }
7684 7754
7685 curr_class = rq->curr->sched_class; 7755 curr_class = rq->curr->sched_class;
7686 update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); 7756 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7687 update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); 7757 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7688 update_irq_load_avg(rq, 0); 7758 update_irq_load_avg(rq, 0);
7689 /* Don't need periodic decay once load/util_avg are null */ 7759 /* Don't need periodic decay once load/util_avg are null */
7690 if (others_have_blocked(rq)) 7760 if (others_have_blocked(rq))
@@ -7754,11 +7824,11 @@ static inline void update_blocked_averages(int cpu)
7754 7824
7755 rq_lock_irqsave(rq, &rf); 7825 rq_lock_irqsave(rq, &rf);
7756 update_rq_clock(rq); 7826 update_rq_clock(rq);
7757 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7827 update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
7758 7828
7759 curr_class = rq->curr->sched_class; 7829 curr_class = rq->curr->sched_class;
7760 update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); 7830 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7761 update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); 7831 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7762 update_irq_load_avg(rq, 0); 7832 update_irq_load_avg(rq, 0);
7763#ifdef CONFIG_NO_HZ_COMMON 7833#ifdef CONFIG_NO_HZ_COMMON
7764 rq->last_blocked_load_update_tick = jiffies; 7834 rq->last_blocked_load_update_tick = jiffies;
@@ -8452,9 +8522,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8452 if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) 8522 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8453 return 0; 8523 return 0;
8454 8524
8455 env->imbalance = DIV_ROUND_CLOSEST( 8525 env->imbalance = sds->busiest_stat.group_load;
8456 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
8457 SCHED_CAPACITY_SCALE);
8458 8526
8459 return 1; 8527 return 1;
8460} 8528}
@@ -8636,7 +8704,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8636 */ 8704 */
8637 update_sd_lb_stats(env, &sds); 8705 update_sd_lb_stats(env, &sds);
8638 8706
8639 if (static_branch_unlikely(&sched_energy_present)) { 8707 if (sched_energy_enabled()) {
8640 struct root_domain *rd = env->dst_rq->rd; 8708 struct root_domain *rd = env->dst_rq->rd;
8641 8709
8642 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) 8710 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
@@ -8827,21 +8895,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8827 */ 8895 */
8828#define MAX_PINNED_INTERVAL 512 8896#define MAX_PINNED_INTERVAL 512
8829 8897
8830static int need_active_balance(struct lb_env *env) 8898static inline bool
8899asym_active_balance(struct lb_env *env)
8831{ 8900{
8832 struct sched_domain *sd = env->sd; 8901 /*
8902 * ASYM_PACKING needs to force migrate tasks from busy but
8903 * lower priority CPUs in order to pack all tasks in the
8904 * highest priority CPUs.
8905 */
8906 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
8907 sched_asym_prefer(env->dst_cpu, env->src_cpu);
8908}
8833 8909
8834 if (env->idle == CPU_NEWLY_IDLE) { 8910static inline bool
8911voluntary_active_balance(struct lb_env *env)
8912{
8913 struct sched_domain *sd = env->sd;
8835 8914
8836 /* 8915 if (asym_active_balance(env))
8837 * ASYM_PACKING needs to force migrate tasks from busy but 8916 return 1;
8838 * lower priority CPUs in order to pack all tasks in the
8839 * highest priority CPUs.
8840 */
8841 if ((sd->flags & SD_ASYM_PACKING) &&
8842 sched_asym_prefer(env->dst_cpu, env->src_cpu))
8843 return 1;
8844 }
8845 8917
8846 /* 8918 /*
8847 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 8919 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8859,6 +8931,16 @@ static int need_active_balance(struct lb_env *env)
8859 if (env->src_grp_type == group_misfit_task) 8931 if (env->src_grp_type == group_misfit_task)
8860 return 1; 8932 return 1;
8861 8933
8934 return 0;
8935}
8936
8937static int need_active_balance(struct lb_env *env)
8938{
8939 struct sched_domain *sd = env->sd;
8940
8941 if (voluntary_active_balance(env))
8942 return 1;
8943
8862 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 8944 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8863} 8945}
8864 8946
@@ -9023,7 +9105,7 @@ more_balance:
9023 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 9105 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9024 9106
9025 /* Prevent to re-select dst_cpu via env's CPUs */ 9107 /* Prevent to re-select dst_cpu via env's CPUs */
9026 cpumask_clear_cpu(env.dst_cpu, env.cpus); 9108 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
9027 9109
9028 env.dst_rq = cpu_rq(env.new_dst_cpu); 9110 env.dst_rq = cpu_rq(env.new_dst_cpu);
9029 env.dst_cpu = env.new_dst_cpu; 9111 env.dst_cpu = env.new_dst_cpu;
@@ -9050,7 +9132,7 @@ more_balance:
9050 9132
9051 /* All tasks on this runqueue were pinned by CPU affinity */ 9133 /* All tasks on this runqueue were pinned by CPU affinity */
9052 if (unlikely(env.flags & LBF_ALL_PINNED)) { 9134 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9053 cpumask_clear_cpu(cpu_of(busiest), cpus); 9135 __cpumask_clear_cpu(cpu_of(busiest), cpus);
9054 /* 9136 /*
9055 * Attempting to continue load balancing at the current 9137 * Attempting to continue load balancing at the current
9056 * sched_domain level only makes sense if there are 9138 * sched_domain level only makes sense if there are
@@ -9120,7 +9202,7 @@ more_balance:
9120 } else 9202 } else
9121 sd->nr_balance_failed = 0; 9203 sd->nr_balance_failed = 0;
9122 9204
9123 if (likely(!active_balance)) { 9205 if (likely(!active_balance) || voluntary_active_balance(&env)) {
9124 /* We were unbalanced, so reset the balancing interval */ 9206 /* We were unbalanced, so reset the balancing interval */
9125 sd->balance_interval = sd->min_interval; 9207 sd->balance_interval = sd->min_interval;
9126 } else { 9208 } else {
@@ -9469,15 +9551,8 @@ static void kick_ilb(unsigned int flags)
9469} 9551}
9470 9552
9471/* 9553/*
9472 * Current heuristic for kicking the idle load balancer in the presence 9554 * Current decision point for kicking the idle load balancer in the presence
9473 * of an idle cpu in the system. 9555 * of idle CPUs in the system.
9474 * - This rq has more than one task.
9475 * - This rq has at least one CFS task and the capacity of the CPU is
9476 * significantly reduced because of RT tasks or IRQs.
9477 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
9478 * multiple busy cpu.
9479 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9480 * domain span are idle.
9481 */ 9556 */
9482static void nohz_balancer_kick(struct rq *rq) 9557static void nohz_balancer_kick(struct rq *rq)
9483{ 9558{
@@ -9519,8 +9594,13 @@ static void nohz_balancer_kick(struct rq *rq)
9519 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 9594 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9520 if (sds) { 9595 if (sds) {
9521 /* 9596 /*
9522 * XXX: write a coherent comment on why we do this. 9597 * If there is an imbalance between LLC domains (IOW we could
9523 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com 9598 * increase the overall cache use), we need some less-loaded LLC
9599 * domain to pull some load. Likewise, we may need to spread
9600 * load within the current LLC domain (e.g. packed SMT cores but
9601 * other CPUs are idle). We can't really know from here how busy
9602 * the others are - so just get a nohz balance going if it looks
9603 * like this LLC domain has tasks we could move.
9524 */ 9604 */
9525 nr_busy = atomic_read(&sds->nr_busy_cpus); 9605 nr_busy = atomic_read(&sds->nr_busy_cpus);
9526 if (nr_busy > 1) { 9606 if (nr_busy > 1) {
@@ -9533,7 +9613,7 @@ static void nohz_balancer_kick(struct rq *rq)
9533 sd = rcu_dereference(rq->sd); 9613 sd = rcu_dereference(rq->sd);
9534 if (sd) { 9614 if (sd) {
9535 if ((rq->cfs.h_nr_running >= 1) && 9615 if ((rq->cfs.h_nr_running >= 1) &&
9536 check_cpu_capacity(rq, sd)) { 9616 check_cpu_capacity(rq, sd)) {
9537 flags = NOHZ_KICK_MASK; 9617 flags = NOHZ_KICK_MASK;
9538 goto unlock; 9618 goto unlock;
9539 } 9619 }
@@ -9541,11 +9621,7 @@ static void nohz_balancer_kick(struct rq *rq)
9541 9621
9542 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); 9622 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9543 if (sd) { 9623 if (sd) {
9544 for_each_cpu(i, sched_domain_span(sd)) { 9624 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9545 if (i == cpu ||
9546 !cpumask_test_cpu(i, nohz.idle_cpus_mask))
9547 continue;
9548
9549 if (sched_asym_prefer(i, cpu)) { 9625 if (sched_asym_prefer(i, cpu)) {
9550 flags = NOHZ_KICK_MASK; 9626 flags = NOHZ_KICK_MASK;
9551 goto unlock; 9627 goto unlock;
@@ -10546,10 +10622,10 @@ const struct sched_class fair_sched_class = {
10546#ifdef CONFIG_SCHED_DEBUG 10622#ifdef CONFIG_SCHED_DEBUG
10547void print_cfs_stats(struct seq_file *m, int cpu) 10623void print_cfs_stats(struct seq_file *m, int cpu)
10548{ 10624{
10549 struct cfs_rq *cfs_rq; 10625 struct cfs_rq *cfs_rq, *pos;
10550 10626
10551 rcu_read_lock(); 10627 rcu_read_lock();
10552 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 10628 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10553 print_cfs_rq(m, cpu, cfs_rq); 10629 print_cfs_rq(m, cpu, cfs_rq);
10554 rcu_read_unlock(); 10630 rcu_read_unlock();
10555} 10631}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 81faddba9e20..b02d148e7672 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
80 cpumask_andnot(housekeeping_mask, 80 cpumask_andnot(housekeeping_mask,
81 cpu_possible_mask, non_housekeeping_mask); 81 cpu_possible_mask, non_housekeeping_mask);
82 if (cpumask_empty(housekeeping_mask)) 82 if (cpumask_empty(housekeeping_mask))
83 cpumask_set_cpu(smp_processor_id(), housekeeping_mask); 83 __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
84 } else { 84 } else {
85 cpumask_var_t tmp; 85 cpumask_var_t tmp;
86 86
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 90fb5bc12ad4..befce29bd882 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include "sched.h" 28#include "sched.h"
29#include "sched-pelt.h"
30#include "pelt.h" 29#include "pelt.h"
31 30
32/* 31/*
@@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
106 * n=1 105 * n=1
107 */ 106 */
108static __always_inline u32 107static __always_inline u32
109accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, 108accumulate_sum(u64 delta, struct sched_avg *sa,
110 unsigned long load, unsigned long runnable, int running) 109 unsigned long load, unsigned long runnable, int running)
111{ 110{
112 unsigned long scale_freq, scale_cpu;
113 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ 111 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
114 u64 periods; 112 u64 periods;
115 113
116 scale_freq = arch_scale_freq_capacity(cpu);
117 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
118
119 delta += sa->period_contrib; 114 delta += sa->period_contrib;
120 periods = delta / 1024; /* A period is 1024us (~1ms) */ 115 periods = delta / 1024; /* A period is 1024us (~1ms) */
121 116
@@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
137 } 132 }
138 sa->period_contrib = delta; 133 sa->period_contrib = delta;
139 134
140 contrib = cap_scale(contrib, scale_freq);
141 if (load) 135 if (load)
142 sa->load_sum += load * contrib; 136 sa->load_sum += load * contrib;
143 if (runnable) 137 if (runnable)
144 sa->runnable_load_sum += runnable * contrib; 138 sa->runnable_load_sum += runnable * contrib;
145 if (running) 139 if (running)
146 sa->util_sum += contrib * scale_cpu; 140 sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
147 141
148 return periods; 142 return periods;
149} 143}
@@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
177 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 171 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
178 */ 172 */
179static __always_inline int 173static __always_inline int
180___update_load_sum(u64 now, int cpu, struct sched_avg *sa, 174___update_load_sum(u64 now, struct sched_avg *sa,
181 unsigned long load, unsigned long runnable, int running) 175 unsigned long load, unsigned long runnable, int running)
182{ 176{
183 u64 delta; 177 u64 delta;
@@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
221 * Step 1: accumulate *_sum since last_update_time. If we haven't 215 * Step 1: accumulate *_sum since last_update_time. If we haven't
222 * crossed period boundaries, finish. 216 * crossed period boundaries, finish.
223 */ 217 */
224 if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) 218 if (!accumulate_sum(delta, sa, load, runnable, running))
225 return 0; 219 return 0;
226 220
227 return 1; 221 return 1;
@@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
267 * runnable_load_avg = \Sum se->avg.runable_load_avg 261 * runnable_load_avg = \Sum se->avg.runable_load_avg
268 */ 262 */
269 263
270int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) 264int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
271{ 265{
272 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { 266 if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
273 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 267 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
274 return 1; 268 return 1;
275 } 269 }
@@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
277 return 0; 271 return 0;
278} 272}
279 273
280int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) 274int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
281{ 275{
282 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, 276 if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
283 cfs_rq->curr == se)) { 277 cfs_rq->curr == se)) {
284 278
285 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 279 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
@@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e
290 return 0; 284 return 0;
291} 285}
292 286
293int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) 287int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
294{ 288{
295 if (___update_load_sum(now, cpu, &cfs_rq->avg, 289 if (___update_load_sum(now, &cfs_rq->avg,
296 scale_load_down(cfs_rq->load.weight), 290 scale_load_down(cfs_rq->load.weight),
297 scale_load_down(cfs_rq->runnable_weight), 291 scale_load_down(cfs_rq->runnable_weight),
298 cfs_rq->curr != NULL)) { 292 cfs_rq->curr != NULL)) {
@@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
317 311
318int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) 312int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
319{ 313{
320 if (___update_load_sum(now, rq->cpu, &rq->avg_rt, 314 if (___update_load_sum(now, &rq->avg_rt,
321 running, 315 running,
322 running, 316 running,
323 running)) { 317 running)) {
@@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
340 334
341int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) 335int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
342{ 336{
343 if (___update_load_sum(now, rq->cpu, &rq->avg_dl, 337 if (___update_load_sum(now, &rq->avg_dl,
344 running, 338 running,
345 running, 339 running,
346 running)) { 340 running)) {
@@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
365int update_irq_load_avg(struct rq *rq, u64 running) 359int update_irq_load_avg(struct rq *rq, u64 running)
366{ 360{
367 int ret = 0; 361 int ret = 0;
362
363 /*
364 * We can't use clock_pelt because irq time is not accounted in
365 * clock_task. Instead we directly scale the running time to
366 * reflect the real amount of computation
367 */
368 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
369 running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
370
368 /* 371 /*
369 * We know the time that has been used by interrupt since last update 372 * We know the time that has been used by interrupt since last update
370 * but we don't when. Let be pessimistic and assume that interrupt has 373 * but we don't when. Let be pessimistic and assume that interrupt has
371 * happened just before the update. This is not so far from reality 374 * happened just before the update. This is not so far from reality
372 * because interrupt will most probably wake up task and trig an update 375 * because interrupt will most probably wake up task and trig an update
373 * of rq clock during which the metric si updated. 376 * of rq clock during which the metric is updated.
374 * We start to decay with normal context time and then we add the 377 * We start to decay with normal context time and then we add the
375 * interrupt context time. 378 * interrupt context time.
376 * We can safely remove running from rq->clock because 379 * We can safely remove running from rq->clock because
377 * rq->clock += delta with delta >= running 380 * rq->clock += delta with delta >= running
378 */ 381 */
379 ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, 382 ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
380 0, 383 0,
381 0, 384 0,
382 0); 385 0);
383 ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, 386 ret += ___update_load_sum(rq->clock, &rq->avg_irq,
384 1, 387 1,
385 1, 388 1,
386 1); 389 1);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b489ff32..7489d5f56960 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
1#ifdef CONFIG_SMP 1#ifdef CONFIG_SMP
2#include "sched-pelt.h"
2 3
3int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); 4int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
4int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); 5int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
5int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); 6int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
6int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); 7int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
7int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); 8int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
8 9
@@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
42 WRITE_ONCE(avg->util_est.enqueued, enqueued); 43 WRITE_ONCE(avg->util_est.enqueued, enqueued);
43} 44}
44 45
46/*
47 * The clock_pelt scales the time to reflect the effective amount of
48 * computation done during the running delta time but then sync back to
49 * clock_task when rq is idle.
50 *
51 *
52 * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
53 * @ max capacity ------******---------------******---------------
54 * @ half capacity ------************---------************---------
55 * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
56 *
57 */
58static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
59{
60 if (unlikely(is_idle_task(rq->curr))) {
61 /* The rq is idle, we can sync to clock_task */
62 rq->clock_pelt = rq_clock_task(rq);
63 return;
64 }
65
66 /*
67 * When a rq runs at a lower compute capacity, it will need
68 * more time to do the same amount of work than at max
69 * capacity. In order to be invariant, we scale the delta to
70 * reflect how much work has been really done.
71 * Running longer results in stealing idle time that will
72 * disturb the load signal compared to max capacity. This
73 * stolen idle time will be automatically reflected when the
74 * rq will be idle and the clock will be synced with
75 * rq_clock_task.
76 */
77
78 /*
79 * Scale the elapsed time to reflect the real amount of
80 * computation
81 */
82 delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
84
85 rq->clock_pelt += delta;
86}
87
88/*
89 * When rq becomes idle, we have to check if it has lost idle time
90 * because it was fully busy. A rq is fully used when the /Sum util_sum
91 * is greater or equal to:
92 * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
93 * For optimization and computing rounding purpose, we don't take into account
94 * the position in the current window (period_contrib) and we use the higher
95 * bound of util_sum to decide.
96 */
97static inline void update_idle_rq_clock_pelt(struct rq *rq)
98{
99 u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
100 u32 util_sum = rq->cfs.avg.util_sum;
101 util_sum += rq->avg_rt.util_sum;
102 util_sum += rq->avg_dl.util_sum;
103
104 /*
105 * Reflecting stolen time makes sense only if the idle
106 * phase would be present at max capacity. As soon as the
107 * utilization of a rq has reached the maximum value, it is
 108 * considered as an always running rq without idle time to
109 * steal. This potential idle time is considered as lost in
 110 * this case. We keep track of this lost idle time compared to
111 * rq's clock_task.
112 */
113 if (util_sum >= divider)
114 rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
115}
116
117static inline u64 rq_clock_pelt(struct rq *rq)
118{
119 lockdep_assert_held(&rq->lock);
120 assert_clock_updated(rq);
121
122 return rq->clock_pelt - rq->lost_idle_time;
123}
124
125#ifdef CONFIG_CFS_BANDWIDTH
126/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
127static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
128{
129 if (unlikely(cfs_rq->throttle_count))
130 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
131
132 return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
133}
134#else
135static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
136{
137 return rq_clock_pelt(rq_of(cfs_rq));
138}
139#endif
140
45#else 141#else
46 142
47static inline int 143static inline int
@@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
67{ 163{
68 return 0; 164 return 0;
69} 165}
166
167static inline u64 rq_clock_pelt(struct rq *rq)
168{
169 return rq_clock_task(rq);
170}
171
172static inline void
173update_rq_clock_pelt(struct rq *rq, s64 delta) { }
174
175static inline void
176update_idle_rq_clock_pelt(struct rq *rq) { }
177
70#endif 178#endif
71 179
72 180
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e4f398ad9e73..90fa23d36565 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1587 * rt task 1587 * rt task
1588 */ 1588 */
1589 if (rq->curr->sched_class != &rt_sched_class) 1589 if (rq->curr->sched_class != &rt_sched_class)
1590 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); 1590 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1591 1591
1592 return p; 1592 return p;
1593} 1593}
@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1596{ 1596{
1597 update_curr_rt(rq); 1597 update_curr_rt(rq);
1598 1598
1599 update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); 1599 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1600 1600
1601 /* 1601 /*
1602 * The previous task needs to be made eligible for pushing 1602 * The previous task needs to be made eligible for pushing
@@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2325 struct sched_rt_entity *rt_se = &p->rt; 2325 struct sched_rt_entity *rt_se = &p->rt;
2326 2326
2327 update_curr_rt(rq); 2327 update_curr_rt(rq);
2328 update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); 2328 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2329 2329
2330 watchdog(rq, p); 2330 watchdog(rq, p);
2331 2331
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6665b9c02e2f..efa686eeff26 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -861,7 +861,10 @@ struct rq {
861 861
862 unsigned int clock_update_flags; 862 unsigned int clock_update_flags;
863 u64 clock; 863 u64 clock;
864 u64 clock_task; 864 /* Ensure that all clocks are in the same cache line */
865 u64 clock_task ____cacheline_aligned;
866 u64 clock_pelt;
867 unsigned long lost_idle_time;
865 868
866 atomic_t nr_iowait; 869 atomic_t nr_iowait;
867 870
@@ -951,6 +954,22 @@ struct rq {
951#endif 954#endif
952}; 955};
953 956
957#ifdef CONFIG_FAIR_GROUP_SCHED
958
959/* CPU runqueue to which this cfs_rq is attached */
960static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
961{
962 return cfs_rq->rq;
963}
964
965#else
966
967static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
968{
969 return container_of(cfs_rq, struct rq, cfs);
970}
971#endif
972
954static inline int cpu_of(struct rq *rq) 973static inline int cpu_of(struct rq *rq)
955{ 974{
956#ifdef CONFIG_SMP 975#ifdef CONFIG_SMP
@@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1460 */ 1479 */
1461 smp_wmb(); 1480 smp_wmb();
1462#ifdef CONFIG_THREAD_INFO_IN_TASK 1481#ifdef CONFIG_THREAD_INFO_IN_TASK
1463 p->cpu = cpu; 1482 WRITE_ONCE(p->cpu, cpu);
1464#else 1483#else
1465 task_thread_info(p)->cpu = cpu; 1484 WRITE_ONCE(task_thread_info(p)->cpu, cpu);
1466#endif 1485#endif
1467 p->wake_cpu = cpu; 1486 p->wake_cpu = cpu;
1468#endif 1487#endif
@@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p)
1563 1582
1564static inline int task_on_rq_migrating(struct task_struct *p) 1583static inline int task_on_rq_migrating(struct task_struct *p)
1565{ 1584{
1566 return p->on_rq == TASK_ON_RQ_MIGRATING; 1585 return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
1567} 1586}
1568 1587
1569/* 1588/*
@@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1781unsigned long to_ratio(u64 period, u64 runtime); 1800unsigned long to_ratio(u64 period, u64 runtime);
1782 1801
1783extern void init_entity_runnable_average(struct sched_entity *se); 1802extern void init_entity_runnable_average(struct sched_entity *se);
1784extern void post_init_entity_util_avg(struct sched_entity *se); 1803extern void post_init_entity_util_avg(struct task_struct *p);
1785 1804
1786#ifdef CONFIG_NO_HZ_FULL 1805#ifdef CONFIG_NO_HZ_FULL
1787extern bool sched_can_stop_tick(struct rq *rq); 1806extern bool sched_can_stop_tick(struct rq *rq);
@@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2211# define arch_scale_freq_invariant() false 2230# define arch_scale_freq_invariant() false
2212#endif 2231#endif
2213 2232
2233#ifdef CONFIG_SMP
2234static inline unsigned long capacity_orig_of(int cpu)
2235{
2236 return cpu_rq(cpu)->cpu_capacity_orig;
2237}
2238#endif
2239
2214#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2240#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2215/** 2241/**
2216 * enum schedutil_type - CPU utilization type 2242 * enum schedutil_type - CPU utilization type
@@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
2299#endif 2325#endif
2300 2326
2301#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 2327#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2328
2302#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) 2329#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
2303#else 2330
2331DECLARE_STATIC_KEY_FALSE(sched_energy_present);
2332
2333static inline bool sched_energy_enabled(void)
2334{
2335 return static_branch_unlikely(&sched_energy_present);
2336}
2337
2338#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
2339
2304#define perf_domain_span(pd) NULL 2340#define perf_domain_span(pd) NULL
2305#endif 2341static inline bool sched_energy_enabled(void) { return false; }
2306 2342
2307#ifdef CONFIG_SMP 2343#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2308extern struct static_key_false sched_energy_present;
2309#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 7d905f55e7fa..ab7f371a3a17 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
201 return 1; 201 return 1;
202} 202}
203 203
204DEFINE_STATIC_KEY_FALSE(sched_energy_present);
205#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 204#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
205DEFINE_STATIC_KEY_FALSE(sched_energy_present);
206unsigned int sysctl_sched_energy_aware = 1;
206DEFINE_MUTEX(sched_energy_mutex); 207DEFINE_MUTEX(sched_energy_mutex);
207bool sched_energy_update; 208bool sched_energy_update;
208 209
210#ifdef CONFIG_PROC_SYSCTL
211int sched_energy_aware_handler(struct ctl_table *table, int write,
212 void __user *buffer, size_t *lenp, loff_t *ppos)
213{
214 int ret, state;
215
216 if (write && !capable(CAP_SYS_ADMIN))
217 return -EPERM;
218
219 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
220 if (!ret && write) {
221 state = static_branch_unlikely(&sched_energy_present);
222 if (state != sysctl_sched_energy_aware) {
223 mutex_lock(&sched_energy_mutex);
224 sched_energy_update = 1;
225 rebuild_sched_domains();
226 sched_energy_update = 0;
227 mutex_unlock(&sched_energy_mutex);
228 }
229 }
230
231 return ret;
232}
233#endif
234
209static void free_pd(struct perf_domain *pd) 235static void free_pd(struct perf_domain *pd)
210{ 236{
211 struct perf_domain *tmp; 237 struct perf_domain *tmp;
@@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
322 struct cpufreq_policy *policy; 348 struct cpufreq_policy *policy;
323 struct cpufreq_governor *gov; 349 struct cpufreq_governor *gov;
324 350
351 if (!sysctl_sched_energy_aware)
352 goto free;
353
325 /* EAS is enabled for asymmetric CPU capacity topologies. */ 354 /* EAS is enabled for asymmetric CPU capacity topologies. */
326 if (!per_cpu(sd_asym_cpucapacity, cpu)) { 355 if (!per_cpu(sd_asym_cpucapacity, cpu)) {
327 if (sched_debug()) { 356 if (sched_debug()) {
@@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
676} 705}
677 706
678struct s_data { 707struct s_data {
679 struct sched_domain ** __percpu sd; 708 struct sched_domain * __percpu *sd;
680 struct root_domain *rd; 709 struct root_domain *rd;
681}; 710};
682 711
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7578e21a711b..7c2b9bc88ee8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -472,6 +472,17 @@ static struct ctl_table kern_table[] = {
472 .extra1 = &one, 472 .extra1 = &one,
473 }, 473 },
474#endif 474#endif
475#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
476 {
477 .procname = "sched_energy_aware",
478 .data = &sysctl_sched_energy_aware,
479 .maxlen = sizeof(unsigned int),
480 .mode = 0644,
481 .proc_handler = sched_energy_aware_handler,
482 .extra1 = &zero,
483 .extra2 = &one,
484 },
485#endif
475#ifdef CONFIG_PROVE_LOCKING 486#ifdef CONFIG_PROVE_LOCKING
476 { 487 {
477 .procname = "prove_locking", 488 .procname = "prove_locking",