diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-06 11:14:05 -0500 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-06 11:14:05 -0500 |
| commit | 45802da05e666a81b421422d3e302930c0e24e77 (patch) | |
| tree | feca43796693395bb2912c59768dc809022e7583 | |
| parent | 203b6609e0ede49eb0b97008b1150c69e9d2ffd3 (diff) | |
| parent | ad01423aedaa7c6dd62d560b73a3cb39e6da3901 (diff) | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main changes in this cycle were:
- refcount conversions
- Solve the rq->leaf_cfs_rq_list can of worms for real.
- improve power-aware scheduling
- add sysctl knob for Energy Aware Scheduling
- documentation updates
- misc other changes"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
kthread: Do not use TIMER_IRQSAFE
kthread: Convert worker lock to raw spinlock
sched/fair: Use non-atomic cpumask_{set,clear}_cpu()
sched/fair: Remove unused 'sd' parameter from select_idle_smt()
sched/wait: Use freezable_schedule() when possible
sched/fair: Prune, fix and simplify the nohz_balancer_kick() comment block
sched/fair: Explain LLC nohz kick condition
sched/fair: Simplify nohz_balancer_kick()
sched/topology: Fix percpu data types in struct sd_data & struct s_data
sched/fair: Simplify post_init_entity_util_avg() by calling it with a task_struct pointer argument
sched/fair: Fix O(nr_cgroups) in the load balancing path
sched/fair: Optimize update_blocked_averages()
sched/fair: Fix insertion in rq->leaf_cfs_rq_list
sched/fair: Add tmp_alone_branch assertion
sched/core: Use READ_ONCE()/WRITE_ONCE() in move_queued_task()/task_rq_lock()
sched/debug: Initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK
sched/pelt: Skip updating util_est when utilization is higher than CPU's capacity
sched/fair: Update scale invariance of PELT
sched/fair: Move the rq_of() helper function
sched/core: Convert task_struct.stack_refcount to refcount_t
...
29 files changed, 1165 insertions, 324 deletions
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt new file mode 100644 index 000000000000..a2b0ae4c76bd --- /dev/null +++ b/Documentation/power/energy-model.txt | |||
| @@ -0,0 +1,144 @@ | |||
| 1 | ==================== | ||
| 2 | Energy Model of CPUs | ||
| 3 | ==================== | ||
| 4 | |||
| 5 | 1. Overview | ||
| 6 | ----------- | ||
| 7 | |||
| 8 | The Energy Model (EM) framework serves as an interface between drivers knowing | ||
| 9 | the power consumed by CPUs at various performance levels, and the kernel | ||
| 10 | subsystems willing to use that information to make energy-aware decisions. | ||
| 11 | |||
| 12 | The source of the information about the power consumed by CPUs can vary greatly | ||
| 13 | from one platform to another. These power costs can be estimated using | ||
| 14 | devicetree data in some cases. In others, the firmware will know better. | ||
| 15 | Alternatively, userspace might be best positioned. And so on. In order to avoid | ||
| 16 | each and every client subsystem having to re-implement support for each and every | ||
| 17 | possible source of information on its own, the EM framework intervenes as an | ||
| 18 | abstraction layer which standardizes the format of power cost tables in the | ||
| 19 | kernel, hence enabling to avoid redundant work. | ||
| 20 | |||
| 21 | The figure below depicts an example of drivers (Arm-specific here, but the | ||
| 22 | approach is applicable to any architecture) providing power costs to the EM | ||
| 23 | framework, and interested clients reading the data from it. | ||
| 24 | |||
| 25 | +---------------+ +-----------------+ +---------------+ | ||
| 26 | | Thermal (IPA) | | Scheduler (EAS) | | Other | | ||
| 27 | +---------------+ +-----------------+ +---------------+ | ||
| 28 | | | em_pd_energy() | | ||
| 29 | | | em_cpu_get() | | ||
| 30 | +---------+ | +---------+ | ||
| 31 | | | | | ||
| 32 | v v v | ||
| 33 | +---------------------+ | ||
| 34 | | Energy Model | | ||
| 35 | | Framework | | ||
| 36 | +---------------------+ | ||
| 37 | ^ ^ ^ | ||
| 38 | | | | em_register_perf_domain() | ||
| 39 | +----------+ | +---------+ | ||
| 40 | | | | | ||
| 41 | +---------------+ +---------------+ +--------------+ | ||
| 42 | | cpufreq-dt | | arm_scmi | | Other | | ||
| 43 | +---------------+ +---------------+ +--------------+ | ||
| 44 | ^ ^ ^ | ||
| 45 | | | | | ||
| 46 | +--------------+ +---------------+ +--------------+ | ||
| 47 | | Device Tree | | Firmware | | ? | | ||
| 48 | +--------------+ +---------------+ +--------------+ | ||
| 49 | |||
| 50 | The EM framework manages power cost tables per 'performance domain' in the | ||
| 51 | system. A performance domain is a group of CPUs whose performance is scaled | ||
| 52 | together. Performance domains generally have a 1-to-1 mapping with CPUFreq | ||
| 53 | policies. All CPUs in a performance domain are required to have the same | ||
| 54 | micro-architecture. CPUs in different performance domains can have different | ||
| 55 | micro-architectures. | ||
| 56 | |||
| 57 | |||
| 58 | 2. Core APIs | ||
| 59 | ------------ | ||
| 60 | |||
| 61 | 2.1 Config options | ||
| 62 | |||
| 63 | CONFIG_ENERGY_MODEL must be enabled to use the EM framework. | ||
| 64 | |||
| 65 | |||
| 66 | 2.2 Registration of performance domains | ||
| 67 | |||
| 68 | Drivers are expected to register performance domains into the EM framework by | ||
| 69 | calling the following API: | ||
| 70 | |||
| 71 | int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, | ||
| 72 | struct em_data_callback *cb); | ||
| 73 | |||
| 74 | Drivers must specify the CPUs of the performance domains using the cpumask | ||
| 75 | argument, and provide a callback function returning <frequency, power> tuples | ||
| 76 | for each capacity state. The callback function provided by the driver is free | ||
| 77 | to fetch data from any relevant location (DT, firmware, ...), and by any means | ||
| 78 | deemed necessary. See Section 3. for an example of driver implementing this | ||
| 79 | callback, and kernel/power/energy_model.c for further documentation on this | ||
| 80 | API. | ||
| 81 | |||
| 82 | |||
| 83 | 2.3 Accessing performance domains | ||
| 84 | |||
| 85 | Subsystems interested in the energy model of a CPU can retrieve it using the | ||
| 86 | em_cpu_get() API. The energy model tables are allocated once upon creation of | ||
| 87 | the performance domains, and kept in memory untouched. | ||
| 88 | |||
| 89 | The energy consumed by a performance domain can be estimated using the | ||
| 90 | em_pd_energy() API. The estimation is performed assuming that the schedutil | ||
| 91 | CPUfreq governor is in use. | ||
| 92 | |||
| 93 | More details about the above APIs can be found in include/linux/energy_model.h. | ||
| 94 | |||
| 95 | |||
| 96 | 3. Example driver | ||
| 97 | ----------------- | ||
| 98 | |||
| 99 | This section provides a simple example of a CPUFreq driver registering a | ||
| 100 | performance domain in the Energy Model framework using the (fake) 'foo' | ||
| 101 | protocol. The driver implements an est_power() function to be provided to the | ||
| 102 | EM framework. | ||
| 103 | |||
| 104 | -> drivers/cpufreq/foo_cpufreq.c | ||
| 105 | |||
| 106 | 01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu) | ||
| 107 | 02 { | ||
| 108 | 03 long freq, power; | ||
| 109 | 04 | ||
| 110 | 05 /* Use the 'foo' protocol to ceil the frequency */ | ||
| 111 | 06 freq = foo_get_freq_ceil(cpu, *KHz); | ||
| 112 | 07 if (freq < 0) | ||
| 113 | 08 return freq; | ||
| 114 | 09 | ||
| 115 | 10 /* Estimate the power cost for the CPU at the relevant freq. */ | ||
| 116 | 11 power = foo_estimate_power(cpu, freq); | ||
| 117 | 12 if (power < 0) | ||
| 118 | 13 return power; | ||
| 119 | 14 | ||
| 120 | 15 /* Return the values to the EM framework */ | ||
| 121 | 16 *mW = power; | ||
| 122 | 17 *KHz = freq; | ||
| 123 | 18 | ||
| 124 | 19 return 0; | ||
| 125 | 20 } | ||
| 126 | 21 | ||
| 127 | 22 static int foo_cpufreq_init(struct cpufreq_policy *policy) | ||
| 128 | 23 { | ||
| 129 | 24 struct em_data_callback em_cb = EM_DATA_CB(est_power); | ||
| 130 | 25 int nr_opp, ret; | ||
| 131 | 26 | ||
| 132 | 27 /* Do the actual CPUFreq init work ... */ | ||
| 133 | 28 ret = do_foo_cpufreq_init(policy); | ||
| 134 | 29 if (ret) | ||
| 135 | 30 return ret; | ||
| 136 | 31 | ||
| 137 | 32 /* Find the number of OPPs for this policy */ | ||
| 138 | 33 nr_opp = foo_get_nr_opp(policy); | ||
| 139 | 34 | ||
| 140 | 35 /* And register the new performance domain */ | ||
| 141 | 36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb); | ||
| 142 | 37 | ||
| 143 | 38 return 0; | ||
| 144 | 39 } | ||
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt new file mode 100644 index 000000000000..197d81f4b836 --- /dev/null +++ b/Documentation/scheduler/sched-energy.txt | |||
| @@ -0,0 +1,425 @@ | |||
| 1 | ======================= | ||
| 2 | Energy Aware Scheduling | ||
| 3 | ======================= | ||
| 4 | |||
| 5 | 1. Introduction | ||
| 6 | --------------- | ||
| 7 | |||
| 8 | Energy Aware Scheduling (or EAS) gives the scheduler the ability to predict | ||
| 9 | the impact of its decisions on the energy consumed by CPUs. EAS relies on an | ||
| 10 | Energy Model (EM) of the CPUs to select an energy efficient CPU for each task, | ||
| 11 | with a minimal impact on throughput. This document aims at providing an | ||
| 12 | introduction on how EAS works, what are the main design decisions behind it, and | ||
| 13 | details what is needed to get it to run. | ||
| 14 | |||
| 15 | Before going any further, please note that at the time of writing: | ||
| 16 | |||
| 17 | /!\ EAS does not support platforms with symmetric CPU topologies /!\ | ||
| 18 | |||
| 19 | EAS operates only on heterogeneous CPU topologies (such as Arm big.LITTLE) | ||
| 20 | because this is where the potential for saving energy through scheduling is | ||
| 21 | the highest. | ||
| 22 | |||
| 23 | The actual EM used by EAS is _not_ maintained by the scheduler, but by a | ||
| 24 | dedicated framework. For details about this framework and what it provides, | ||
| 25 | please refer to its documentation (see Documentation/power/energy-model.txt). | ||
| 26 | |||
| 27 | |||
| 28 | 2. Background and Terminology | ||
| 29 | ----------------------------- | ||
| 30 | |||
| 31 | To make it clear from the start: | ||
| 32 | - energy = [joule] (resource like a battery on powered devices) | ||
| 33 | - power = energy/time = [joule/second] = [watt] | ||
| 34 | |||
| 35 | The goal of EAS is to minimize energy, while still getting the job done. That | ||
| 36 | is, we want to maximize: | ||
| 37 | |||
| 38 | performance [inst/s] | ||
| 39 | -------------------- | ||
| 40 | power [W] | ||
| 41 | |||
| 42 | which is equivalent to minimizing: | ||
| 43 | |||
| 44 | energy [J] | ||
| 45 | ----------- | ||
| 46 | instruction | ||
| 47 | |||
| 48 | while still getting 'good' performance. It is essentially an alternative | ||
| 49 | optimization objective to the current performance-only objective for the | ||
| 50 | scheduler. This alternative considers two objectives: energy-efficiency and | ||
| 51 | performance. | ||
| 52 | |||
| 53 | The idea behind introducing an EM is to allow the scheduler to evaluate the | ||
| 54 | implications of its decisions rather than blindly applying energy-saving | ||
| 55 | techniques that may have positive effects only on some platforms. At the same | ||
| 56 | time, the EM must be as simple as possible to minimize the scheduler latency | ||
| 57 | impact. | ||
| 58 | |||
| 59 | In short, EAS changes the way CFS tasks are assigned to CPUs. When it is time | ||
| 60 | for the scheduler to decide where a task should run (during wake-up), the EM | ||
| 61 | is used to break the tie between several good CPU candidates and pick the one | ||
| 62 | that is predicted to yield the best energy consumption without harming the | ||
| 63 | system's throughput. The predictions made by EAS rely on specific elements of | ||
| 64 | knowledge about the platform's topology, which include the 'capacity' of CPUs, | ||
| 65 | and their respective energy costs. | ||
| 66 | |||
| 67 | |||
| 68 | 3. Topology information | ||
| 69 | ----------------------- | ||
| 70 | |||
| 71 | EAS (as well as the rest of the scheduler) uses the notion of 'capacity' to | ||
| 72 | differentiate CPUs with different computing throughput. The 'capacity' of a CPU | ||
| 73 | represents the amount of work it can absorb when running at its highest | ||
| 74 | frequency compared to the most capable CPU of the system. Capacity values are | ||
| 75 | normalized in a 1024 range, and are comparable with the utilization signals of | ||
| 76 | tasks and CPUs computed by the Per-Entity Load Tracking (PELT) mechanism. Thanks | ||
| 77 | to capacity and utilization values, EAS is able to estimate how big/busy a | ||
| 78 | task/CPU is, and to take this into consideration when evaluating performance vs | ||
| 79 | energy trade-offs. The capacity of CPUs is provided via arch-specific code | ||
| 80 | through the arch_scale_cpu_capacity() callback. | ||
| 81 | |||
| 82 | The rest of platform knowledge used by EAS is directly read from the Energy | ||
| 83 | Model (EM) framework. The EM of a platform is composed of a power cost table | ||
| 84 | per 'performance domain' in the system (see Documentation/power/energy-model.txt | ||
| 85 | for further details about performance domains). | ||
| 86 | |||
| 87 | The scheduler manages references to the EM objects in the topology code when the | ||
| 88 | scheduling domains are built, or re-built. For each root domain (rd), the | ||
| 89 | scheduler maintains a singly linked list of all performance domains intersecting | ||
| 90 | the current rd->span. Each node in the list contains a pointer to a struct | ||
| 91 | em_perf_domain as provided by the EM framework. | ||
| 92 | |||
| 93 | The lists are attached to the root domains in order to cope with exclusive | ||
| 94 | cpuset configurations. Since the boundaries of exclusive cpusets do not | ||
| 95 | necessarily match those of performance domains, the lists of different root | ||
| 96 | domains can contain duplicate elements. | ||
| 97 | |||
| 98 | Example 1. | ||
| 99 | Let us consider a platform with 12 CPUs, split in 3 performance domains | ||
| 100 | (pd0, pd4 and pd8), organized as follows: | ||
| 101 | |||
| 102 | CPUs: 0 1 2 3 4 5 6 7 8 9 10 11 | ||
| 103 | PDs: |--pd0--|--pd4--|---pd8---| | ||
| 104 | RDs: |----rd1----|-----rd2-----| | ||
| 105 | |||
| 106 | Now, consider that userspace decided to split the system with two | ||
| 107 | exclusive cpusets, hence creating two independent root domains, each | ||
| 108 | containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the | ||
| 109 | above figure. Since pd4 intersects with both rd1 and rd2, it will be | ||
| 110 | present in the linked list '->pd' attached to each of them: | ||
| 111 | * rd1->pd: pd0 -> pd4 | ||
| 112 | * rd2->pd: pd4 -> pd8 | ||
| 113 | |||
| 114 | Please note that the scheduler will create two duplicate list nodes for | ||
| 115 | pd4 (one for each list). However, both just hold a pointer to the same | ||
| 116 | shared data structure of the EM framework. | ||
| 117 | |||
| 118 | Since the access to these lists can happen concurrently with hotplug and other | ||
| 119 | things, they are protected by RCU, like the rest of topology structures | ||
| 120 | manipulated by the scheduler. | ||
| 121 | |||
| 122 | EAS also maintains a static key (sched_energy_present) which is enabled when at | ||
| 123 | least one root domain meets all conditions for EAS to start. Those conditions | ||
| 124 | are summarized in Section 6. | ||
| 125 | |||
| 126 | |||
| 127 | 4. Energy-Aware task placement | ||
| 128 | ------------------------------ | ||
| 129 | |||
| 130 | EAS overrides the CFS task wake-up balancing code. It uses the EM of the | ||
| 131 | platform and the PELT signals to choose an energy-efficient target CPU during | ||
| 132 | wake-up balance. When EAS is enabled, select_task_rq_fair() calls | ||
| 133 | find_energy_efficient_cpu() to do the placement decision. This function looks | ||
| 134 | for the CPU with the highest spare capacity (CPU capacity - CPU utilization) in | ||
| 135 | each performance domain since it is the one which will allow us to keep the | ||
| 136 | frequency the lowest. Then, the function checks if placing the task there could | ||
| 137 | save energy compared to leaving it on prev_cpu, i.e. the CPU where the task ran | ||
| 138 | in its previous activation. | ||
| 139 | |||
| 140 | find_energy_efficient_cpu() uses compute_energy() to estimate what will be the | ||
| 141 | energy consumed by the system if the waking task was migrated. compute_energy() | ||
| 142 | looks at the current utilization landscape of the CPUs and adjusts it to | ||
| 143 | 'simulate' the task migration. The EM framework provides the em_pd_energy() API | ||
| 144 | which computes the expected energy consumption of each performance domain for | ||
| 145 | the given utilization landscape. | ||
| 146 | |||
| 147 | An example of energy-optimized task placement decision is detailed below. | ||
| 148 | |||
| 149 | Example 2. | ||
| 150 | Let us consider a (fake) platform with 2 independent performance domains | ||
| 151 | composed of two CPUs each. CPU0 and CPU1 are little CPUs; CPU2 and CPU3 | ||
| 152 | are big. | ||
| 153 | |||
| 154 | The scheduler must decide where to place a task P whose util_avg = 200 | ||
| 155 | and prev_cpu = 0. | ||
| 156 | |||
| 157 | The current utilization landscape of the CPUs is depicted on the graph | ||
| 158 | below. CPUs 0-3 have a util_avg of 400, 100, 600 and 500 respectively | ||
| 159 | Each performance domain has three Operating Performance Points (OPPs). | ||
| 160 | The CPU capacity and power cost associated with each OPP is listed in | ||
| 161 | the Energy Model table. The util_avg of P is shown on the figures | ||
| 162 | below as 'PP'. | ||
| 163 | |||
| 164 | CPU util. | ||
| 165 | 1024 - - - - - - - Energy Model | ||
| 166 | +-----------+-------------+ | ||
| 167 | | Little | Big | | ||
| 168 | 768 ============= +-----+-----+------+------+ | ||
| 169 | | Cap | Pwr | Cap | Pwr | | ||
| 170 | +-----+-----+------+------+ | ||
| 171 | 512 =========== - ##- - - - - | 170 | 50 | 512 | 400 | | ||
| 172 | ## ## | 341 | 150 | 768 | 800 | | ||
| 173 | 341 -PP - - - - ## ## | 512 | 300 | 1024 | 1700 | | ||
| 174 | PP ## ## +-----+-----+------+------+ | ||
| 175 | 170 -## - - - - ## ## | ||
| 176 | ## ## ## ## | ||
| 177 | ------------ ------------- | ||
| 178 | CPU0 CPU1 CPU2 CPU3 | ||
| 179 | |||
| 180 | Current OPP: ===== Other OPP: - - - util_avg (100 each): ## | ||
| 181 | |||
| 182 | |||
| 183 | find_energy_efficient_cpu() will first look for the CPUs with the | ||
| 184 | maximum spare capacity in the two performance domains. In this example, | ||
| 185 | CPU1 and CPU3. Then it will estimate the energy of the system if P was | ||
| 186 | placed on either of them, and check if that would save some energy | ||
| 187 | compared to leaving P on CPU0. EAS assumes that OPPs follow utilization | ||
| 188 | (which is coherent with the behaviour of the schedutil CPUFreq | ||
| 189 | governor, see Section 6. for more details on this topic). | ||
| 190 | |||
| 191 | Case 1. P is migrated to CPU1 | ||
| 192 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
| 193 | |||
| 194 | 1024 - - - - - - - | ||
| 195 | |||
| 196 | Energy calculation: | ||
| 197 | 768 ============= * CPU0: 200 / 341 * 150 = 88 | ||
| 198 | * CPU1: 300 / 341 * 150 = 131 | ||
| 199 | * CPU2: 600 / 768 * 800 = 625 | ||
| 200 | 512 - - - - - - - ##- - - - - * CPU3: 500 / 768 * 800 = 520 | ||
| 201 | ## ## => total_energy = 1364 | ||
| 202 | 341 =========== ## ## | ||
| 203 | PP ## ## | ||
| 204 | 170 -## - - PP- ## ## | ||
| 205 | ## ## ## ## | ||
| 206 | ------------ ------------- | ||
| 207 | CPU0 CPU1 CPU2 CPU3 | ||
| 208 | |||
| 209 | |||
| 210 | Case 2. P is migrated to CPU3 | ||
| 211 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
| 212 | |||
| 213 | 1024 - - - - - - - | ||
| 214 | |||
| 215 | Energy calculation: | ||
| 216 | 768 ============= * CPU0: 200 / 341 * 150 = 88 | ||
| 217 | * CPU1: 100 / 341 * 150 = 43 | ||
| 218 | PP * CPU2: 600 / 768 * 800 = 625 | ||
| 219 | 512 - - - - - - - ##- - -PP - * CPU3: 700 / 768 * 800 = 729 | ||
| 220 | ## ## => total_energy = 1485 | ||
| 221 | 341 =========== ## ## | ||
| 222 | ## ## | ||
| 223 | 170 -## - - - - ## ## | ||
| 224 | ## ## ## ## | ||
| 225 | ------------ ------------- | ||
| 226 | CPU0 CPU1 CPU2 CPU3 | ||
| 227 | |||
| 228 | |||
| 229 | Case 3. P stays on prev_cpu / CPU 0 | ||
| 230 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
| 231 | |||
| 232 | 1024 - - - - - - - | ||
| 233 | |||
| 234 | Energy calculation: | ||
| 235 | 768 ============= * CPU0: 400 / 512 * 300 = 234 | ||
| 236 | * CPU1: 100 / 512 * 300 = 58 | ||
| 237 | * CPU2: 600 / 768 * 800 = 625 | ||
| 238 | 512 =========== - ##- - - - - * CPU3: 500 / 768 * 800 = 520 | ||
| 239 | ## ## => total_energy = 1437 | ||
| 240 | 341 -PP - - - - ## ## | ||
| 241 | PP ## ## | ||
| 242 | 170 -## - - - - ## ## | ||
| 243 | ## ## ## ## | ||
| 244 | ------------ ------------- | ||
| 245 | CPU0 CPU1 CPU2 CPU3 | ||
| 246 | |||
| 247 | |||
| 248 | From these calculations, Case 1 has the lowest total energy. So CPU 1 | ||
| 249 | is the best candidate from an energy-efficiency standpoint. | ||
| 250 | |||
| 251 | Big CPUs are generally more power hungry than the little ones and are thus used | ||
| 252 | mainly when a task doesn't fit the littles. However, little CPUs aren't always | ||
| 253 | necessarily more energy-efficient than big CPUs. For some systems, the high OPPs | ||
| 254 | of the little CPUs can be less energy-efficient than the lowest OPPs of the | ||
| 255 | bigs, for example. So, if the little CPUs happen to have enough utilization at | ||
| 256 | a specific point in time, a small task waking up at that moment could be better | ||
| 257 | off executing on the big side in order to save energy, even though it would fit | ||
| 258 | on the little side. | ||
| 259 | |||
| 260 | And even in the case where all OPPs of the big CPUs are less energy-efficient | ||
| 261 | than those of the little, using the big CPUs for a small task might still, under | ||
| 262 | specific conditions, save energy. Indeed, placing a task on a little CPU can | ||
| 263 | result in raising the OPP of the entire performance domain, and that will | ||
| 264 | increase the cost of the tasks already running there. If the waking task is | ||
| 265 | placed on a big CPU, its own execution cost might be higher than if it was | ||
| 266 | running on a little, but it won't impact the other tasks of the little CPUs | ||
| 267 | which will keep running at a lower OPP. So, when considering the total energy | ||
| 268 | consumed by CPUs, the extra cost of running that one task on a big core can be | ||
| 269 | smaller than the cost of raising the OPP on the little CPUs for all the other | ||
| 270 | tasks. | ||
| 271 | |||
| 272 | The examples above would be nearly impossible to get right in a generic way, and | ||
| 273 | for all platforms, without knowing the cost of running at different OPPs on all | ||
| 274 | CPUs of the system. Thanks to its EM-based design, EAS should cope with them | ||
| 275 | correctly without too many troubles. However, in order to ensure a minimal | ||
| 276 | impact on throughput for high-utilization scenarios, EAS also implements another | ||
| 277 | mechanism called 'over-utilization'. | ||
| 278 | |||
| 279 | |||
| 280 | 5. Over-utilization | ||
| 281 | ------------------- | ||
| 282 | |||
| 283 | From a general standpoint, the use-cases where EAS can help the most are those | ||
| 284 | involving a light/medium CPU utilization. Whenever long CPU-bound tasks are | ||
| 285 | being run, they will require all of the available CPU capacity, and there isn't | ||
| 286 | much that can be done by the scheduler to save energy without severely harming | ||
| 287 | throughput. In order to avoid hurting performance with EAS, CPUs are flagged as | ||
| 288 | 'over-utilized' as soon as they are used at more than 80% of their compute | ||
| 289 | capacity. As long as no CPUs are over-utilized in a root domain, load balancing | ||
| 290 | is disabled and EAS overrides the wake-up balancing code. EAS is likely to load | ||
| 291 | the most energy efficient CPUs of the system more than the others if that can be | ||
| 292 | done without harming throughput. So, the load-balancer is disabled to prevent | ||
| 293 | it from breaking the energy-efficient task placement found by EAS. It is safe to | ||
| 294 | do so when the system isn't overutilized since being below the 80% tipping point | ||
| 295 | implies that: | ||
| 296 | |||
| 297 | a. there is some idle time on all CPUs, so the utilization signals used by | ||
| 298 | EAS are likely to accurately represent the 'size' of the various tasks | ||
| 299 | in the system; | ||
| 300 | b. all tasks should already be provided with enough CPU capacity, | ||
| 301 | regardless of their nice values; | ||
| 302 | c. since there is spare capacity all tasks must be blocking/sleeping | ||
| 303 | regularly and balancing at wake-up is sufficient. | ||
| 304 | |||
| 305 | As soon as one CPU goes above the 80% tipping point, at least one of the three | ||
| 306 | assumptions above becomes incorrect. In this scenario, the 'overutilized' flag | ||
| 307 | is raised for the entire root domain, EAS is disabled, and the load-balancer is | ||
| 308 | re-enabled. By doing so, the scheduler falls back onto load-based algorithms for | ||
| 309 | wake-up and load balance under CPU-bound conditions. This provides a better | ||
| 310 | respect of the nice values of tasks. | ||
| 311 | |||
| 312 | Since the notion of overutilization largely relies on detecting whether or not | ||
| 313 | there is some idle time in the system, the CPU capacity 'stolen' by higher | ||
| 314 | (than CFS) scheduling classes (as well as IRQ) must be taken into account. As | ||
| 315 | such, the detection of overutilization accounts for the capacity used not only | ||
| 316 | by CFS tasks, but also by the other scheduling classes and IRQ. | ||
| 317 | |||
| 318 | |||
| 319 | 6. Dependencies and requirements for EAS | ||
| 320 | ---------------------------------------- | ||
| 321 | |||
| 322 | Energy Aware Scheduling depends on the CPUs of the system having specific | ||
| 323 | hardware properties and on other features of the kernel being enabled. This | ||
| 324 | section lists these dependencies and provides hints as to how they can be met. | ||
| 325 | |||
| 326 | |||
| 327 | 6.1 - Asymmetric CPU topology | ||
| 328 | |||
| 329 | As mentioned in the introduction, EAS is only supported on platforms with | ||
| 330 | asymmetric CPU topologies for now. This requirement is checked at run-time by | ||
| 331 | looking for the presence of the SD_ASYM_CPUCAPACITY flag when the scheduling | ||
| 332 | domains are built. | ||
| 333 | |||
| 334 | The flag is set/cleared automatically by the scheduler topology code whenever | ||
| 335 | there are CPUs with different capacities in a root domain. The capacities of | ||
| 336 | CPUs are provided by arch-specific code through the arch_scale_cpu_capacity() | ||
| 337 | callback. As an example, arm and arm64 share an implementation of this callback | ||
| 338 | which uses a combination of CPUFreq data and device-tree bindings to compute the | ||
| 339 | capacity of CPUs (see drivers/base/arch_topology.c for more details). | ||
| 340 | |||
| 341 | So, in order to use EAS on your platform your architecture must implement the | ||
| 342 | arch_scale_cpu_capacity() callback, and some of the CPUs must have a lower | ||
| 343 | capacity than others. | ||
| 344 | |||
| 345 | Please note that EAS is not fundamentally incompatible with SMP, but no | ||
| 346 | significant savings on SMP platforms have been observed yet. This restriction | ||
| 347 | could be amended in the future if proven otherwise. | ||
| 348 | |||
| 349 | |||
| 350 | 6.2 - Energy Model presence | ||
| 351 | |||
| 352 | EAS uses the EM of a platform to estimate the impact of scheduling decisions on | ||
| 353 | energy. So, your platform must provide power cost tables to the EM framework in | ||
| 354 | order to make EAS start. To do so, please refer to documentation of the | ||
| 355 | independent EM framework in Documentation/power/energy-model.txt. | ||
| 356 | |||
| 357 | Please also note that the scheduling domains need to be re-built after the | ||
| 358 | EM has been registered in order to start EAS. | ||
| 359 | |||
| 360 | |||
| 361 | 6.3 - Energy Model complexity | ||
| 362 | |||
| 363 | The task wake-up path is very latency-sensitive. When the EM of a platform is | ||
| 364 | too complex (too many CPUs, too many performance domains, too many performance | ||
| 365 | states, ...), the cost of using it in the wake-up path can become prohibitive. | ||
| 366 | The energy-aware wake-up algorithm has a complexity of: | ||
| 367 | |||
| 368 | C = Nd * (Nc + Ns) | ||
| 369 | |||
| 370 | with: Nd the number of performance domains; Nc the number of CPUs; and Ns the | ||
| 371 | total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8). | ||
| 372 | |||
| 373 | A complexity check is performed at the root domain level, when scheduling | ||
| 374 | domains are built. EAS will not start on a root domain if its C happens to be | ||
| 375 | higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the | ||
| 376 | time of writing). | ||
| 377 | |||
| 378 | If you really want to use EAS but the complexity of your platform's Energy | ||
| 379 | Model is too high to be used with a single root domain, you're left with only | ||
| 380 | two possible options: | ||
| 381 | |||
| 382 | 1. split your system into separate, smaller, root domains using exclusive | ||
| 383 | cpusets and enable EAS locally on each of them. This option has the | ||
| 384 | benefit to work out of the box but the drawback of preventing load | ||
| 385 | balance between root domains, which can result in an unbalanced system | ||
| 386 | overall; | ||
| 387 | 2. submit patches to reduce the complexity of the EAS wake-up algorithm, | ||
| 388 | hence enabling it to cope with larger EMs in reasonable time. | ||
| 389 | |||
| 390 | |||
| 391 | 6.4 - Schedutil governor | ||
| 392 | |||
| 393 | EAS tries to predict at which OPP the CPUs will be running in the near future | ||
| 394 | in order to estimate their energy consumption. To do so, it is assumed that OPPs | ||
| 395 | of CPUs follow their utilization. | ||
| 396 | |||
| 397 | Although it is very difficult to provide hard guarantees regarding the accuracy | ||
| 398 | of this assumption in practice (because the hardware might not do what it is | ||
| 399 | told to do, for example), schedutil as opposed to other CPUFreq governors at | ||
| 400 | least _requests_ frequencies calculated using the utilization signals. | ||
| 401 | Consequently, the only sane governor to use together with EAS is schedutil, | ||
| 402 | because it is the only one providing some degree of consistency between | ||
| 403 | frequency requests and energy predictions. | ||
| 404 | |||
| 405 | Using EAS with any other governor than schedutil is not supported. | ||
| 406 | |||
| 407 | |||
| 408 | 6.5 Scale-invariant utilization signals | ||
| 409 | |||
| 410 | In order to make accurate prediction across CPUs and for all performance | ||
| 411 | states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can | ||
| 412 | be obtained using the architecture-defined arch_scale{cpu,freq}_capacity() | ||
| 413 | callbacks. | ||
| 414 | |||
| 415 | Using EAS on a platform that doesn't implement these two callbacks is not | ||
| 416 | supported. | ||
| 417 | |||
| 418 | |||
| 419 | 6.6 Multithreading (SMT) | ||
| 420 | |||
| 421 | EAS in its current form is SMT unaware and is not able to leverage | ||
| 422 | multithreaded hardware to save energy. EAS considers threads as independent | ||
| 423 | CPUs, which can actually be counter-productive for both performance and energy. | ||
| 424 | |||
| 425 | EAS on SMT is not supported. | ||
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c0527d8a468a..379063e58326 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
| @@ -79,6 +79,7 @@ show up in /proc/sys/kernel: | |||
| 79 | - reboot-cmd [ SPARC only ] | 79 | - reboot-cmd [ SPARC only ] |
| 80 | - rtsig-max | 80 | - rtsig-max |
| 81 | - rtsig-nr | 81 | - rtsig-nr |
| 82 | - sched_energy_aware | ||
| 82 | - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst | 83 | - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst |
| 83 | - sem | 84 | - sem |
| 84 | - sem_next_id [ sysv ipc ] | 85 | - sem_next_id [ sysv ipc ] |
| @@ -890,6 +891,17 @@ rtsig-nr shows the number of RT signals currently queued. | |||
| 890 | 891 | ||
| 891 | ============================================================== | 892 | ============================================================== |
| 892 | 893 | ||
| 894 | sched_energy_aware: | ||
| 895 | |||
| 896 | Enables/disables Energy Aware Scheduling (EAS). EAS starts | ||
| 897 | automatically on platforms where it can run (that is, | ||
| 898 | platforms with asymmetric CPU topologies and having an Energy | ||
| 899 | Model available). If your platform happens to meet the | ||
| 900 | requirements for EAS but you do not want to use it, change | ||
| 901 | this value to 0. | ||
| 902 | |||
| 903 | ============================================================== | ||
| 904 | |||
| 893 | sched_schedstats: | 905 | sched_schedstats: |
| 894 | 906 | ||
| 895 | Enables/disables scheduler statistics. Enabling this feature | 907 | Enables/disables scheduler statistics. Enabling this feature |
diff --git a/MAINTAINERS b/MAINTAINERS index 5e5529b9ffc8..366362b16f34 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -12280,14 +12280,6 @@ S: Maintained | |||
| 12280 | F: drivers/net/ppp/pptp.c | 12280 | F: drivers/net/ppp/pptp.c |
| 12281 | W: http://sourceforge.net/projects/accel-pptp | 12281 | W: http://sourceforge.net/projects/accel-pptp |
| 12282 | 12282 | ||
| 12283 | PREEMPTIBLE KERNEL | ||
| 12284 | M: Robert Love <rml@tech9.net> | ||
| 12285 | L: kpreempt-tech@lists.sourceforge.net | ||
| 12286 | W: https://www.kernel.org/pub/linux/kernel/people/rml/preempt-kernel | ||
| 12287 | S: Supported | ||
| 12288 | F: Documentation/preempt-locking.txt | ||
| 12289 | F: include/linux/preempt.h | ||
| 12290 | |||
| 12291 | PRINTK | 12283 | PRINTK |
| 12292 | M: Petr Mladek <pmladek@suse.com> | 12284 | M: Petr Mladek <pmladek@suse.com> |
| 12293 | M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> | 12285 | M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> |
| @@ -13525,6 +13517,7 @@ F: kernel/sched/ | |||
| 13525 | F: include/linux/sched.h | 13517 | F: include/linux/sched.h |
| 13526 | F: include/uapi/linux/sched.h | 13518 | F: include/uapi/linux/sched.h |
| 13527 | F: include/linux/wait.h | 13519 | F: include/linux/wait.h |
| 13520 | F: include/linux/preempt.h | ||
| 13528 | 13521 | ||
| 13529 | SCR24X CHIP CARD INTERFACE DRIVER | 13522 | SCR24X CHIP CARD INTERFACE DRIVER |
| 13530 | M: Lubomir Rintel <lkundrak@v3.sk> | 13523 | M: Lubomir Rintel <lkundrak@v3.sk> |
| @@ -1189,7 +1189,7 @@ no_thread_group: | |||
| 1189 | flush_itimer_signals(); | 1189 | flush_itimer_signals(); |
| 1190 | #endif | 1190 | #endif |
| 1191 | 1191 | ||
| 1192 | if (atomic_read(&oldsighand->count) != 1) { | 1192 | if (refcount_read(&oldsighand->count) != 1) { |
| 1193 | struct sighand_struct *newsighand; | 1193 | struct sighand_struct *newsighand; |
| 1194 | /* | 1194 | /* |
| 1195 | * This ->sighand is shared with the CLONE_SIGHAND | 1195 | * This ->sighand is shared with the CLONE_SIGHAND |
| @@ -1199,7 +1199,7 @@ no_thread_group: | |||
| 1199 | if (!newsighand) | 1199 | if (!newsighand) |
| 1200 | return -ENOMEM; | 1200 | return -ENOMEM; |
| 1201 | 1201 | ||
| 1202 | atomic_set(&newsighand->count, 1); | 1202 | refcount_set(&newsighand->count, 1); |
| 1203 | memcpy(newsighand->action, oldsighand->action, | 1203 | memcpy(newsighand->action, oldsighand->action, |
| 1204 | sizeof(newsighand->action)); | 1204 | sizeof(newsighand->action)); |
| 1205 | 1205 | ||
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0b63d68dedb2..f912872fbf91 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
| @@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
| 64 | else | 64 | else |
| 65 | bytes += kobjsize(current->files); | 65 | bytes += kobjsize(current->files); |
| 66 | 66 | ||
| 67 | if (current->sighand && atomic_read(¤t->sighand->count) > 1) | 67 | if (current->sighand && refcount_read(¤t->sighand->count) > 1) |
| 68 | sbytes += kobjsize(current->sighand); | 68 | sbytes += kobjsize(current->sighand); |
| 69 | else | 69 | else |
| 70 | bytes += kobjsize(current->sighand); | 70 | bytes += kobjsize(current->sighand); |
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a7083a45a26c..6049baa5b8bc 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/securebits.h> | 13 | #include <linux/securebits.h> |
| 14 | #include <linux/seqlock.h> | 14 | #include <linux/seqlock.h> |
| 15 | #include <linux/rbtree.h> | 15 | #include <linux/rbtree.h> |
| 16 | #include <linux/refcount.h> | ||
| 16 | #include <linux/sched/autogroup.h> | 17 | #include <linux/sched/autogroup.h> |
| 17 | #include <net/net_namespace.h> | 18 | #include <net/net_namespace.h> |
| 18 | #include <linux/sched/rt.h> | 19 | #include <linux/sched/rt.h> |
diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 1577a2d56e9d..2c89e60bc752 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h | |||
| @@ -86,7 +86,7 @@ enum { | |||
| 86 | 86 | ||
| 87 | struct kthread_worker { | 87 | struct kthread_worker { |
| 88 | unsigned int flags; | 88 | unsigned int flags; |
| 89 | spinlock_t lock; | 89 | raw_spinlock_t lock; |
| 90 | struct list_head work_list; | 90 | struct list_head work_list; |
| 91 | struct list_head delayed_work_list; | 91 | struct list_head delayed_work_list; |
| 92 | struct task_struct *task; | 92 | struct task_struct *task; |
| @@ -107,7 +107,7 @@ struct kthread_delayed_work { | |||
| 107 | }; | 107 | }; |
| 108 | 108 | ||
| 109 | #define KTHREAD_WORKER_INIT(worker) { \ | 109 | #define KTHREAD_WORKER_INIT(worker) { \ |
| 110 | .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \ | 110 | .lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock), \ |
| 111 | .work_list = LIST_HEAD_INIT((worker).work_list), \ | 111 | .work_list = LIST_HEAD_INIT((worker).work_list), \ |
| 112 | .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ | 112 | .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ |
| 113 | } | 113 | } |
| @@ -165,9 +165,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker, | |||
| 165 | #define kthread_init_delayed_work(dwork, fn) \ | 165 | #define kthread_init_delayed_work(dwork, fn) \ |
| 166 | do { \ | 166 | do { \ |
| 167 | kthread_init_work(&(dwork)->work, (fn)); \ | 167 | kthread_init_work(&(dwork)->work, (fn)); \ |
| 168 | __init_timer(&(dwork)->timer, \ | 168 | timer_setup(&(dwork)->timer, \ |
| 169 | kthread_delayed_work_timer_fn, \ | 169 | kthread_delayed_work_timer_fn, 0); \ |
| 170 | TIMER_IRQSAFE); \ | ||
| 171 | } while (0) | 170 | } while (0) |
| 172 | 171 | ||
| 173 | int kthread_worker_fn(void *worker_ptr); | 172 | int kthread_worker_fn(void *worker_ptr); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 89ddece0b003..903ef29b62c3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/seccomp.h> | 21 | #include <linux/seccomp.h> |
| 22 | #include <linux/nodemask.h> | 22 | #include <linux/nodemask.h> |
| 23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
| 24 | #include <linux/refcount.h> | ||
| 24 | #include <linux/resource.h> | 25 | #include <linux/resource.h> |
| 25 | #include <linux/latencytop.h> | 26 | #include <linux/latencytop.h> |
| 26 | #include <linux/sched/prio.h> | 27 | #include <linux/sched/prio.h> |
| @@ -356,12 +357,6 @@ struct util_est { | |||
| 356 | * For cfs_rq, it is the aggregated load_avg of all runnable and | 357 | * For cfs_rq, it is the aggregated load_avg of all runnable and |
| 357 | * blocked sched_entities. | 358 | * blocked sched_entities. |
| 358 | * | 359 | * |
| 359 | * load_avg may also take frequency scaling into account: | ||
| 360 | * | ||
| 361 | * load_avg = runnable% * scale_load_down(load) * freq% | ||
| 362 | * | ||
| 363 | * where freq% is the CPU frequency normalized to the highest frequency. | ||
| 364 | * | ||
| 365 | * [util_avg definition] | 360 | * [util_avg definition] |
| 366 | * | 361 | * |
| 367 | * util_avg = running% * SCHED_CAPACITY_SCALE | 362 | * util_avg = running% * SCHED_CAPACITY_SCALE |
| @@ -370,17 +365,14 @@ struct util_est { | |||
| 370 | * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable | 365 | * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable |
| 371 | * and blocked sched_entities. | 366 | * and blocked sched_entities. |
| 372 | * | 367 | * |
| 373 | * util_avg may also factor frequency scaling and CPU capacity scaling: | 368 | * load_avg and util_avg don't directly factor frequency scaling and CPU |
| 374 | * | 369 | * capacity scaling. The scaling is done through the rq_clock_pelt that |
| 375 | * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% | 370 | * is used for computing those signals (see update_rq_clock_pelt()) |
| 376 | * | ||
| 377 | * where freq% is the same as above, and capacity% is the CPU capacity | ||
| 378 | * normalized to the greatest capacity (due to uarch differences, etc). | ||
| 379 | * | 371 | * |
| 380 | * N.B., the above ratios (runnable%, running%, freq%, and capacity%) | 372 | * N.B., the above ratios (runnable% and running%) themselves are in the |
| 381 | * themselves are in the range of [0, 1]. To do fixed point arithmetics, | 373 | * range of [0, 1]. To do fixed point arithmetics, we therefore scale them |
| 382 | * we therefore scale them to as large a range as necessary. This is for | 374 | * to as large a range as necessary. This is for example reflected by |
| 383 | * example reflected by util_avg's SCHED_CAPACITY_SCALE. | 375 | * util_avg's SCHED_CAPACITY_SCALE. |
| 384 | * | 376 | * |
| 385 | * [Overflow issue] | 377 | * [Overflow issue] |
| 386 | * | 378 | * |
| @@ -607,7 +599,7 @@ struct task_struct { | |||
| 607 | randomized_struct_fields_start | 599 | randomized_struct_fields_start |
| 608 | 600 | ||
| 609 | void *stack; | 601 | void *stack; |
| 610 | atomic_t usage; | 602 | refcount_t usage; |
| 611 | /* Per task flags (PF_*), defined further below: */ | 603 | /* Per task flags (PF_*), defined further below: */ |
| 612 | unsigned int flags; | 604 | unsigned int flags; |
| 613 | unsigned int ptrace; | 605 | unsigned int ptrace; |
| @@ -1187,7 +1179,7 @@ struct task_struct { | |||
| 1187 | #endif | 1179 | #endif |
| 1188 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 1180 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 1189 | /* A live task holds one reference: */ | 1181 | /* A live task holds one reference: */ |
| 1190 | atomic_t stack_refcount; | 1182 | refcount_t stack_refcount; |
| 1191 | #endif | 1183 | #endif |
| 1192 | #ifdef CONFIG_LIVEPATCH | 1184 | #ifdef CONFIG_LIVEPATCH |
| 1193 | int patch_state; | 1185 | int patch_state; |
| @@ -1403,7 +1395,6 @@ extern struct pid *cad_pid; | |||
| 1403 | #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ | 1395 | #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ |
| 1404 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ | 1396 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ |
| 1405 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ | 1397 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ |
| 1406 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | ||
| 1407 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ | 1398 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ |
| 1408 | #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ | 1399 | #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ |
| 1409 | 1400 | ||
| @@ -1753,9 +1744,9 @@ static __always_inline bool need_resched(void) | |||
| 1753 | static inline unsigned int task_cpu(const struct task_struct *p) | 1744 | static inline unsigned int task_cpu(const struct task_struct *p) |
| 1754 | { | 1745 | { |
| 1755 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 1746 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 1756 | return p->cpu; | 1747 | return READ_ONCE(p->cpu); |
| 1757 | #else | 1748 | #else |
| 1758 | return task_thread_info(p)->cpu; | 1749 | return READ_ONCE(task_thread_info(p)->cpu); |
| 1759 | #endif | 1750 | #endif |
| 1760 | } | 1751 | } |
| 1761 | 1752 | ||
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 13789d10a50e..ae5655197698 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h | |||
| @@ -8,13 +8,14 @@ | |||
| 8 | #include <linux/sched/jobctl.h> | 8 | #include <linux/sched/jobctl.h> |
| 9 | #include <linux/sched/task.h> | 9 | #include <linux/sched/task.h> |
| 10 | #include <linux/cred.h> | 10 | #include <linux/cred.h> |
| 11 | #include <linux/refcount.h> | ||
| 11 | 12 | ||
| 12 | /* | 13 | /* |
| 13 | * Types defining task->signal and task->sighand and APIs using them: | 14 | * Types defining task->signal and task->sighand and APIs using them: |
| 14 | */ | 15 | */ |
| 15 | 16 | ||
| 16 | struct sighand_struct { | 17 | struct sighand_struct { |
| 17 | atomic_t count; | 18 | refcount_t count; |
| 18 | struct k_sigaction action[_NSIG]; | 19 | struct k_sigaction action[_NSIG]; |
| 19 | spinlock_t siglock; | 20 | spinlock_t siglock; |
| 20 | wait_queue_head_t signalfd_wqh; | 21 | wait_queue_head_t signalfd_wqh; |
| @@ -82,7 +83,7 @@ struct multiprocess_signals { | |||
| 82 | * the locking of signal_struct. | 83 | * the locking of signal_struct. |
| 83 | */ | 84 | */ |
| 84 | struct signal_struct { | 85 | struct signal_struct { |
| 85 | atomic_t sigcnt; | 86 | refcount_t sigcnt; |
| 86 | atomic_t live; | 87 | atomic_t live; |
| 87 | int nr_threads; | 88 | int nr_threads; |
| 88 | struct list_head thread_head; | 89 | struct list_head thread_head; |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index a9c32daeb9d8..99ce6d728df7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
| @@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write, | |||
| 83 | void __user *buffer, size_t *lenp, | 83 | void __user *buffer, size_t *lenp, |
| 84 | loff_t *ppos); | 84 | loff_t *ppos); |
| 85 | 85 | ||
| 86 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) | ||
| 87 | extern unsigned int sysctl_sched_energy_aware; | ||
| 88 | extern int sched_energy_aware_handler(struct ctl_table *table, int write, | ||
| 89 | void __user *buffer, size_t *lenp, | ||
| 90 | loff_t *ppos); | ||
| 91 | #endif | ||
| 92 | |||
| 86 | #endif /* _LINUX_SCHED_SYSCTL_H */ | 93 | #endif /* _LINUX_SCHED_SYSCTL_H */ |
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 44c6f15800ff..2e97a2227045 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h | |||
| @@ -88,13 +88,13 @@ extern void sched_exec(void); | |||
| 88 | #define sched_exec() {} | 88 | #define sched_exec() {} |
| 89 | #endif | 89 | #endif |
| 90 | 90 | ||
| 91 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) | 91 | #define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0) |
| 92 | 92 | ||
| 93 | extern void __put_task_struct(struct task_struct *t); | 93 | extern void __put_task_struct(struct task_struct *t); |
| 94 | 94 | ||
| 95 | static inline void put_task_struct(struct task_struct *t) | 95 | static inline void put_task_struct(struct task_struct *t) |
| 96 | { | 96 | { |
| 97 | if (atomic_dec_and_test(&t->usage)) | 97 | if (refcount_dec_and_test(&t->usage)) |
| 98 | __put_task_struct(t); | 98 | __put_task_struct(t); |
| 99 | } | 99 | } |
| 100 | 100 | ||
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index 6a841929073f..2413427e439c 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h | |||
| @@ -61,7 +61,7 @@ static inline unsigned long *end_of_stack(struct task_struct *p) | |||
| 61 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 61 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 62 | static inline void *try_get_task_stack(struct task_struct *tsk) | 62 | static inline void *try_get_task_stack(struct task_struct *tsk) |
| 63 | { | 63 | { |
| 64 | return atomic_inc_not_zero(&tsk->stack_refcount) ? | 64 | return refcount_inc_not_zero(&tsk->stack_refcount) ? |
| 65 | task_stack_page(tsk) : NULL; | 65 | task_stack_page(tsk) : NULL; |
| 66 | } | 66 | } |
| 67 | 67 | ||
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index c31d3a47a47c..57c7ed3fe465 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h | |||
| @@ -176,10 +176,10 @@ typedef int (*sched_domain_flags_f)(void); | |||
| 176 | #define SDTL_OVERLAP 0x01 | 176 | #define SDTL_OVERLAP 0x01 |
| 177 | 177 | ||
| 178 | struct sd_data { | 178 | struct sd_data { |
| 179 | struct sched_domain **__percpu sd; | 179 | struct sched_domain *__percpu *sd; |
| 180 | struct sched_domain_shared **__percpu sds; | 180 | struct sched_domain_shared *__percpu *sds; |
| 181 | struct sched_group **__percpu sg; | 181 | struct sched_group *__percpu *sg; |
| 182 | struct sched_group_capacity **__percpu sgc; | 182 | struct sched_group_capacity *__percpu *sgc; |
| 183 | }; | 183 | }; |
| 184 | 184 | ||
| 185 | struct sched_domain_topology_level { | 185 | struct sched_domain_topology_level { |
diff --git a/include/linux/wait.h b/include/linux/wait.h index ed7c122cb31f..5f3efabc36f4 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h | |||
| @@ -308,7 +308,7 @@ do { \ | |||
| 308 | 308 | ||
| 309 | #define __wait_event_freezable(wq_head, condition) \ | 309 | #define __wait_event_freezable(wq_head, condition) \ |
| 310 | ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ | 310 | ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ |
| 311 | schedule(); try_to_freeze()) | 311 | freezable_schedule()) |
| 312 | 312 | ||
| 313 | /** | 313 | /** |
| 314 | * wait_event_freezable - sleep (or freeze) until a condition gets true | 314 | * wait_event_freezable - sleep (or freeze) until a condition gets true |
| @@ -367,7 +367,7 @@ do { \ | |||
| 367 | #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ | 367 | #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ |
| 368 | ___wait_event(wq_head, ___wait_cond_timeout(condition), \ | 368 | ___wait_event(wq_head, ___wait_cond_timeout(condition), \ |
| 369 | TASK_INTERRUPTIBLE, 0, timeout, \ | 369 | TASK_INTERRUPTIBLE, 0, timeout, \ |
| 370 | __ret = schedule_timeout(__ret); try_to_freeze()) | 370 | __ret = freezable_schedule_timeout(__ret)) |
| 371 | 371 | ||
| 372 | /* | 372 | /* |
| 373 | * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid | 373 | * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid |
| @@ -588,7 +588,7 @@ do { \ | |||
| 588 | 588 | ||
| 589 | #define __wait_event_freezable_exclusive(wq, condition) \ | 589 | #define __wait_event_freezable_exclusive(wq, condition) \ |
| 590 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ | 590 | ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ |
| 591 | schedule(); try_to_freeze()) | 591 | freezable_schedule()) |
| 592 | 592 | ||
| 593 | #define wait_event_freezable_exclusive(wq, condition) \ | 593 | #define wait_event_freezable_exclusive(wq, condition) \ |
| 594 | ({ \ | 594 | ({ \ |
diff --git a/init/init_task.c b/init/init_task.c index 5aebe3be4d7c..46dbf546264d 100644 --- a/init/init_task.c +++ b/init/init_task.c | |||
| @@ -44,7 +44,7 @@ static struct signal_struct init_signals = { | |||
| 44 | }; | 44 | }; |
| 45 | 45 | ||
| 46 | static struct sighand_struct init_sighand = { | 46 | static struct sighand_struct init_sighand = { |
| 47 | .count = ATOMIC_INIT(1), | 47 | .count = REFCOUNT_INIT(1), |
| 48 | .action = { { { .sa_handler = SIG_DFL, } }, }, | 48 | .action = { { { .sa_handler = SIG_DFL, } }, }, |
| 49 | .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), | 49 | .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), |
| 50 | .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), | 50 | .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), |
| @@ -61,11 +61,11 @@ struct task_struct init_task | |||
| 61 | = { | 61 | = { |
| 62 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 62 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 63 | .thread_info = INIT_THREAD_INFO(init_task), | 63 | .thread_info = INIT_THREAD_INFO(init_task), |
| 64 | .stack_refcount = ATOMIC_INIT(1), | 64 | .stack_refcount = REFCOUNT_INIT(1), |
| 65 | #endif | 65 | #endif |
| 66 | .state = 0, | 66 | .state = 0, |
| 67 | .stack = init_stack, | 67 | .stack = init_stack, |
| 68 | .usage = ATOMIC_INIT(2), | 68 | .usage = REFCOUNT_INIT(2), |
| 69 | .flags = PF_KTHREAD, | 69 | .flags = PF_KTHREAD, |
| 70 | .prio = MAX_PRIO - 20, | 70 | .prio = MAX_PRIO - 20, |
| 71 | .static_prio = MAX_PRIO - 20, | 71 | .static_prio = MAX_PRIO - 20, |
diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..77059b211608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -429,7 +429,7 @@ static void release_task_stack(struct task_struct *tsk) | |||
| 429 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 429 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 430 | void put_task_stack(struct task_struct *tsk) | 430 | void put_task_stack(struct task_struct *tsk) |
| 431 | { | 431 | { |
| 432 | if (atomic_dec_and_test(&tsk->stack_refcount)) | 432 | if (refcount_dec_and_test(&tsk->stack_refcount)) |
| 433 | release_task_stack(tsk); | 433 | release_task_stack(tsk); |
| 434 | } | 434 | } |
| 435 | #endif | 435 | #endif |
| @@ -447,7 +447,7 @@ void free_task(struct task_struct *tsk) | |||
| 447 | * If the task had a separate stack allocation, it should be gone | 447 | * If the task had a separate stack allocation, it should be gone |
| 448 | * by now. | 448 | * by now. |
| 449 | */ | 449 | */ |
| 450 | WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); | 450 | WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); |
| 451 | #endif | 451 | #endif |
| 452 | rt_mutex_debug_task_free(tsk); | 452 | rt_mutex_debug_task_free(tsk); |
| 453 | ftrace_graph_exit_task(tsk); | 453 | ftrace_graph_exit_task(tsk); |
| @@ -710,14 +710,14 @@ static inline void free_signal_struct(struct signal_struct *sig) | |||
| 710 | 710 | ||
| 711 | static inline void put_signal_struct(struct signal_struct *sig) | 711 | static inline void put_signal_struct(struct signal_struct *sig) |
| 712 | { | 712 | { |
| 713 | if (atomic_dec_and_test(&sig->sigcnt)) | 713 | if (refcount_dec_and_test(&sig->sigcnt)) |
| 714 | free_signal_struct(sig); | 714 | free_signal_struct(sig); |
| 715 | } | 715 | } |
| 716 | 716 | ||
| 717 | void __put_task_struct(struct task_struct *tsk) | 717 | void __put_task_struct(struct task_struct *tsk) |
| 718 | { | 718 | { |
| 719 | WARN_ON(!tsk->exit_state); | 719 | WARN_ON(!tsk->exit_state); |
| 720 | WARN_ON(atomic_read(&tsk->usage)); | 720 | WARN_ON(refcount_read(&tsk->usage)); |
| 721 | WARN_ON(tsk == current); | 721 | WARN_ON(tsk == current); |
| 722 | 722 | ||
| 723 | cgroup_free(tsk); | 723 | cgroup_free(tsk); |
| @@ -867,7 +867,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 867 | tsk->stack_vm_area = stack_vm_area; | 867 | tsk->stack_vm_area = stack_vm_area; |
| 868 | #endif | 868 | #endif |
| 869 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 869 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 870 | atomic_set(&tsk->stack_refcount, 1); | 870 | refcount_set(&tsk->stack_refcount, 1); |
| 871 | #endif | 871 | #endif |
| 872 | 872 | ||
| 873 | if (err) | 873 | if (err) |
| @@ -896,7 +896,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 896 | * One for us, one for whoever does the "release_task()" (usually | 896 | * One for us, one for whoever does the "release_task()" (usually |
| 897 | * parent) | 897 | * parent) |
| 898 | */ | 898 | */ |
| 899 | atomic_set(&tsk->usage, 2); | 899 | refcount_set(&tsk->usage, 2); |
| 900 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 900 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
| 901 | tsk->btrace_seq = 0; | 901 | tsk->btrace_seq = 0; |
| 902 | #endif | 902 | #endif |
| @@ -1463,7 +1463,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1463 | struct sighand_struct *sig; | 1463 | struct sighand_struct *sig; |
| 1464 | 1464 | ||
| 1465 | if (clone_flags & CLONE_SIGHAND) { | 1465 | if (clone_flags & CLONE_SIGHAND) { |
| 1466 | atomic_inc(¤t->sighand->count); | 1466 | refcount_inc(¤t->sighand->count); |
| 1467 | return 0; | 1467 | return 0; |
| 1468 | } | 1468 | } |
| 1469 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); | 1469 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
| @@ -1471,7 +1471,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1471 | if (!sig) | 1471 | if (!sig) |
| 1472 | return -ENOMEM; | 1472 | return -ENOMEM; |
| 1473 | 1473 | ||
| 1474 | atomic_set(&sig->count, 1); | 1474 | refcount_set(&sig->count, 1); |
| 1475 | spin_lock_irq(¤t->sighand->siglock); | 1475 | spin_lock_irq(¤t->sighand->siglock); |
| 1476 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); | 1476 | memcpy(sig->action, current->sighand->action, sizeof(sig->action)); |
| 1477 | spin_unlock_irq(¤t->sighand->siglock); | 1477 | spin_unlock_irq(¤t->sighand->siglock); |
| @@ -1480,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1480 | 1480 | ||
| 1481 | void __cleanup_sighand(struct sighand_struct *sighand) | 1481 | void __cleanup_sighand(struct sighand_struct *sighand) |
| 1482 | { | 1482 | { |
| 1483 | if (atomic_dec_and_test(&sighand->count)) { | 1483 | if (refcount_dec_and_test(&sighand->count)) { |
| 1484 | signalfd_cleanup(sighand); | 1484 | signalfd_cleanup(sighand); |
| 1485 | /* | 1485 | /* |
| 1486 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it | 1486 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
| @@ -1527,7 +1527,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1527 | 1527 | ||
| 1528 | sig->nr_threads = 1; | 1528 | sig->nr_threads = 1; |
| 1529 | atomic_set(&sig->live, 1); | 1529 | atomic_set(&sig->live, 1); |
| 1530 | atomic_set(&sig->sigcnt, 1); | 1530 | refcount_set(&sig->sigcnt, 1); |
| 1531 | 1531 | ||
| 1532 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ | 1532 | /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ |
| 1533 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); | 1533 | sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); |
| @@ -2082,7 +2082,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2082 | } else { | 2082 | } else { |
| 2083 | current->signal->nr_threads++; | 2083 | current->signal->nr_threads++; |
| 2084 | atomic_inc(¤t->signal->live); | 2084 | atomic_inc(¤t->signal->live); |
| 2085 | atomic_inc(¤t->signal->sigcnt); | 2085 | refcount_inc(¤t->signal->sigcnt); |
| 2086 | task_join_group_stop(p); | 2086 | task_join_group_stop(p); |
| 2087 | list_add_tail_rcu(&p->thread_group, | 2087 | list_add_tail_rcu(&p->thread_group, |
| 2088 | &p->group_leader->thread_group); | 2088 | &p->group_leader->thread_group); |
| @@ -2439,7 +2439,7 @@ static int check_unshare_flags(unsigned long unshare_flags) | |||
| 2439 | return -EINVAL; | 2439 | return -EINVAL; |
| 2440 | } | 2440 | } |
| 2441 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { | 2441 | if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { |
| 2442 | if (atomic_read(¤t->sighand->count) > 1) | 2442 | if (refcount_read(¤t->sighand->count) > 1) |
| 2443 | return -EINVAL; | 2443 | return -EINVAL; |
| 2444 | } | 2444 | } |
| 2445 | if (unshare_flags & CLONE_VM) { | 2445 | if (unshare_flags & CLONE_VM) { |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 65234c89d85b..9cf20cc5ebe3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -605,7 +605,7 @@ void __kthread_init_worker(struct kthread_worker *worker, | |||
| 605 | struct lock_class_key *key) | 605 | struct lock_class_key *key) |
| 606 | { | 606 | { |
| 607 | memset(worker, 0, sizeof(struct kthread_worker)); | 607 | memset(worker, 0, sizeof(struct kthread_worker)); |
| 608 | spin_lock_init(&worker->lock); | 608 | raw_spin_lock_init(&worker->lock); |
| 609 | lockdep_set_class_and_name(&worker->lock, key, name); | 609 | lockdep_set_class_and_name(&worker->lock, key, name); |
| 610 | INIT_LIST_HEAD(&worker->work_list); | 610 | INIT_LIST_HEAD(&worker->work_list); |
| 611 | INIT_LIST_HEAD(&worker->delayed_work_list); | 611 | INIT_LIST_HEAD(&worker->delayed_work_list); |
| @@ -647,21 +647,21 @@ repeat: | |||
| 647 | 647 | ||
| 648 | if (kthread_should_stop()) { | 648 | if (kthread_should_stop()) { |
| 649 | __set_current_state(TASK_RUNNING); | 649 | __set_current_state(TASK_RUNNING); |
| 650 | spin_lock_irq(&worker->lock); | 650 | raw_spin_lock_irq(&worker->lock); |
| 651 | worker->task = NULL; | 651 | worker->task = NULL; |
| 652 | spin_unlock_irq(&worker->lock); | 652 | raw_spin_unlock_irq(&worker->lock); |
| 653 | return 0; | 653 | return 0; |
| 654 | } | 654 | } |
| 655 | 655 | ||
| 656 | work = NULL; | 656 | work = NULL; |
| 657 | spin_lock_irq(&worker->lock); | 657 | raw_spin_lock_irq(&worker->lock); |
| 658 | if (!list_empty(&worker->work_list)) { | 658 | if (!list_empty(&worker->work_list)) { |
| 659 | work = list_first_entry(&worker->work_list, | 659 | work = list_first_entry(&worker->work_list, |
| 660 | struct kthread_work, node); | 660 | struct kthread_work, node); |
| 661 | list_del_init(&work->node); | 661 | list_del_init(&work->node); |
| 662 | } | 662 | } |
| 663 | worker->current_work = work; | 663 | worker->current_work = work; |
| 664 | spin_unlock_irq(&worker->lock); | 664 | raw_spin_unlock_irq(&worker->lock); |
| 665 | 665 | ||
| 666 | if (work) { | 666 | if (work) { |
| 667 | __set_current_state(TASK_RUNNING); | 667 | __set_current_state(TASK_RUNNING); |
| @@ -818,12 +818,12 @@ bool kthread_queue_work(struct kthread_worker *worker, | |||
| 818 | bool ret = false; | 818 | bool ret = false; |
| 819 | unsigned long flags; | 819 | unsigned long flags; |
| 820 | 820 | ||
| 821 | spin_lock_irqsave(&worker->lock, flags); | 821 | raw_spin_lock_irqsave(&worker->lock, flags); |
| 822 | if (!queuing_blocked(worker, work)) { | 822 | if (!queuing_blocked(worker, work)) { |
| 823 | kthread_insert_work(worker, work, &worker->work_list); | 823 | kthread_insert_work(worker, work, &worker->work_list); |
| 824 | ret = true; | 824 | ret = true; |
| 825 | } | 825 | } |
| 826 | spin_unlock_irqrestore(&worker->lock, flags); | 826 | raw_spin_unlock_irqrestore(&worker->lock, flags); |
| 827 | return ret; | 827 | return ret; |
| 828 | } | 828 | } |
| 829 | EXPORT_SYMBOL_GPL(kthread_queue_work); | 829 | EXPORT_SYMBOL_GPL(kthread_queue_work); |
| @@ -841,6 +841,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) | |||
| 841 | struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); | 841 | struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); |
| 842 | struct kthread_work *work = &dwork->work; | 842 | struct kthread_work *work = &dwork->work; |
| 843 | struct kthread_worker *worker = work->worker; | 843 | struct kthread_worker *worker = work->worker; |
| 844 | unsigned long flags; | ||
| 844 | 845 | ||
| 845 | /* | 846 | /* |
| 846 | * This might happen when a pending work is reinitialized. | 847 | * This might happen when a pending work is reinitialized. |
| @@ -849,7 +850,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) | |||
| 849 | if (WARN_ON_ONCE(!worker)) | 850 | if (WARN_ON_ONCE(!worker)) |
| 850 | return; | 851 | return; |
| 851 | 852 | ||
| 852 | spin_lock(&worker->lock); | 853 | raw_spin_lock_irqsave(&worker->lock, flags); |
| 853 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | 854 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ |
| 854 | WARN_ON_ONCE(work->worker != worker); | 855 | WARN_ON_ONCE(work->worker != worker); |
| 855 | 856 | ||
| @@ -858,7 +859,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) | |||
| 858 | list_del_init(&work->node); | 859 | list_del_init(&work->node); |
| 859 | kthread_insert_work(worker, work, &worker->work_list); | 860 | kthread_insert_work(worker, work, &worker->work_list); |
| 860 | 861 | ||
| 861 | spin_unlock(&worker->lock); | 862 | raw_spin_unlock_irqrestore(&worker->lock, flags); |
| 862 | } | 863 | } |
| 863 | EXPORT_SYMBOL(kthread_delayed_work_timer_fn); | 864 | EXPORT_SYMBOL(kthread_delayed_work_timer_fn); |
| 864 | 865 | ||
| @@ -914,14 +915,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, | |||
| 914 | unsigned long flags; | 915 | unsigned long flags; |
| 915 | bool ret = false; | 916 | bool ret = false; |
| 916 | 917 | ||
| 917 | spin_lock_irqsave(&worker->lock, flags); | 918 | raw_spin_lock_irqsave(&worker->lock, flags); |
| 918 | 919 | ||
| 919 | if (!queuing_blocked(worker, work)) { | 920 | if (!queuing_blocked(worker, work)) { |
| 920 | __kthread_queue_delayed_work(worker, dwork, delay); | 921 | __kthread_queue_delayed_work(worker, dwork, delay); |
| 921 | ret = true; | 922 | ret = true; |
| 922 | } | 923 | } |
| 923 | 924 | ||
| 924 | spin_unlock_irqrestore(&worker->lock, flags); | 925 | raw_spin_unlock_irqrestore(&worker->lock, flags); |
| 925 | return ret; | 926 | return ret; |
| 926 | } | 927 | } |
| 927 | EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); | 928 | EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); |
| @@ -957,7 +958,7 @@ void kthread_flush_work(struct kthread_work *work) | |||
| 957 | if (!worker) | 958 | if (!worker) |
| 958 | return; | 959 | return; |
| 959 | 960 | ||
| 960 | spin_lock_irq(&worker->lock); | 961 | raw_spin_lock_irq(&worker->lock); |
| 961 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | 962 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ |
| 962 | WARN_ON_ONCE(work->worker != worker); | 963 | WARN_ON_ONCE(work->worker != worker); |
| 963 | 964 | ||
| @@ -969,7 +970,7 @@ void kthread_flush_work(struct kthread_work *work) | |||
| 969 | else | 970 | else |
| 970 | noop = true; | 971 | noop = true; |
| 971 | 972 | ||
| 972 | spin_unlock_irq(&worker->lock); | 973 | raw_spin_unlock_irq(&worker->lock); |
| 973 | 974 | ||
| 974 | if (!noop) | 975 | if (!noop) |
| 975 | wait_for_completion(&fwork.done); | 976 | wait_for_completion(&fwork.done); |
| @@ -1002,9 +1003,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, | |||
| 1002 | * any queuing is blocked by setting the canceling counter. | 1003 | * any queuing is blocked by setting the canceling counter. |
| 1003 | */ | 1004 | */ |
| 1004 | work->canceling++; | 1005 | work->canceling++; |
| 1005 | spin_unlock_irqrestore(&worker->lock, *flags); | 1006 | raw_spin_unlock_irqrestore(&worker->lock, *flags); |
| 1006 | del_timer_sync(&dwork->timer); | 1007 | del_timer_sync(&dwork->timer); |
| 1007 | spin_lock_irqsave(&worker->lock, *flags); | 1008 | raw_spin_lock_irqsave(&worker->lock, *flags); |
| 1008 | work->canceling--; | 1009 | work->canceling--; |
| 1009 | } | 1010 | } |
| 1010 | 1011 | ||
| @@ -1051,7 +1052,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, | |||
| 1051 | unsigned long flags; | 1052 | unsigned long flags; |
| 1052 | int ret = false; | 1053 | int ret = false; |
| 1053 | 1054 | ||
| 1054 | spin_lock_irqsave(&worker->lock, flags); | 1055 | raw_spin_lock_irqsave(&worker->lock, flags); |
| 1055 | 1056 | ||
| 1056 | /* Do not bother with canceling when never queued. */ | 1057 | /* Do not bother with canceling when never queued. */ |
| 1057 | if (!work->worker) | 1058 | if (!work->worker) |
| @@ -1068,7 +1069,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, | |||
| 1068 | fast_queue: | 1069 | fast_queue: |
| 1069 | __kthread_queue_delayed_work(worker, dwork, delay); | 1070 | __kthread_queue_delayed_work(worker, dwork, delay); |
| 1070 | out: | 1071 | out: |
| 1071 | spin_unlock_irqrestore(&worker->lock, flags); | 1072 | raw_spin_unlock_irqrestore(&worker->lock, flags); |
| 1072 | return ret; | 1073 | return ret; |
| 1073 | } | 1074 | } |
| 1074 | EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); | 1075 | EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); |
| @@ -1082,7 +1083,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) | |||
| 1082 | if (!worker) | 1083 | if (!worker) |
| 1083 | goto out; | 1084 | goto out; |
| 1084 | 1085 | ||
| 1085 | spin_lock_irqsave(&worker->lock, flags); | 1086 | raw_spin_lock_irqsave(&worker->lock, flags); |
| 1086 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ | 1087 | /* Work must not be used with >1 worker, see kthread_queue_work(). */ |
| 1087 | WARN_ON_ONCE(work->worker != worker); | 1088 | WARN_ON_ONCE(work->worker != worker); |
| 1088 | 1089 | ||
| @@ -1096,13 +1097,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) | |||
| 1096 | * In the meantime, block any queuing by setting the canceling counter. | 1097 | * In the meantime, block any queuing by setting the canceling counter. |
| 1097 | */ | 1098 | */ |
| 1098 | work->canceling++; | 1099 | work->canceling++; |
| 1099 | spin_unlock_irqrestore(&worker->lock, flags); | 1100 | raw_spin_unlock_irqrestore(&worker->lock, flags); |
| 1100 | kthread_flush_work(work); | 1101 | kthread_flush_work(work); |
| 1101 | spin_lock_irqsave(&worker->lock, flags); | 1102 | raw_spin_lock_irqsave(&worker->lock, flags); |
| 1102 | work->canceling--; | 1103 | work->canceling--; |
| 1103 | 1104 | ||
| 1104 | out_fast: | 1105 | out_fast: |
| 1105 | spin_unlock_irqrestore(&worker->lock, flags); | 1106 | raw_spin_unlock_irqrestore(&worker->lock, flags); |
| 1106 | out: | 1107 | out: |
| 1107 | return ret; | 1108 | return ret; |
| 1108 | } | 1109 | } |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0002995570db..f3901b84d217 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
| 107 | * [L] ->on_rq | 107 | * [L] ->on_rq |
| 108 | * RELEASE (rq->lock) | 108 | * RELEASE (rq->lock) |
| 109 | * | 109 | * |
| 110 | * If we observe the old CPU in task_rq_lock, the acquire of | 110 | * If we observe the old CPU in task_rq_lock(), the acquire of |
| 111 | * the old rq->lock will fully serialize against the stores. | 111 | * the old rq->lock will fully serialize against the stores. |
| 112 | * | 112 | * |
| 113 | * If we observe the new CPU in task_rq_lock, the acquire will | 113 | * If we observe the new CPU in task_rq_lock(), the address |
| 114 | * pair with the WMB to ensure we must then also see migrating. | 114 | * dependency headed by '[L] rq = task_rq()' and the acquire |
| 115 | * will pair with the WMB to ensure we then also see migrating. | ||
| 115 | */ | 116 | */ |
| 116 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 117 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
| 117 | rq_pin_lock(rq, rf); | 118 | rq_pin_lock(rq, rf); |
| @@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
| 180 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | 181 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) |
| 181 | update_irq_load_avg(rq, irq_delta + steal); | 182 | update_irq_load_avg(rq, irq_delta + steal); |
| 182 | #endif | 183 | #endif |
| 184 | update_rq_clock_pelt(rq, delta); | ||
| 183 | } | 185 | } |
| 184 | 186 | ||
| 185 | void update_rq_clock(struct rq *rq) | 187 | void update_rq_clock(struct rq *rq) |
| @@ -956,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, | |||
| 956 | { | 958 | { |
| 957 | lockdep_assert_held(&rq->lock); | 959 | lockdep_assert_held(&rq->lock); |
| 958 | 960 | ||
| 959 | p->on_rq = TASK_ON_RQ_MIGRATING; | 961 | WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); |
| 960 | dequeue_task(rq, p, DEQUEUE_NOCLOCK); | 962 | dequeue_task(rq, p, DEQUEUE_NOCLOCK); |
| 961 | set_task_cpu(p, new_cpu); | 963 | set_task_cpu(p, new_cpu); |
| 962 | rq_unlock(rq, rf); | 964 | rq_unlock(rq, rf); |
| @@ -2459,7 +2461,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2459 | #endif | 2461 | #endif |
| 2460 | rq = __task_rq_lock(p, &rf); | 2462 | rq = __task_rq_lock(p, &rf); |
| 2461 | update_rq_clock(rq); | 2463 | update_rq_clock(rq); |
| 2462 | post_init_entity_util_avg(&p->se); | 2464 | post_init_entity_util_avg(p); |
| 2463 | 2465 | ||
| 2464 | activate_task(rq, p, ENQUEUE_NOCLOCK); | 2466 | activate_task(rq, p, ENQUEUE_NOCLOCK); |
| 2465 | p->on_rq = TASK_ON_RQ_QUEUED; | 2467 | p->on_rq = TASK_ON_RQ_QUEUED; |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb8b7b5d745d..6a73e41a2016 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
| 1767 | deadline_queue_push_tasks(rq); | 1767 | deadline_queue_push_tasks(rq); |
| 1768 | 1768 | ||
| 1769 | if (rq->curr->sched_class != &dl_sched_class) | 1769 | if (rq->curr->sched_class != &dl_sched_class) |
| 1770 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); | 1770 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); |
| 1771 | 1771 | ||
| 1772 | return p; | 1772 | return p; |
| 1773 | } | 1773 | } |
| @@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | |||
| 1776 | { | 1776 | { |
| 1777 | update_curr_dl(rq); | 1777 | update_curr_dl(rq); |
| 1778 | 1778 | ||
| 1779 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); | 1779 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); |
| 1780 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) | 1780 | if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) |
| 1781 | enqueue_pushable_dl_task(rq, p); | 1781 | enqueue_pushable_dl_task(rq, p); |
| 1782 | } | 1782 | } |
| @@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | |||
| 1793 | { | 1793 | { |
| 1794 | update_curr_dl(rq); | 1794 | update_curr_dl(rq); |
| 1795 | 1795 | ||
| 1796 | update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); | 1796 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); |
| 1797 | /* | 1797 | /* |
| 1798 | * Even when we have runtime, update_curr_dl() might have resulted in us | 1798 | * Even when we have runtime, update_curr_dl() might have resulted in us |
| 1799 | * not being the leftmost task anymore. In that case NEED_RESCHED will | 1799 | * not being the leftmost task anymore. In that case NEED_RESCHED will |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index de3de997e245..8039d62ae36e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void) | |||
| 315 | { | 315 | { |
| 316 | static struct ctl_table *cpu_entries; | 316 | static struct ctl_table *cpu_entries; |
| 317 | static struct ctl_table **cpu_idx; | 317 | static struct ctl_table **cpu_idx; |
| 318 | static bool init_done = false; | ||
| 318 | char buf[32]; | 319 | char buf[32]; |
| 319 | int i; | 320 | int i; |
| 320 | 321 | ||
| @@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void) | |||
| 344 | if (!cpumask_available(sd_sysctl_cpus)) { | 345 | if (!cpumask_available(sd_sysctl_cpus)) { |
| 345 | if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) | 346 | if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) |
| 346 | return; | 347 | return; |
| 348 | } | ||
| 347 | 349 | ||
| 350 | if (!init_done) { | ||
| 351 | init_done = true; | ||
| 348 | /* init to possible to not have holes in @cpu_entries */ | 352 | /* init to possible to not have holes in @cpu_entries */ |
| 349 | cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); | 353 | cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); |
| 350 | } | 354 | } |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..8213ff6e365d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -248,13 +248,6 @@ const struct sched_class fair_sched_class; | |||
| 248 | */ | 248 | */ |
| 249 | 249 | ||
| 250 | #ifdef CONFIG_FAIR_GROUP_SCHED | 250 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 251 | |||
| 252 | /* cpu runqueue to which this cfs_rq is attached */ | ||
| 253 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | ||
| 254 | { | ||
| 255 | return cfs_rq->rq; | ||
| 256 | } | ||
| 257 | |||
| 258 | static inline struct task_struct *task_of(struct sched_entity *se) | 251 | static inline struct task_struct *task_of(struct sched_entity *se) |
| 259 | { | 252 | { |
| 260 | SCHED_WARN_ON(!entity_is_task(se)); | 253 | SCHED_WARN_ON(!entity_is_task(se)); |
| @@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 282 | return grp->my_q; | 275 | return grp->my_q; |
| 283 | } | 276 | } |
| 284 | 277 | ||
| 285 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 278 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 286 | { | 279 | { |
| 287 | if (!cfs_rq->on_list) { | 280 | struct rq *rq = rq_of(cfs_rq); |
| 288 | struct rq *rq = rq_of(cfs_rq); | 281 | int cpu = cpu_of(rq); |
| 289 | int cpu = cpu_of(rq); | 282 | |
| 283 | if (cfs_rq->on_list) | ||
| 284 | return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; | ||
| 285 | |||
| 286 | cfs_rq->on_list = 1; | ||
| 287 | |||
| 288 | /* | ||
| 289 | * Ensure we either appear before our parent (if already | ||
| 290 | * enqueued) or force our parent to appear after us when it is | ||
| 291 | * enqueued. The fact that we always enqueue bottom-up | ||
| 292 | * reduces this to two cases and a special case for the root | ||
| 293 | * cfs_rq. Furthermore, it also means that we will always reset | ||
| 294 | * tmp_alone_branch either when the branch is connected | ||
| 295 | * to a tree or when we reach the top of the tree | ||
| 296 | */ | ||
| 297 | if (cfs_rq->tg->parent && | ||
| 298 | cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { | ||
| 290 | /* | 299 | /* |
| 291 | * Ensure we either appear before our parent (if already | 300 | * If parent is already on the list, we add the child |
| 292 | * enqueued) or force our parent to appear after us when it is | 301 | * just before. Thanks to circular linked property of |
| 293 | * enqueued. The fact that we always enqueue bottom-up | 302 | * the list, this means to put the child at the tail |
| 294 | * reduces this to two cases and a special case for the root | 303 | * of the list that starts by parent. |
| 295 | * cfs_rq. Furthermore, it also means that we will always reset | ||
| 296 | * tmp_alone_branch either when the branch is connected | ||
| 297 | * to a tree or when we reach the beg of the tree | ||
| 298 | */ | 304 | */ |
| 299 | if (cfs_rq->tg->parent && | 305 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, |
| 300 | cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { | 306 | &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); |
| 301 | /* | 307 | /* |
| 302 | * If parent is already on the list, we add the child | 308 | * The branch is now connected to its tree so we can |
| 303 | * just before. Thanks to circular linked property of | 309 | * reset tmp_alone_branch to the beginning of the |
| 304 | * the list, this means to put the child at the tail | 310 | * list. |
| 305 | * of the list that starts by parent. | 311 | */ |
| 306 | */ | 312 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; |
| 307 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | 313 | return true; |
| 308 | &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); | 314 | } |
| 309 | /* | ||
| 310 | * The branch is now connected to its tree so we can | ||
| 311 | * reset tmp_alone_branch to the beginning of the | ||
| 312 | * list. | ||
| 313 | */ | ||
| 314 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | ||
| 315 | } else if (!cfs_rq->tg->parent) { | ||
| 316 | /* | ||
| 317 | * cfs rq without parent should be put | ||
| 318 | * at the tail of the list. | ||
| 319 | */ | ||
| 320 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
| 321 | &rq->leaf_cfs_rq_list); | ||
| 322 | /* | ||
| 323 | * We have reach the beg of a tree so we can reset | ||
| 324 | * tmp_alone_branch to the beginning of the list. | ||
| 325 | */ | ||
| 326 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | ||
| 327 | } else { | ||
| 328 | /* | ||
| 329 | * The parent has not already been added so we want to | ||
| 330 | * make sure that it will be put after us. | ||
| 331 | * tmp_alone_branch points to the beg of the branch | ||
| 332 | * where we will add parent. | ||
| 333 | */ | ||
| 334 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
| 335 | rq->tmp_alone_branch); | ||
| 336 | /* | ||
| 337 | * update tmp_alone_branch to points to the new beg | ||
| 338 | * of the branch | ||
| 339 | */ | ||
| 340 | rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; | ||
| 341 | } | ||
| 342 | 315 | ||
| 343 | cfs_rq->on_list = 1; | 316 | if (!cfs_rq->tg->parent) { |
| 317 | /* | ||
| 318 | * cfs rq without parent should be put | ||
| 319 | * at the tail of the list. | ||
| 320 | */ | ||
| 321 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
| 322 | &rq->leaf_cfs_rq_list); | ||
| 323 | /* | ||
| 324 | * We have reach the top of a tree so we can reset | ||
| 325 | * tmp_alone_branch to the beginning of the list. | ||
| 326 | */ | ||
| 327 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | ||
| 328 | return true; | ||
| 344 | } | 329 | } |
| 330 | |||
| 331 | /* | ||
| 332 | * The parent has not already been added so we want to | ||
| 333 | * make sure that it will be put after us. | ||
| 334 | * tmp_alone_branch points to the begin of the branch | ||
| 335 | * where we will add parent. | ||
| 336 | */ | ||
| 337 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); | ||
| 338 | /* | ||
| 339 | * update tmp_alone_branch to points to the new begin | ||
| 340 | * of the branch | ||
| 341 | */ | ||
| 342 | rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; | ||
| 343 | return false; | ||
| 345 | } | 344 | } |
| 346 | 345 | ||
| 347 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 346 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 348 | { | 347 | { |
| 349 | if (cfs_rq->on_list) { | 348 | if (cfs_rq->on_list) { |
| 349 | struct rq *rq = rq_of(cfs_rq); | ||
| 350 | |||
| 351 | /* | ||
| 352 | * With cfs_rq being unthrottled/throttled during an enqueue, | ||
| 353 | * it can happen the tmp_alone_branch points the a leaf that | ||
| 354 | * we finally want to del. In this case, tmp_alone_branch moves | ||
| 355 | * to the prev element but it will point to rq->leaf_cfs_rq_list | ||
| 356 | * at the end of the enqueue. | ||
| 357 | */ | ||
| 358 | if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) | ||
| 359 | rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; | ||
| 360 | |||
| 350 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 361 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
| 351 | cfs_rq->on_list = 0; | 362 | cfs_rq->on_list = 0; |
| 352 | } | 363 | } |
| 353 | } | 364 | } |
| 354 | 365 | ||
| 355 | /* Iterate through all leaf cfs_rq's on a runqueue: */ | 366 | static inline void assert_list_leaf_cfs_rq(struct rq *rq) |
| 356 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 367 | { |
| 357 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 368 | SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); |
| 369 | } | ||
| 370 | |||
| 371 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | ||
| 372 | #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ | ||
| 373 | list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ | ||
| 374 | leaf_cfs_rq_list) | ||
| 358 | 375 | ||
| 359 | /* Do the two (enqueued) entities belong to the same group ? */ | 376 | /* Do the two (enqueued) entities belong to the same group ? */ |
| 360 | static inline struct cfs_rq * | 377 | static inline struct cfs_rq * |
| @@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se) | |||
| 410 | return container_of(se, struct task_struct, se); | 427 | return container_of(se, struct task_struct, se); |
| 411 | } | 428 | } |
| 412 | 429 | ||
| 413 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | ||
| 414 | { | ||
| 415 | return container_of(cfs_rq, struct rq, cfs); | ||
| 416 | } | ||
| 417 | |||
| 418 | |||
| 419 | #define for_each_sched_entity(se) \ | 430 | #define for_each_sched_entity(se) \ |
| 420 | for (; se; se = NULL) | 431 | for (; se; se = NULL) |
| 421 | 432 | ||
| @@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 438 | return NULL; | 449 | return NULL; |
| 439 | } | 450 | } |
| 440 | 451 | ||
| 441 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 452 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 442 | { | 453 | { |
| 454 | return true; | ||
| 443 | } | 455 | } |
| 444 | 456 | ||
| 445 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 457 | static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 446 | { | 458 | { |
| 447 | } | 459 | } |
| 448 | 460 | ||
| 449 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 461 | static inline void assert_list_leaf_cfs_rq(struct rq *rq) |
| 450 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 462 | { |
| 463 | } | ||
| 464 | |||
| 465 | #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ | ||
| 466 | for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) | ||
| 451 | 467 | ||
| 452 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | 468 | static inline struct sched_entity *parent_entity(struct sched_entity *se) |
| 453 | { | 469 | { |
| @@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 686 | return calc_delta_fair(sched_slice(cfs_rq, se), se); | 702 | return calc_delta_fair(sched_slice(cfs_rq, se), se); |
| 687 | } | 703 | } |
| 688 | 704 | ||
| 689 | #ifdef CONFIG_SMP | ||
| 690 | #include "pelt.h" | 705 | #include "pelt.h" |
| 691 | #include "sched-pelt.h" | 706 | #ifdef CONFIG_SMP |
| 692 | 707 | ||
| 693 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); | 708 | static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); |
| 694 | static unsigned long task_h_load(struct task_struct *p); | 709 | static unsigned long task_h_load(struct task_struct *p); |
| @@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se); | |||
| 744 | * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) | 759 | * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) |
| 745 | * if util_avg > util_avg_cap. | 760 | * if util_avg > util_avg_cap. |
| 746 | */ | 761 | */ |
| 747 | void post_init_entity_util_avg(struct sched_entity *se) | 762 | void post_init_entity_util_avg(struct task_struct *p) |
| 748 | { | 763 | { |
| 764 | struct sched_entity *se = &p->se; | ||
| 749 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 765 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 750 | struct sched_avg *sa = &se->avg; | 766 | struct sched_avg *sa = &se->avg; |
| 751 | long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); | 767 | long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); |
| @@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 763 | } | 779 | } |
| 764 | } | 780 | } |
| 765 | 781 | ||
| 766 | if (entity_is_task(se)) { | 782 | if (p->sched_class != &fair_sched_class) { |
| 767 | struct task_struct *p = task_of(se); | 783 | /* |
| 768 | if (p->sched_class != &fair_sched_class) { | 784 | * For !fair tasks do: |
| 769 | /* | 785 | * |
| 770 | * For !fair tasks do: | 786 | update_cfs_rq_load_avg(now, cfs_rq); |
| 771 | * | 787 | attach_entity_load_avg(cfs_rq, se, 0); |
| 772 | update_cfs_rq_load_avg(now, cfs_rq); | 788 | switched_from_fair(rq, p); |
| 773 | attach_entity_load_avg(cfs_rq, se, 0); | 789 | * |
| 774 | switched_from_fair(rq, p); | 790 | * such that the next switched_to_fair() has the |
| 775 | * | 791 | * expected state. |
| 776 | * such that the next switched_to_fair() has the | 792 | */ |
| 777 | * expected state. | 793 | se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); |
| 778 | */ | 794 | return; |
| 779 | se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); | ||
| 780 | return; | ||
| 781 | } | ||
| 782 | } | 795 | } |
| 783 | 796 | ||
| 784 | attach_entity_cfs_rq(se); | 797 | attach_entity_cfs_rq(se); |
| @@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
| 788 | void init_entity_runnable_average(struct sched_entity *se) | 801 | void init_entity_runnable_average(struct sched_entity *se) |
| 789 | { | 802 | { |
| 790 | } | 803 | } |
| 791 | void post_init_entity_util_avg(struct sched_entity *se) | 804 | void post_init_entity_util_avg(struct task_struct *p) |
| 792 | { | 805 | { |
| 793 | } | 806 | } |
| 794 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 807 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
| @@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
| 1035 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 1048 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
| 1036 | 1049 | ||
| 1037 | struct numa_group { | 1050 | struct numa_group { |
| 1038 | atomic_t refcount; | 1051 | refcount_t refcount; |
| 1039 | 1052 | ||
| 1040 | spinlock_t lock; /* nr_tasks, tasks */ | 1053 | spinlock_t lock; /* nr_tasks, tasks */ |
| 1041 | int nr_tasks; | 1054 | int nr_tasks; |
| @@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p) | |||
| 1104 | unsigned long shared = group_faults_shared(ng); | 1117 | unsigned long shared = group_faults_shared(ng); |
| 1105 | unsigned long private = group_faults_priv(ng); | 1118 | unsigned long private = group_faults_priv(ng); |
| 1106 | 1119 | ||
| 1107 | period *= atomic_read(&ng->refcount); | 1120 | period *= refcount_read(&ng->refcount); |
| 1108 | period *= shared + 1; | 1121 | period *= shared + 1; |
| 1109 | period /= private + shared + 1; | 1122 | period /= private + shared + 1; |
| 1110 | } | 1123 | } |
| @@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p) | |||
| 1127 | unsigned long private = group_faults_priv(ng); | 1140 | unsigned long private = group_faults_priv(ng); |
| 1128 | unsigned long period = smax; | 1141 | unsigned long period = smax; |
| 1129 | 1142 | ||
| 1130 | period *= atomic_read(&ng->refcount); | 1143 | period *= refcount_read(&ng->refcount); |
| 1131 | period *= shared + 1; | 1144 | period *= shared + 1; |
| 1132 | period /= private + shared + 1; | 1145 | period /= private + shared + 1; |
| 1133 | 1146 | ||
| @@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p) | |||
| 2203 | 2216 | ||
| 2204 | static inline int get_numa_group(struct numa_group *grp) | 2217 | static inline int get_numa_group(struct numa_group *grp) |
| 2205 | { | 2218 | { |
| 2206 | return atomic_inc_not_zero(&grp->refcount); | 2219 | return refcount_inc_not_zero(&grp->refcount); |
| 2207 | } | 2220 | } |
| 2208 | 2221 | ||
| 2209 | static inline void put_numa_group(struct numa_group *grp) | 2222 | static inline void put_numa_group(struct numa_group *grp) |
| 2210 | { | 2223 | { |
| 2211 | if (atomic_dec_and_test(&grp->refcount)) | 2224 | if (refcount_dec_and_test(&grp->refcount)) |
| 2212 | kfree_rcu(grp, rcu); | 2225 | kfree_rcu(grp, rcu); |
| 2213 | } | 2226 | } |
| 2214 | 2227 | ||
| @@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 2229 | if (!grp) | 2242 | if (!grp) |
| 2230 | return; | 2243 | return; |
| 2231 | 2244 | ||
| 2232 | atomic_set(&grp->refcount, 1); | 2245 | refcount_set(&grp->refcount, 1); |
| 2233 | grp->active_nodes = 1; | 2246 | grp->active_nodes = 1; |
| 2234 | grp->max_faults_cpu = 0; | 2247 | grp->max_faults_cpu = 0; |
| 2235 | spin_lock_init(&grp->lock); | 2248 | spin_lock_init(&grp->lock); |
| @@ -3122,7 +3135,7 @@ void set_task_rq_fair(struct sched_entity *se, | |||
| 3122 | p_last_update_time = prev->avg.last_update_time; | 3135 | p_last_update_time = prev->avg.last_update_time; |
| 3123 | n_last_update_time = next->avg.last_update_time; | 3136 | n_last_update_time = next->avg.last_update_time; |
| 3124 | #endif | 3137 | #endif |
| 3125 | __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); | 3138 | __update_load_avg_blocked_se(p_last_update_time, se); |
| 3126 | se->avg.last_update_time = n_last_update_time; | 3139 | se->avg.last_update_time = n_last_update_time; |
| 3127 | } | 3140 | } |
| 3128 | 3141 | ||
| @@ -3257,11 +3270,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf | |||
| 3257 | 3270 | ||
| 3258 | /* | 3271 | /* |
| 3259 | * runnable_sum can't be lower than running_sum | 3272 | * runnable_sum can't be lower than running_sum |
| 3260 | * As running sum is scale with CPU capacity wehreas the runnable sum | 3273 | * Rescale running sum to be in the same range as runnable sum |
| 3261 | * is not we rescale running_sum 1st | 3274 | * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] |
| 3275 | * runnable_sum is in [0 : LOAD_AVG_MAX] | ||
| 3262 | */ | 3276 | */ |
| 3263 | running_sum = se->avg.util_sum / | 3277 | running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; |
| 3264 | arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); | ||
| 3265 | runnable_sum = max(runnable_sum, running_sum); | 3278 | runnable_sum = max(runnable_sum, running_sum); |
| 3266 | 3279 | ||
| 3267 | load_sum = (s64)se_weight(se) * runnable_sum; | 3280 | load_sum = (s64)se_weight(se) * runnable_sum; |
| @@ -3364,7 +3377,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum | |||
| 3364 | 3377 | ||
| 3365 | /** | 3378 | /** |
| 3366 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages | 3379 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages |
| 3367 | * @now: current time, as per cfs_rq_clock_task() | 3380 | * @now: current time, as per cfs_rq_clock_pelt() |
| 3368 | * @cfs_rq: cfs_rq to update | 3381 | * @cfs_rq: cfs_rq to update |
| 3369 | * | 3382 | * |
| 3370 | * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) | 3383 | * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) |
| @@ -3409,7 +3422,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
| 3409 | decayed = 1; | 3422 | decayed = 1; |
| 3410 | } | 3423 | } |
| 3411 | 3424 | ||
| 3412 | decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); | 3425 | decayed |= __update_load_avg_cfs_rq(now, cfs_rq); |
| 3413 | 3426 | ||
| 3414 | #ifndef CONFIG_64BIT | 3427 | #ifndef CONFIG_64BIT |
| 3415 | smp_wmb(); | 3428 | smp_wmb(); |
| @@ -3499,9 +3512,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3499 | /* Update task and its cfs_rq load average */ | 3512 | /* Update task and its cfs_rq load average */ |
| 3500 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 3513 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
| 3501 | { | 3514 | { |
| 3502 | u64 now = cfs_rq_clock_task(cfs_rq); | 3515 | u64 now = cfs_rq_clock_pelt(cfs_rq); |
| 3503 | struct rq *rq = rq_of(cfs_rq); | ||
| 3504 | int cpu = cpu_of(rq); | ||
| 3505 | int decayed; | 3516 | int decayed; |
| 3506 | 3517 | ||
| 3507 | /* | 3518 | /* |
| @@ -3509,7 +3520,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3509 | * track group sched_entity load average for task_h_load calc in migration | 3520 | * track group sched_entity load average for task_h_load calc in migration |
| 3510 | */ | 3521 | */ |
| 3511 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) | 3522 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) |
| 3512 | __update_load_avg_se(now, cpu, cfs_rq, se); | 3523 | __update_load_avg_se(now, cfs_rq, se); |
| 3513 | 3524 | ||
| 3514 | decayed = update_cfs_rq_load_avg(now, cfs_rq); | 3525 | decayed = update_cfs_rq_load_avg(now, cfs_rq); |
| 3515 | decayed |= propagate_entity_load_avg(se); | 3526 | decayed |= propagate_entity_load_avg(se); |
| @@ -3561,7 +3572,7 @@ void sync_entity_load_avg(struct sched_entity *se) | |||
| 3561 | u64 last_update_time; | 3572 | u64 last_update_time; |
| 3562 | 3573 | ||
| 3563 | last_update_time = cfs_rq_last_update_time(cfs_rq); | 3574 | last_update_time = cfs_rq_last_update_time(cfs_rq); |
| 3564 | __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); | 3575 | __update_load_avg_blocked_se(last_update_time, se); |
| 3565 | } | 3576 | } |
| 3566 | 3577 | ||
| 3567 | /* | 3578 | /* |
| @@ -3577,10 +3588,6 @@ void remove_entity_load_avg(struct sched_entity *se) | |||
| 3577 | * tasks cannot exit without having gone through wake_up_new_task() -> | 3588 | * tasks cannot exit without having gone through wake_up_new_task() -> |
| 3578 | * post_init_entity_util_avg() which will have added things to the | 3589 | * post_init_entity_util_avg() which will have added things to the |
| 3579 | * cfs_rq, so we can remove unconditionally. | 3590 | * cfs_rq, so we can remove unconditionally. |
| 3580 | * | ||
| 3581 | * Similarly for groups, they will have passed through | ||
| 3582 | * post_init_entity_util_avg() before unregister_sched_fair_group() | ||
| 3583 | * calls this. | ||
| 3584 | */ | 3591 | */ |
| 3585 | 3592 | ||
| 3586 | sync_entity_load_avg(se); | 3593 | sync_entity_load_avg(se); |
| @@ -3654,6 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | |||
| 3654 | { | 3661 | { |
| 3655 | long last_ewma_diff; | 3662 | long last_ewma_diff; |
| 3656 | struct util_est ue; | 3663 | struct util_est ue; |
| 3664 | int cpu; | ||
| 3657 | 3665 | ||
| 3658 | if (!sched_feat(UTIL_EST)) | 3666 | if (!sched_feat(UTIL_EST)) |
| 3659 | return; | 3667 | return; |
| @@ -3688,6 +3696,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) | |||
| 3688 | return; | 3696 | return; |
| 3689 | 3697 | ||
| 3690 | /* | 3698 | /* |
| 3699 | * To avoid overestimation of actual task utilization, skip updates if | ||
| 3700 | * we cannot grant there is idle time in this CPU. | ||
| 3701 | */ | ||
| 3702 | cpu = cpu_of(rq_of(cfs_rq)); | ||
| 3703 | if (task_util(p) > capacity_orig_of(cpu)) | ||
| 3704 | return; | ||
| 3705 | |||
| 3706 | /* | ||
| 3691 | * Update Task's estimated utilization | 3707 | * Update Task's estimated utilization |
| 3692 | * | 3708 | * |
| 3693 | * When *p completes an activation we can consolidate another sample | 3709 | * When *p completes an activation we can consolidate another sample |
| @@ -4429,6 +4445,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
| 4429 | /* adjust cfs_rq_clock_task() */ | 4445 | /* adjust cfs_rq_clock_task() */ |
| 4430 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - | 4446 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
| 4431 | cfs_rq->throttled_clock_task; | 4447 | cfs_rq->throttled_clock_task; |
| 4448 | |||
| 4449 | /* Add cfs_rq with already running entity in the list */ | ||
| 4450 | if (cfs_rq->nr_running >= 1) | ||
| 4451 | list_add_leaf_cfs_rq(cfs_rq); | ||
| 4432 | } | 4452 | } |
| 4433 | 4453 | ||
| 4434 | return 0; | 4454 | return 0; |
| @@ -4440,8 +4460,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) | |||
| 4440 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | 4460 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; |
| 4441 | 4461 | ||
| 4442 | /* group is entering throttled state, stop time */ | 4462 | /* group is entering throttled state, stop time */ |
| 4443 | if (!cfs_rq->throttle_count) | 4463 | if (!cfs_rq->throttle_count) { |
| 4444 | cfs_rq->throttled_clock_task = rq_clock_task(rq); | 4464 | cfs_rq->throttled_clock_task = rq_clock_task(rq); |
| 4465 | list_del_leaf_cfs_rq(cfs_rq); | ||
| 4466 | } | ||
| 4445 | cfs_rq->throttle_count++; | 4467 | cfs_rq->throttle_count++; |
| 4446 | 4468 | ||
| 4447 | return 0; | 4469 | return 0; |
| @@ -4544,6 +4566,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 4544 | break; | 4566 | break; |
| 4545 | } | 4567 | } |
| 4546 | 4568 | ||
| 4569 | assert_list_leaf_cfs_rq(rq); | ||
| 4570 | |||
| 4547 | if (!se) | 4571 | if (!se) |
| 4548 | add_nr_running(rq, task_delta); | 4572 | add_nr_running(rq, task_delta); |
| 4549 | 4573 | ||
| @@ -4565,7 +4589,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | |||
| 4565 | struct rq *rq = rq_of(cfs_rq); | 4589 | struct rq *rq = rq_of(cfs_rq); |
| 4566 | struct rq_flags rf; | 4590 | struct rq_flags rf; |
| 4567 | 4591 | ||
| 4568 | rq_lock(rq, &rf); | 4592 | rq_lock_irqsave(rq, &rf); |
| 4569 | if (!cfs_rq_throttled(cfs_rq)) | 4593 | if (!cfs_rq_throttled(cfs_rq)) |
| 4570 | goto next; | 4594 | goto next; |
| 4571 | 4595 | ||
| @@ -4582,7 +4606,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | |||
| 4582 | unthrottle_cfs_rq(cfs_rq); | 4606 | unthrottle_cfs_rq(cfs_rq); |
| 4583 | 4607 | ||
| 4584 | next: | 4608 | next: |
| 4585 | rq_unlock(rq, &rf); | 4609 | rq_unlock_irqrestore(rq, &rf); |
| 4586 | 4610 | ||
| 4587 | if (!remaining) | 4611 | if (!remaining) |
| 4588 | break; | 4612 | break; |
| @@ -4598,7 +4622,7 @@ next: | |||
| 4598 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | 4622 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is |
| 4599 | * used to track this state. | 4623 | * used to track this state. |
| 4600 | */ | 4624 | */ |
| 4601 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | 4625 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) |
| 4602 | { | 4626 | { |
| 4603 | u64 runtime, runtime_expires; | 4627 | u64 runtime, runtime_expires; |
| 4604 | int throttled; | 4628 | int throttled; |
| @@ -4640,11 +4664,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
| 4640 | while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { | 4664 | while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { |
| 4641 | runtime = cfs_b->runtime; | 4665 | runtime = cfs_b->runtime; |
| 4642 | cfs_b->distribute_running = 1; | 4666 | cfs_b->distribute_running = 1; |
| 4643 | raw_spin_unlock(&cfs_b->lock); | 4667 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4644 | /* we can't nest cfs_b->lock while distributing bandwidth */ | 4668 | /* we can't nest cfs_b->lock while distributing bandwidth */ |
| 4645 | runtime = distribute_cfs_runtime(cfs_b, runtime, | 4669 | runtime = distribute_cfs_runtime(cfs_b, runtime, |
| 4646 | runtime_expires); | 4670 | runtime_expires); |
| 4647 | raw_spin_lock(&cfs_b->lock); | 4671 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
| 4648 | 4672 | ||
| 4649 | cfs_b->distribute_running = 0; | 4673 | cfs_b->distribute_running = 0; |
| 4650 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 4674 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); |
| @@ -4753,17 +4777,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 4753 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | 4777 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) |
| 4754 | { | 4778 | { |
| 4755 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | 4779 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); |
| 4780 | unsigned long flags; | ||
| 4756 | u64 expires; | 4781 | u64 expires; |
| 4757 | 4782 | ||
| 4758 | /* confirm we're still not at a refresh boundary */ | 4783 | /* confirm we're still not at a refresh boundary */ |
| 4759 | raw_spin_lock(&cfs_b->lock); | 4784 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
| 4760 | if (cfs_b->distribute_running) { | 4785 | if (cfs_b->distribute_running) { |
| 4761 | raw_spin_unlock(&cfs_b->lock); | 4786 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4762 | return; | 4787 | return; |
| 4763 | } | 4788 | } |
| 4764 | 4789 | ||
| 4765 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { | 4790 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { |
| 4766 | raw_spin_unlock(&cfs_b->lock); | 4791 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4767 | return; | 4792 | return; |
| 4768 | } | 4793 | } |
| 4769 | 4794 | ||
| @@ -4774,18 +4799,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
| 4774 | if (runtime) | 4799 | if (runtime) |
| 4775 | cfs_b->distribute_running = 1; | 4800 | cfs_b->distribute_running = 1; |
| 4776 | 4801 | ||
| 4777 | raw_spin_unlock(&cfs_b->lock); | 4802 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4778 | 4803 | ||
| 4779 | if (!runtime) | 4804 | if (!runtime) |
| 4780 | return; | 4805 | return; |
| 4781 | 4806 | ||
| 4782 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | 4807 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); |
| 4783 | 4808 | ||
| 4784 | raw_spin_lock(&cfs_b->lock); | 4809 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
| 4785 | if (expires == cfs_b->runtime_expires) | 4810 | if (expires == cfs_b->runtime_expires) |
| 4786 | lsub_positive(&cfs_b->runtime, runtime); | 4811 | lsub_positive(&cfs_b->runtime, runtime); |
| 4787 | cfs_b->distribute_running = 0; | 4812 | cfs_b->distribute_running = 0; |
| 4788 | raw_spin_unlock(&cfs_b->lock); | 4813 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4789 | } | 4814 | } |
| 4790 | 4815 | ||
| 4791 | /* | 4816 | /* |
| @@ -4863,20 +4888,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
| 4863 | { | 4888 | { |
| 4864 | struct cfs_bandwidth *cfs_b = | 4889 | struct cfs_bandwidth *cfs_b = |
| 4865 | container_of(timer, struct cfs_bandwidth, period_timer); | 4890 | container_of(timer, struct cfs_bandwidth, period_timer); |
| 4891 | unsigned long flags; | ||
| 4866 | int overrun; | 4892 | int overrun; |
| 4867 | int idle = 0; | 4893 | int idle = 0; |
| 4868 | 4894 | ||
| 4869 | raw_spin_lock(&cfs_b->lock); | 4895 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
| 4870 | for (;;) { | 4896 | for (;;) { |
| 4871 | overrun = hrtimer_forward_now(timer, cfs_b->period); | 4897 | overrun = hrtimer_forward_now(timer, cfs_b->period); |
| 4872 | if (!overrun) | 4898 | if (!overrun) |
| 4873 | break; | 4899 | break; |
| 4874 | 4900 | ||
| 4875 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | 4901 | idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); |
| 4876 | } | 4902 | } |
| 4877 | if (idle) | 4903 | if (idle) |
| 4878 | cfs_b->period_active = 0; | 4904 | cfs_b->period_active = 0; |
| 4879 | raw_spin_unlock(&cfs_b->lock); | 4905 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4880 | 4906 | ||
| 4881 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | 4907 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; |
| 4882 | } | 4908 | } |
| @@ -4986,6 +5012,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
| 4986 | } | 5012 | } |
| 4987 | 5013 | ||
| 4988 | #else /* CONFIG_CFS_BANDWIDTH */ | 5014 | #else /* CONFIG_CFS_BANDWIDTH */ |
| 5015 | |||
| 5016 | static inline bool cfs_bandwidth_used(void) | ||
| 5017 | { | ||
| 5018 | return false; | ||
| 5019 | } | ||
| 5020 | |||
| 4989 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | 5021 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) |
| 4990 | { | 5022 | { |
| 4991 | return rq_clock_task(rq_of(cfs_rq)); | 5023 | return rq_clock_task(rq_of(cfs_rq)); |
| @@ -5177,6 +5209,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 5177 | 5209 | ||
| 5178 | } | 5210 | } |
| 5179 | 5211 | ||
| 5212 | if (cfs_bandwidth_used()) { | ||
| 5213 | /* | ||
| 5214 | * When bandwidth control is enabled; the cfs_rq_throttled() | ||
| 5215 | * breaks in the above iteration can result in incomplete | ||
| 5216 | * leaf list maintenance, resulting in triggering the assertion | ||
| 5217 | * below. | ||
| 5218 | */ | ||
| 5219 | for_each_sched_entity(se) { | ||
| 5220 | cfs_rq = cfs_rq_of(se); | ||
| 5221 | |||
| 5222 | if (list_add_leaf_cfs_rq(cfs_rq)) | ||
| 5223 | break; | ||
| 5224 | } | ||
| 5225 | } | ||
| 5226 | |||
| 5227 | assert_list_leaf_cfs_rq(rq); | ||
| 5228 | |||
| 5180 | hrtick_update(rq); | 5229 | hrtick_update(rq); |
| 5181 | } | 5230 | } |
| 5182 | 5231 | ||
| @@ -5556,11 +5605,6 @@ static unsigned long capacity_of(int cpu) | |||
| 5556 | return cpu_rq(cpu)->cpu_capacity; | 5605 | return cpu_rq(cpu)->cpu_capacity; |
| 5557 | } | 5606 | } |
| 5558 | 5607 | ||
| 5559 | static unsigned long capacity_orig_of(int cpu) | ||
| 5560 | { | ||
| 5561 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
| 5562 | } | ||
| 5563 | |||
| 5564 | static unsigned long cpu_avg_load_per_task(int cpu) | 5608 | static unsigned long cpu_avg_load_per_task(int cpu) |
| 5565 | { | 5609 | { |
| 5566 | struct rq *rq = cpu_rq(cpu); | 5610 | struct rq *rq = cpu_rq(cpu); |
| @@ -6053,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int | |||
| 6053 | bool idle = true; | 6097 | bool idle = true; |
| 6054 | 6098 | ||
| 6055 | for_each_cpu(cpu, cpu_smt_mask(core)) { | 6099 | for_each_cpu(cpu, cpu_smt_mask(core)) { |
| 6056 | cpumask_clear_cpu(cpu, cpus); | 6100 | __cpumask_clear_cpu(cpu, cpus); |
| 6057 | if (!available_idle_cpu(cpu)) | 6101 | if (!available_idle_cpu(cpu)) |
| 6058 | idle = false; | 6102 | idle = false; |
| 6059 | } | 6103 | } |
| @@ -6073,7 +6117,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int | |||
| 6073 | /* | 6117 | /* |
| 6074 | * Scan the local SMT mask for idle CPUs. | 6118 | * Scan the local SMT mask for idle CPUs. |
| 6075 | */ | 6119 | */ |
| 6076 | static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | 6120 | static int select_idle_smt(struct task_struct *p, int target) |
| 6077 | { | 6121 | { |
| 6078 | int cpu; | 6122 | int cpu; |
| 6079 | 6123 | ||
| @@ -6097,7 +6141,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s | |||
| 6097 | return -1; | 6141 | return -1; |
| 6098 | } | 6142 | } |
| 6099 | 6143 | ||
| 6100 | static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) | 6144 | static inline int select_idle_smt(struct task_struct *p, int target) |
| 6101 | { | 6145 | { |
| 6102 | return -1; | 6146 | return -1; |
| 6103 | } | 6147 | } |
| @@ -6202,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6202 | if ((unsigned)i < nr_cpumask_bits) | 6246 | if ((unsigned)i < nr_cpumask_bits) |
| 6203 | return i; | 6247 | return i; |
| 6204 | 6248 | ||
| 6205 | i = select_idle_smt(p, sd, target); | 6249 | i = select_idle_smt(p, target); |
| 6206 | if ((unsigned)i < nr_cpumask_bits) | 6250 | if ((unsigned)i < nr_cpumask_bits) |
| 6207 | return i; | 6251 | return i; |
| 6208 | 6252 | ||
| @@ -6608,7 +6652,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6608 | if (sd_flag & SD_BALANCE_WAKE) { | 6652 | if (sd_flag & SD_BALANCE_WAKE) { |
| 6609 | record_wakee(p); | 6653 | record_wakee(p); |
| 6610 | 6654 | ||
| 6611 | if (static_branch_unlikely(&sched_energy_present)) { | 6655 | if (sched_energy_enabled()) { |
| 6612 | new_cpu = find_energy_efficient_cpu(p, prev_cpu); | 6656 | new_cpu = find_energy_efficient_cpu(p, prev_cpu); |
| 6613 | if (new_cpu >= 0) | 6657 | if (new_cpu >= 0) |
| 6614 | return new_cpu; | 6658 | return new_cpu; |
| @@ -7027,6 +7071,12 @@ idle: | |||
| 7027 | if (new_tasks > 0) | 7071 | if (new_tasks > 0) |
| 7028 | goto again; | 7072 | goto again; |
| 7029 | 7073 | ||
| 7074 | /* | ||
| 7075 | * rq is about to be idle, check if we need to update the | ||
| 7076 | * lost_idle_time of clock_pelt | ||
| 7077 | */ | ||
| 7078 | update_idle_rq_clock_pelt(rq); | ||
| 7079 | |||
| 7030 | return NULL; | 7080 | return NULL; |
| 7031 | } | 7081 | } |
| 7032 | 7082 | ||
| @@ -7647,10 +7697,27 @@ static inline bool others_have_blocked(struct rq *rq) | |||
| 7647 | 7697 | ||
| 7648 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7698 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7649 | 7699 | ||
| 7700 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | ||
| 7701 | { | ||
| 7702 | if (cfs_rq->load.weight) | ||
| 7703 | return false; | ||
| 7704 | |||
| 7705 | if (cfs_rq->avg.load_sum) | ||
| 7706 | return false; | ||
| 7707 | |||
| 7708 | if (cfs_rq->avg.util_sum) | ||
| 7709 | return false; | ||
| 7710 | |||
| 7711 | if (cfs_rq->avg.runnable_load_sum) | ||
| 7712 | return false; | ||
| 7713 | |||
| 7714 | return true; | ||
| 7715 | } | ||
| 7716 | |||
| 7650 | static void update_blocked_averages(int cpu) | 7717 | static void update_blocked_averages(int cpu) |
| 7651 | { | 7718 | { |
| 7652 | struct rq *rq = cpu_rq(cpu); | 7719 | struct rq *rq = cpu_rq(cpu); |
| 7653 | struct cfs_rq *cfs_rq; | 7720 | struct cfs_rq *cfs_rq, *pos; |
| 7654 | const struct sched_class *curr_class; | 7721 | const struct sched_class *curr_class; |
| 7655 | struct rq_flags rf; | 7722 | struct rq_flags rf; |
| 7656 | bool done = true; | 7723 | bool done = true; |
| @@ -7662,14 +7729,10 @@ static void update_blocked_averages(int cpu) | |||
| 7662 | * Iterates the task_group tree in a bottom up fashion, see | 7729 | * Iterates the task_group tree in a bottom up fashion, see |
| 7663 | * list_add_leaf_cfs_rq() for details. | 7730 | * list_add_leaf_cfs_rq() for details. |
| 7664 | */ | 7731 | */ |
| 7665 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 7732 | for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { |
| 7666 | struct sched_entity *se; | 7733 | struct sched_entity *se; |
| 7667 | 7734 | ||
| 7668 | /* throttled entities do not contribute to load */ | 7735 | if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) |
| 7669 | if (throttled_hierarchy(cfs_rq)) | ||
| 7670 | continue; | ||
| 7671 | |||
| 7672 | if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) | ||
| 7673 | update_tg_load_avg(cfs_rq, 0); | 7736 | update_tg_load_avg(cfs_rq, 0); |
| 7674 | 7737 | ||
| 7675 | /* Propagate pending load changes to the parent, if any: */ | 7738 | /* Propagate pending load changes to the parent, if any: */ |
| @@ -7677,14 +7740,21 @@ static void update_blocked_averages(int cpu) | |||
| 7677 | if (se && !skip_blocked_update(se)) | 7740 | if (se && !skip_blocked_update(se)) |
| 7678 | update_load_avg(cfs_rq_of(se), se, 0); | 7741 | update_load_avg(cfs_rq_of(se), se, 0); |
| 7679 | 7742 | ||
| 7743 | /* | ||
| 7744 | * There can be a lot of idle CPU cgroups. Don't let fully | ||
| 7745 | * decayed cfs_rqs linger on the list. | ||
| 7746 | */ | ||
| 7747 | if (cfs_rq_is_decayed(cfs_rq)) | ||
| 7748 | list_del_leaf_cfs_rq(cfs_rq); | ||
| 7749 | |||
| 7680 | /* Don't need periodic decay once load/util_avg are null */ | 7750 | /* Don't need periodic decay once load/util_avg are null */ |
| 7681 | if (cfs_rq_has_blocked(cfs_rq)) | 7751 | if (cfs_rq_has_blocked(cfs_rq)) |
| 7682 | done = false; | 7752 | done = false; |
| 7683 | } | 7753 | } |
| 7684 | 7754 | ||
| 7685 | curr_class = rq->curr->sched_class; | 7755 | curr_class = rq->curr->sched_class; |
| 7686 | update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); | 7756 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); |
| 7687 | update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); | 7757 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); |
| 7688 | update_irq_load_avg(rq, 0); | 7758 | update_irq_load_avg(rq, 0); |
| 7689 | /* Don't need periodic decay once load/util_avg are null */ | 7759 | /* Don't need periodic decay once load/util_avg are null */ |
| 7690 | if (others_have_blocked(rq)) | 7760 | if (others_have_blocked(rq)) |
| @@ -7754,11 +7824,11 @@ static inline void update_blocked_averages(int cpu) | |||
| 7754 | 7824 | ||
| 7755 | rq_lock_irqsave(rq, &rf); | 7825 | rq_lock_irqsave(rq, &rf); |
| 7756 | update_rq_clock(rq); | 7826 | update_rq_clock(rq); |
| 7757 | update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); | 7827 | update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); |
| 7758 | 7828 | ||
| 7759 | curr_class = rq->curr->sched_class; | 7829 | curr_class = rq->curr->sched_class; |
| 7760 | update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); | 7830 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); |
| 7761 | update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); | 7831 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); |
| 7762 | update_irq_load_avg(rq, 0); | 7832 | update_irq_load_avg(rq, 0); |
| 7763 | #ifdef CONFIG_NO_HZ_COMMON | 7833 | #ifdef CONFIG_NO_HZ_COMMON |
| 7764 | rq->last_blocked_load_update_tick = jiffies; | 7834 | rq->last_blocked_load_update_tick = jiffies; |
| @@ -8452,9 +8522,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 8452 | if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) | 8522 | if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) |
| 8453 | return 0; | 8523 | return 0; |
| 8454 | 8524 | ||
| 8455 | env->imbalance = DIV_ROUND_CLOSEST( | 8525 | env->imbalance = sds->busiest_stat.group_load; |
| 8456 | sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, | ||
| 8457 | SCHED_CAPACITY_SCALE); | ||
| 8458 | 8526 | ||
| 8459 | return 1; | 8527 | return 1; |
| 8460 | } | 8528 | } |
| @@ -8636,7 +8704,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 8636 | */ | 8704 | */ |
| 8637 | update_sd_lb_stats(env, &sds); | 8705 | update_sd_lb_stats(env, &sds); |
| 8638 | 8706 | ||
| 8639 | if (static_branch_unlikely(&sched_energy_present)) { | 8707 | if (sched_energy_enabled()) { |
| 8640 | struct root_domain *rd = env->dst_rq->rd; | 8708 | struct root_domain *rd = env->dst_rq->rd; |
| 8641 | 8709 | ||
| 8642 | if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) | 8710 | if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) |
| @@ -8827,21 +8895,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 8827 | */ | 8895 | */ |
| 8828 | #define MAX_PINNED_INTERVAL 512 | 8896 | #define MAX_PINNED_INTERVAL 512 |
| 8829 | 8897 | ||
| 8830 | static int need_active_balance(struct lb_env *env) | 8898 | static inline bool |
| 8899 | asym_active_balance(struct lb_env *env) | ||
| 8831 | { | 8900 | { |
| 8832 | struct sched_domain *sd = env->sd; | 8901 | /* |
| 8902 | * ASYM_PACKING needs to force migrate tasks from busy but | ||
| 8903 | * lower priority CPUs in order to pack all tasks in the | ||
| 8904 | * highest priority CPUs. | ||
| 8905 | */ | ||
| 8906 | return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && | ||
| 8907 | sched_asym_prefer(env->dst_cpu, env->src_cpu); | ||
| 8908 | } | ||
| 8833 | 8909 | ||
| 8834 | if (env->idle == CPU_NEWLY_IDLE) { | 8910 | static inline bool |
| 8911 | voluntary_active_balance(struct lb_env *env) | ||
| 8912 | { | ||
| 8913 | struct sched_domain *sd = env->sd; | ||
| 8835 | 8914 | ||
| 8836 | /* | 8915 | if (asym_active_balance(env)) |
| 8837 | * ASYM_PACKING needs to force migrate tasks from busy but | 8916 | return 1; |
| 8838 | * lower priority CPUs in order to pack all tasks in the | ||
| 8839 | * highest priority CPUs. | ||
| 8840 | */ | ||
| 8841 | if ((sd->flags & SD_ASYM_PACKING) && | ||
| 8842 | sched_asym_prefer(env->dst_cpu, env->src_cpu)) | ||
| 8843 | return 1; | ||
| 8844 | } | ||
| 8845 | 8917 | ||
| 8846 | /* | 8918 | /* |
| 8847 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | 8919 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. |
| @@ -8859,6 +8931,16 @@ static int need_active_balance(struct lb_env *env) | |||
| 8859 | if (env->src_grp_type == group_misfit_task) | 8931 | if (env->src_grp_type == group_misfit_task) |
| 8860 | return 1; | 8932 | return 1; |
| 8861 | 8933 | ||
| 8934 | return 0; | ||
| 8935 | } | ||
| 8936 | |||
| 8937 | static int need_active_balance(struct lb_env *env) | ||
| 8938 | { | ||
| 8939 | struct sched_domain *sd = env->sd; | ||
| 8940 | |||
| 8941 | if (voluntary_active_balance(env)) | ||
| 8942 | return 1; | ||
| 8943 | |||
| 8862 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 8944 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); |
| 8863 | } | 8945 | } |
| 8864 | 8946 | ||
| @@ -9023,7 +9105,7 @@ more_balance: | |||
| 9023 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { | 9105 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
| 9024 | 9106 | ||
| 9025 | /* Prevent to re-select dst_cpu via env's CPUs */ | 9107 | /* Prevent to re-select dst_cpu via env's CPUs */ |
| 9026 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | 9108 | __cpumask_clear_cpu(env.dst_cpu, env.cpus); |
| 9027 | 9109 | ||
| 9028 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 9110 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| 9029 | env.dst_cpu = env.new_dst_cpu; | 9111 | env.dst_cpu = env.new_dst_cpu; |
| @@ -9050,7 +9132,7 @@ more_balance: | |||
| 9050 | 9132 | ||
| 9051 | /* All tasks on this runqueue were pinned by CPU affinity */ | 9133 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 9052 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 9134 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
| 9053 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 9135 | __cpumask_clear_cpu(cpu_of(busiest), cpus); |
| 9054 | /* | 9136 | /* |
| 9055 | * Attempting to continue load balancing at the current | 9137 | * Attempting to continue load balancing at the current |
| 9056 | * sched_domain level only makes sense if there are | 9138 | * sched_domain level only makes sense if there are |
| @@ -9120,7 +9202,7 @@ more_balance: | |||
| 9120 | } else | 9202 | } else |
| 9121 | sd->nr_balance_failed = 0; | 9203 | sd->nr_balance_failed = 0; |
| 9122 | 9204 | ||
| 9123 | if (likely(!active_balance)) { | 9205 | if (likely(!active_balance) || voluntary_active_balance(&env)) { |
| 9124 | /* We were unbalanced, so reset the balancing interval */ | 9206 | /* We were unbalanced, so reset the balancing interval */ |
| 9125 | sd->balance_interval = sd->min_interval; | 9207 | sd->balance_interval = sd->min_interval; |
| 9126 | } else { | 9208 | } else { |
| @@ -9469,15 +9551,8 @@ static void kick_ilb(unsigned int flags) | |||
| 9469 | } | 9551 | } |
| 9470 | 9552 | ||
| 9471 | /* | 9553 | /* |
| 9472 | * Current heuristic for kicking the idle load balancer in the presence | 9554 | * Current decision point for kicking the idle load balancer in the presence |
| 9473 | * of an idle cpu in the system. | 9555 | * of idle CPUs in the system. |
| 9474 | * - This rq has more than one task. | ||
| 9475 | * - This rq has at least one CFS task and the capacity of the CPU is | ||
| 9476 | * significantly reduced because of RT tasks or IRQs. | ||
| 9477 | * - At parent of LLC scheduler domain level, this cpu's scheduler group has | ||
| 9478 | * multiple busy cpu. | ||
| 9479 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | ||
| 9480 | * domain span are idle. | ||
| 9481 | */ | 9556 | */ |
| 9482 | static void nohz_balancer_kick(struct rq *rq) | 9557 | static void nohz_balancer_kick(struct rq *rq) |
| 9483 | { | 9558 | { |
| @@ -9519,8 +9594,13 @@ static void nohz_balancer_kick(struct rq *rq) | |||
| 9519 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); | 9594 | sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); |
| 9520 | if (sds) { | 9595 | if (sds) { |
| 9521 | /* | 9596 | /* |
| 9522 | * XXX: write a coherent comment on why we do this. | 9597 | * If there is an imbalance between LLC domains (IOW we could |
| 9523 | * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com | 9598 | * increase the overall cache use), we need some less-loaded LLC |
| 9599 | * domain to pull some load. Likewise, we may need to spread | ||
| 9600 | * load within the current LLC domain (e.g. packed SMT cores but | ||
| 9601 | * other CPUs are idle). We can't really know from here how busy | ||
| 9602 | * the others are - so just get a nohz balance going if it looks | ||
| 9603 | * like this LLC domain has tasks we could move. | ||
| 9524 | */ | 9604 | */ |
| 9525 | nr_busy = atomic_read(&sds->nr_busy_cpus); | 9605 | nr_busy = atomic_read(&sds->nr_busy_cpus); |
| 9526 | if (nr_busy > 1) { | 9606 | if (nr_busy > 1) { |
| @@ -9533,7 +9613,7 @@ static void nohz_balancer_kick(struct rq *rq) | |||
| 9533 | sd = rcu_dereference(rq->sd); | 9613 | sd = rcu_dereference(rq->sd); |
| 9534 | if (sd) { | 9614 | if (sd) { |
| 9535 | if ((rq->cfs.h_nr_running >= 1) && | 9615 | if ((rq->cfs.h_nr_running >= 1) && |
| 9536 | check_cpu_capacity(rq, sd)) { | 9616 | check_cpu_capacity(rq, sd)) { |
| 9537 | flags = NOHZ_KICK_MASK; | 9617 | flags = NOHZ_KICK_MASK; |
| 9538 | goto unlock; | 9618 | goto unlock; |
| 9539 | } | 9619 | } |
| @@ -9541,11 +9621,7 @@ static void nohz_balancer_kick(struct rq *rq) | |||
| 9541 | 9621 | ||
| 9542 | sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); | 9622 | sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); |
| 9543 | if (sd) { | 9623 | if (sd) { |
| 9544 | for_each_cpu(i, sched_domain_span(sd)) { | 9624 | for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { |
| 9545 | if (i == cpu || | ||
| 9546 | !cpumask_test_cpu(i, nohz.idle_cpus_mask)) | ||
| 9547 | continue; | ||
| 9548 | |||
| 9549 | if (sched_asym_prefer(i, cpu)) { | 9625 | if (sched_asym_prefer(i, cpu)) { |
| 9550 | flags = NOHZ_KICK_MASK; | 9626 | flags = NOHZ_KICK_MASK; |
| 9551 | goto unlock; | 9627 | goto unlock; |
| @@ -10546,10 +10622,10 @@ const struct sched_class fair_sched_class = { | |||
| 10546 | #ifdef CONFIG_SCHED_DEBUG | 10622 | #ifdef CONFIG_SCHED_DEBUG |
| 10547 | void print_cfs_stats(struct seq_file *m, int cpu) | 10623 | void print_cfs_stats(struct seq_file *m, int cpu) |
| 10548 | { | 10624 | { |
| 10549 | struct cfs_rq *cfs_rq; | 10625 | struct cfs_rq *cfs_rq, *pos; |
| 10550 | 10626 | ||
| 10551 | rcu_read_lock(); | 10627 | rcu_read_lock(); |
| 10552 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 10628 | for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) |
| 10553 | print_cfs_rq(m, cpu, cfs_rq); | 10629 | print_cfs_rq(m, cpu, cfs_rq); |
| 10554 | rcu_read_unlock(); | 10630 | rcu_read_unlock(); |
| 10555 | } | 10631 | } |
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81faddba9e20..b02d148e7672 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
| @@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) | |||
| 80 | cpumask_andnot(housekeeping_mask, | 80 | cpumask_andnot(housekeeping_mask, |
| 81 | cpu_possible_mask, non_housekeeping_mask); | 81 | cpu_possible_mask, non_housekeeping_mask); |
| 82 | if (cpumask_empty(housekeeping_mask)) | 82 | if (cpumask_empty(housekeeping_mask)) |
| 83 | cpumask_set_cpu(smp_processor_id(), housekeeping_mask); | 83 | __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); |
| 84 | } else { | 84 | } else { |
| 85 | cpumask_var_t tmp; | 85 | cpumask_var_t tmp; |
| 86 | 86 | ||
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 90fb5bc12ad4..befce29bd882 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c | |||
| @@ -26,7 +26,6 @@ | |||
| 26 | 26 | ||
| 27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
| 28 | #include "sched.h" | 28 | #include "sched.h" |
| 29 | #include "sched-pelt.h" | ||
| 30 | #include "pelt.h" | 29 | #include "pelt.h" |
| 31 | 30 | ||
| 32 | /* | 31 | /* |
| @@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) | |||
| 106 | * n=1 | 105 | * n=1 |
| 107 | */ | 106 | */ |
| 108 | static __always_inline u32 | 107 | static __always_inline u32 |
| 109 | accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | 108 | accumulate_sum(u64 delta, struct sched_avg *sa, |
| 110 | unsigned long load, unsigned long runnable, int running) | 109 | unsigned long load, unsigned long runnable, int running) |
| 111 | { | 110 | { |
| 112 | unsigned long scale_freq, scale_cpu; | ||
| 113 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ | 111 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ |
| 114 | u64 periods; | 112 | u64 periods; |
| 115 | 113 | ||
| 116 | scale_freq = arch_scale_freq_capacity(cpu); | ||
| 117 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
| 118 | |||
| 119 | delta += sa->period_contrib; | 114 | delta += sa->period_contrib; |
| 120 | periods = delta / 1024; /* A period is 1024us (~1ms) */ | 115 | periods = delta / 1024; /* A period is 1024us (~1ms) */ |
| 121 | 116 | ||
| @@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | |||
| 137 | } | 132 | } |
| 138 | sa->period_contrib = delta; | 133 | sa->period_contrib = delta; |
| 139 | 134 | ||
| 140 | contrib = cap_scale(contrib, scale_freq); | ||
| 141 | if (load) | 135 | if (load) |
| 142 | sa->load_sum += load * contrib; | 136 | sa->load_sum += load * contrib; |
| 143 | if (runnable) | 137 | if (runnable) |
| 144 | sa->runnable_load_sum += runnable * contrib; | 138 | sa->runnable_load_sum += runnable * contrib; |
| 145 | if (running) | 139 | if (running) |
| 146 | sa->util_sum += contrib * scale_cpu; | 140 | sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; |
| 147 | 141 | ||
| 148 | return periods; | 142 | return periods; |
| 149 | } | 143 | } |
| @@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | |||
| 177 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 171 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
| 178 | */ | 172 | */ |
| 179 | static __always_inline int | 173 | static __always_inline int |
| 180 | ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, | 174 | ___update_load_sum(u64 now, struct sched_avg *sa, |
| 181 | unsigned long load, unsigned long runnable, int running) | 175 | unsigned long load, unsigned long runnable, int running) |
| 182 | { | 176 | { |
| 183 | u64 delta; | 177 | u64 delta; |
| @@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, | |||
| 221 | * Step 1: accumulate *_sum since last_update_time. If we haven't | 215 | * Step 1: accumulate *_sum since last_update_time. If we haven't |
| 222 | * crossed period boundaries, finish. | 216 | * crossed period boundaries, finish. |
| 223 | */ | 217 | */ |
| 224 | if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) | 218 | if (!accumulate_sum(delta, sa, load, runnable, running)) |
| 225 | return 0; | 219 | return 0; |
| 226 | 220 | ||
| 227 | return 1; | 221 | return 1; |
| @@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna | |||
| 267 | * runnable_load_avg = \Sum se->avg.runnable_load_avg | 261 | * runnable_load_avg = \Sum se->avg.runnable_load_avg |
| 268 | */ | 262 | */ |
| 269 | 263 | ||
| 270 | int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) | 264 | int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) |
| 271 | { | 265 | { |
| 272 | if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { | 266 | if (___update_load_sum(now, &se->avg, 0, 0, 0)) { |
| 273 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 267 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
| 274 | return 1; | 268 | return 1; |
| 275 | } | 269 | } |
| @@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) | |||
| 277 | return 0; | 271 | return 0; |
| 278 | } | 272 | } |
| 279 | 273 | ||
| 280 | int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) | 274 | int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 281 | { | 275 | { |
| 282 | if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, | 276 | if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, |
| 283 | cfs_rq->curr == se)) { | 277 | cfs_rq->curr == se)) { |
| 284 | 278 | ||
| 285 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 279 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
| @@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e | |||
| 290 | return 0; | 284 | return 0; |
| 291 | } | 285 | } |
| 292 | 286 | ||
| 293 | int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | 287 | int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) |
| 294 | { | 288 | { |
| 295 | if (___update_load_sum(now, cpu, &cfs_rq->avg, | 289 | if (___update_load_sum(now, &cfs_rq->avg, |
| 296 | scale_load_down(cfs_rq->load.weight), | 290 | scale_load_down(cfs_rq->load.weight), |
| 297 | scale_load_down(cfs_rq->runnable_weight), | 291 | scale_load_down(cfs_rq->runnable_weight), |
| 298 | cfs_rq->curr != NULL)) { | 292 | cfs_rq->curr != NULL)) { |
| @@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | |||
| 317 | 311 | ||
| 318 | int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) | 312 | int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) |
| 319 | { | 313 | { |
| 320 | if (___update_load_sum(now, rq->cpu, &rq->avg_rt, | 314 | if (___update_load_sum(now, &rq->avg_rt, |
| 321 | running, | 315 | running, |
| 322 | running, | 316 | running, |
| 323 | running)) { | 317 | running)) { |
| @@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) | |||
| 340 | 334 | ||
| 341 | int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) | 335 | int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) |
| 342 | { | 336 | { |
| 343 | if (___update_load_sum(now, rq->cpu, &rq->avg_dl, | 337 | if (___update_load_sum(now, &rq->avg_dl, |
| 344 | running, | 338 | running, |
| 345 | running, | 339 | running, |
| 346 | running)) { | 340 | running)) { |
| @@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) | |||
| 365 | int update_irq_load_avg(struct rq *rq, u64 running) | 359 | int update_irq_load_avg(struct rq *rq, u64 running) |
| 366 | { | 360 | { |
| 367 | int ret = 0; | 361 | int ret = 0; |
| 362 | |||
| 363 | /* | ||
| 364 | * We can't use clock_pelt because irq time is not accounted in | ||
| 365 | * clock_task. Instead we directly scale the running time to | ||
| 366 | * reflect the real amount of computation | ||
| 367 | */ | ||
| 368 | running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); | ||
| 369 | running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); | ||
| 370 | |||
| 368 | /* | 371 | /* |
| 369 | * We know the time that has been used by interrupt since last update | 372 | * We know the time that has been used by interrupt since last update |
| 370 | * but we don't know when. Let's be pessimistic and assume that interrupt has | 373 | * but we don't know when. Let's be pessimistic and assume that interrupt has |
| 371 | * happened just before the update. This is not so far from reality | 374 | * happened just before the update. This is not so far from reality |
| 372 | * because interrupt will most probably wake up a task and trigger an update | 375 | * because interrupt will most probably wake up a task and trigger an update |
| 373 | * of rq clock during which the metric si updated. | 376 | * of rq clock during which the metric is updated. |
| 374 | * We start to decay with normal context time and then we add the | 377 | * We start to decay with normal context time and then we add the |
| 375 | * interrupt context time. | 378 | * interrupt context time. |
| 376 | * We can safely remove running from rq->clock because | 379 | * We can safely remove running from rq->clock because |
| 377 | * rq->clock += delta with delta >= running | 380 | * rq->clock += delta with delta >= running |
| 378 | */ | 381 | */ |
| 379 | ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, | 382 | ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, |
| 380 | 0, | 383 | 0, |
| 381 | 0, | 384 | 0, |
| 382 | 0); | 385 | 0); |
| 383 | ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, | 386 | ret += ___update_load_sum(rq->clock, &rq->avg_irq, |
| 384 | 1, | 387 | 1, |
| 385 | 1, | 388 | 1, |
| 386 | 1); | 389 | 1); |
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7e56b489ff32..7489d5f56960 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h | |||
| @@ -1,8 +1,9 @@ | |||
| 1 | #ifdef CONFIG_SMP | 1 | #ifdef CONFIG_SMP |
| 2 | #include "sched-pelt.h" | ||
| 2 | 3 | ||
| 3 | int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); | 4 | int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); |
| 4 | int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); | 5 | int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); |
| 5 | int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); | 6 | int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); |
| 6 | int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); | 7 | int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); |
| 7 | int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); | 8 | int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); |
| 8 | 9 | ||
| @@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg) | |||
| 42 | WRITE_ONCE(avg->util_est.enqueued, enqueued); | 43 | WRITE_ONCE(avg->util_est.enqueued, enqueued); |
| 43 | } | 44 | } |
| 44 | 45 | ||
| 46 | /* | ||
| 47 | * The clock_pelt scales the time to reflect the effective amount of | ||
| 48 | * computation done during the running delta time but then sync back to | ||
| 49 | * clock_task when rq is idle. | ||
| 50 | * | ||
| 51 | * | ||
| 52 | * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 | ||
| 53 | * @ max capacity ------******---------------******--------------- | ||
| 54 | * @ half capacity ------************---------************--------- | ||
| 55 | * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16 | ||
| 56 | * | ||
| 57 | */ | ||
| 58 | static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) | ||
| 59 | { | ||
| 60 | if (unlikely(is_idle_task(rq->curr))) { | ||
| 61 | /* The rq is idle, we can sync to clock_task */ | ||
| 62 | rq->clock_pelt = rq_clock_task(rq); | ||
| 63 | return; | ||
| 64 | } | ||
| 65 | |||
| 66 | /* | ||
| 67 | * When a rq runs at a lower compute capacity, it will need | ||
| 68 | * more time to do the same amount of work than at max | ||
| 69 | * capacity. In order to be invariant, we scale the delta to | ||
| 70 | * reflect how much work has been really done. | ||
| 71 | * Running longer results in stealing idle time that will | ||
| 72 | * disturb the load signal compared to max capacity. This | ||
| 73 | * stolen idle time will be automatically reflected when the | ||
| 74 | * rq will be idle and the clock will be synced with | ||
| 75 | * rq_clock_task. | ||
| 76 | */ | ||
| 77 | |||
| 78 | /* | ||
| 79 | * Scale the elapsed time to reflect the real amount of | ||
| 80 | * computation | ||
| 81 | */ | ||
| 82 | delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); | ||
| 83 | delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); | ||
| 84 | |||
| 85 | rq->clock_pelt += delta; | ||
| 86 | } | ||
| 87 | |||
| 88 | /* | ||
| 89 | * When rq becomes idle, we have to check if it has lost idle time | ||
| 90 | * because it was fully busy. A rq is fully used when the /Sum util_sum | ||
| 91 | * is greater or equal to: | ||
| 92 | * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT; | ||
| 93 | * For optimization and computing rounding purpose, we don't take into account | ||
| 94 | * the position in the current window (period_contrib) and we use the higher | ||
| 95 | * bound of util_sum to decide. | ||
| 96 | */ | ||
| 97 | static inline void update_idle_rq_clock_pelt(struct rq *rq) | ||
| 98 | { | ||
| 99 | u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX; | ||
| 100 | u32 util_sum = rq->cfs.avg.util_sum; | ||
| 101 | util_sum += rq->avg_rt.util_sum; | ||
| 102 | util_sum += rq->avg_dl.util_sum; | ||
| 103 | |||
| 104 | /* | ||
| 105 | * Reflecting stolen time makes sense only if the idle | ||
| 106 | * phase would be present at max capacity. As soon as the | ||
| 107 | * utilization of a rq has reached the maximum value, it is | ||
| 108 | * considered as an always running rq without idle time to | ||
| 109 | * steal. This potential idle time is considered as lost in | ||
| 110 | * this case. We keep track of this lost idle time compare to | ||
| 111 | * this case. We keep track of this lost idle time compared to | ||
| 112 | */ | ||
| 113 | if (util_sum >= divider) | ||
| 114 | rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; | ||
| 115 | } | ||
| 116 | |||
| 117 | static inline u64 rq_clock_pelt(struct rq *rq) | ||
| 118 | { | ||
| 119 | lockdep_assert_held(&rq->lock); | ||
| 120 | assert_clock_updated(rq); | ||
| 121 | |||
| 122 | return rq->clock_pelt - rq->lost_idle_time; | ||
| 123 | } | ||
| 124 | |||
| 125 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 126 | /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ | ||
| 127 | static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) | ||
| 128 | { | ||
| 129 | if (unlikely(cfs_rq->throttle_count)) | ||
| 130 | return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; | ||
| 131 | |||
| 132 | return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; | ||
| 133 | } | ||
| 134 | #else | ||
| 135 | static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) | ||
| 136 | { | ||
| 137 | return rq_clock_pelt(rq_of(cfs_rq)); | ||
| 138 | } | ||
| 139 | #endif | ||
| 140 | |||
| 45 | #else | 141 | #else |
| 46 | 142 | ||
| 47 | static inline int | 143 | static inline int |
| @@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running) | |||
| 67 | { | 163 | { |
| 68 | return 0; | 164 | return 0; |
| 69 | } | 165 | } |
| 166 | |||
| 167 | static inline u64 rq_clock_pelt(struct rq *rq) | ||
| 168 | { | ||
| 169 | return rq_clock_task(rq); | ||
| 170 | } | ||
| 171 | |||
| 172 | static inline void | ||
| 173 | update_rq_clock_pelt(struct rq *rq, s64 delta) { } | ||
| 174 | |||
| 175 | static inline void | ||
| 176 | update_idle_rq_clock_pelt(struct rq *rq) { } | ||
| 177 | |||
| 70 | #endif | 178 | #endif |
| 71 | 179 | ||
| 72 | 180 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e4f398ad9e73..90fa23d36565 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
| 1587 | * rt task | 1587 | * rt task |
| 1588 | */ | 1588 | */ |
| 1589 | if (rq->curr->sched_class != &rt_sched_class) | 1589 | if (rq->curr->sched_class != &rt_sched_class) |
| 1590 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); | 1590 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); |
| 1591 | 1591 | ||
| 1592 | return p; | 1592 | return p; |
| 1593 | } | 1593 | } |
| @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1596 | { | 1596 | { |
| 1597 | update_curr_rt(rq); | 1597 | update_curr_rt(rq); |
| 1598 | 1598 | ||
| 1599 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); | 1599 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); |
| 1600 | 1600 | ||
| 1601 | /* | 1601 | /* |
| 1602 | * The previous task needs to be made eligible for pushing | 1602 | * The previous task needs to be made eligible for pushing |
| @@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 2325 | struct sched_rt_entity *rt_se = &p->rt; | 2325 | struct sched_rt_entity *rt_se = &p->rt; |
| 2326 | 2326 | ||
| 2327 | update_curr_rt(rq); | 2327 | update_curr_rt(rq); |
| 2328 | update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); | 2328 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); |
| 2329 | 2329 | ||
| 2330 | watchdog(rq, p); | 2330 | watchdog(rq, p); |
| 2331 | 2331 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6665b9c02e2f..efa686eeff26 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -861,7 +861,10 @@ struct rq { | |||
| 861 | 861 | ||
| 862 | unsigned int clock_update_flags; | 862 | unsigned int clock_update_flags; |
| 863 | u64 clock; | 863 | u64 clock; |
| 864 | u64 clock_task; | 864 | /* Ensure that all clocks are in the same cache line */ |
| 865 | u64 clock_task ____cacheline_aligned; | ||
| 866 | u64 clock_pelt; | ||
| 867 | unsigned long lost_idle_time; | ||
| 865 | 868 | ||
| 866 | atomic_t nr_iowait; | 869 | atomic_t nr_iowait; |
| 867 | 870 | ||
| @@ -951,6 +954,22 @@ struct rq { | |||
| 951 | #endif | 954 | #endif |
| 952 | }; | 955 | }; |
| 953 | 956 | ||
| 957 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 958 | |||
| 959 | /* CPU runqueue to which this cfs_rq is attached */ | ||
| 960 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | ||
| 961 | { | ||
| 962 | return cfs_rq->rq; | ||
| 963 | } | ||
| 964 | |||
| 965 | #else | ||
| 966 | |||
| 967 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | ||
| 968 | { | ||
| 969 | return container_of(cfs_rq, struct rq, cfs); | ||
| 970 | } | ||
| 971 | #endif | ||
| 972 | |||
| 954 | static inline int cpu_of(struct rq *rq) | 973 | static inline int cpu_of(struct rq *rq) |
| 955 | { | 974 | { |
| 956 | #ifdef CONFIG_SMP | 975 | #ifdef CONFIG_SMP |
| @@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 1460 | */ | 1479 | */ |
| 1461 | smp_wmb(); | 1480 | smp_wmb(); |
| 1462 | #ifdef CONFIG_THREAD_INFO_IN_TASK | 1481 | #ifdef CONFIG_THREAD_INFO_IN_TASK |
| 1463 | p->cpu = cpu; | 1482 | WRITE_ONCE(p->cpu, cpu); |
| 1464 | #else | 1483 | #else |
| 1465 | task_thread_info(p)->cpu = cpu; | 1484 | WRITE_ONCE(task_thread_info(p)->cpu, cpu); |
| 1466 | #endif | 1485 | #endif |
| 1467 | p->wake_cpu = cpu; | 1486 | p->wake_cpu = cpu; |
| 1468 | #endif | 1487 | #endif |
| @@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p) | |||
| 1563 | 1582 | ||
| 1564 | static inline int task_on_rq_migrating(struct task_struct *p) | 1583 | static inline int task_on_rq_migrating(struct task_struct *p) |
| 1565 | { | 1584 | { |
| 1566 | return p->on_rq == TASK_ON_RQ_MIGRATING; | 1585 | return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; |
| 1567 | } | 1586 | } |
| 1568 | 1587 | ||
| 1569 | /* | 1588 | /* |
| @@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); | |||
| 1781 | unsigned long to_ratio(u64 period, u64 runtime); | 1800 | unsigned long to_ratio(u64 period, u64 runtime); |
| 1782 | 1801 | ||
| 1783 | extern void init_entity_runnable_average(struct sched_entity *se); | 1802 | extern void init_entity_runnable_average(struct sched_entity *se); |
| 1784 | extern void post_init_entity_util_avg(struct sched_entity *se); | 1803 | extern void post_init_entity_util_avg(struct task_struct *p); |
| 1785 | 1804 | ||
| 1786 | #ifdef CONFIG_NO_HZ_FULL | 1805 | #ifdef CONFIG_NO_HZ_FULL |
| 1787 | extern bool sched_can_stop_tick(struct rq *rq); | 1806 | extern bool sched_can_stop_tick(struct rq *rq); |
| @@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | |||
| 2211 | # define arch_scale_freq_invariant() false | 2230 | # define arch_scale_freq_invariant() false |
| 2212 | #endif | 2231 | #endif |
| 2213 | 2232 | ||
| 2233 | #ifdef CONFIG_SMP | ||
| 2234 | static inline unsigned long capacity_orig_of(int cpu) | ||
| 2235 | { | ||
| 2236 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
| 2237 | } | ||
| 2238 | #endif | ||
| 2239 | |||
| 2214 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | 2240 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
| 2215 | /** | 2241 | /** |
| 2216 | * enum schedutil_type - CPU utilization type | 2242 | * enum schedutil_type - CPU utilization type |
| @@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned | |||
| 2299 | #endif | 2325 | #endif |
| 2300 | 2326 | ||
| 2301 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) | 2327 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
| 2328 | |||
| 2302 | #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) | 2329 | #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) |
| 2303 | #else | 2330 | |
| 2331 | DECLARE_STATIC_KEY_FALSE(sched_energy_present); | ||
| 2332 | |||
| 2333 | static inline bool sched_energy_enabled(void) | ||
| 2334 | { | ||
| 2335 | return static_branch_unlikely(&sched_energy_present); | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ | ||
| 2339 | |||
| 2304 | #define perf_domain_span(pd) NULL | 2340 | #define perf_domain_span(pd) NULL |
| 2305 | #endif | 2341 | static inline bool sched_energy_enabled(void) { return false; } |
| 2306 | 2342 | ||
| 2307 | #ifdef CONFIG_SMP | 2343 | #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
| 2308 | extern struct static_key_false sched_energy_present; | ||
| 2309 | #endif | ||
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7d905f55e7fa..ab7f371a3a17 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
| @@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 201 | return 1; | 201 | return 1; |
| 202 | } | 202 | } |
| 203 | 203 | ||
| 204 | DEFINE_STATIC_KEY_FALSE(sched_energy_present); | ||
| 205 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) | 204 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
| 205 | DEFINE_STATIC_KEY_FALSE(sched_energy_present); | ||
| 206 | unsigned int sysctl_sched_energy_aware = 1; | ||
| 206 | DEFINE_MUTEX(sched_energy_mutex); | 207 | DEFINE_MUTEX(sched_energy_mutex); |
| 207 | bool sched_energy_update; | 208 | bool sched_energy_update; |
| 208 | 209 | ||
| 210 | #ifdef CONFIG_PROC_SYSCTL | ||
| 211 | int sched_energy_aware_handler(struct ctl_table *table, int write, | ||
| 212 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 213 | { | ||
| 214 | int ret, state; | ||
| 215 | |||
| 216 | if (write && !capable(CAP_SYS_ADMIN)) | ||
| 217 | return -EPERM; | ||
| 218 | |||
| 219 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 220 | if (!ret && write) { | ||
| 221 | state = static_branch_unlikely(&sched_energy_present); | ||
| 222 | if (state != sysctl_sched_energy_aware) { | ||
| 223 | mutex_lock(&sched_energy_mutex); | ||
| 224 | sched_energy_update = 1; | ||
| 225 | rebuild_sched_domains(); | ||
| 226 | sched_energy_update = 0; | ||
| 227 | mutex_unlock(&sched_energy_mutex); | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
| 231 | return ret; | ||
| 232 | } | ||
| 233 | #endif | ||
| 234 | |||
| 209 | static void free_pd(struct perf_domain *pd) | 235 | static void free_pd(struct perf_domain *pd) |
| 210 | { | 236 | { |
| 211 | struct perf_domain *tmp; | 237 | struct perf_domain *tmp; |
| @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) | |||
| 322 | struct cpufreq_policy *policy; | 348 | struct cpufreq_policy *policy; |
| 323 | struct cpufreq_governor *gov; | 349 | struct cpufreq_governor *gov; |
| 324 | 350 | ||
| 351 | if (!sysctl_sched_energy_aware) | ||
| 352 | goto free; | ||
| 353 | |||
| 325 | /* EAS is enabled for asymmetric CPU capacity topologies. */ | 354 | /* EAS is enabled for asymmetric CPU capacity topologies. */ |
| 326 | if (!per_cpu(sd_asym_cpucapacity, cpu)) { | 355 | if (!per_cpu(sd_asym_cpucapacity, cpu)) { |
| 327 | if (sched_debug()) { | 356 | if (sched_debug()) { |
| @@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 676 | } | 705 | } |
| 677 | 706 | ||
| 678 | struct s_data { | 707 | struct s_data { |
| 679 | struct sched_domain ** __percpu sd; | 708 | struct sched_domain * __percpu *sd; |
| 680 | struct root_domain *rd; | 709 | struct root_domain *rd; |
| 681 | }; | 710 | }; |
| 682 | 711 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7578e21a711b..7c2b9bc88ee8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -472,6 +472,17 @@ static struct ctl_table kern_table[] = { | |||
| 472 | .extra1 = &one, | 472 | .extra1 = &one, |
| 473 | }, | 473 | }, |
| 474 | #endif | 474 | #endif |
| 475 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) | ||
| 476 | { | ||
| 477 | .procname = "sched_energy_aware", | ||
| 478 | .data = &sysctl_sched_energy_aware, | ||
| 479 | .maxlen = sizeof(unsigned int), | ||
| 480 | .mode = 0644, | ||
| 481 | .proc_handler = sched_energy_aware_handler, | ||
| 482 | .extra1 = &zero, | ||
| 483 | .extra2 = &one, | ||
| 484 | }, | ||
| 485 | #endif | ||
| 475 | #ifdef CONFIG_PROVE_LOCKING | 486 | #ifdef CONFIG_PROVE_LOCKING |
| 476 | { | 487 | { |
| 477 | .procname = "prove_locking", | 488 | .procname = "prove_locking", |
