author     Linus Torvalds <torvalds@linux-foundation.org>  2017-11-13 16:37:52 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-11-13 16:37:52 -0500
commit     3e2014637c50e5d6a77cd63d5db6c209fe29d1b1 (patch)
tree       a672ed603262aeddda4490056b27b09791d0cbbb
parent     f2be8bd52e7410c70145f73511a2e80f4797e1a5 (diff)
parent     765cc3a4b224e22bf524fabe40284a524f37cdd0 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main updates in this cycle were:

   - Group balancing enhancements and cleanups (Brendan Jackman)

   - Move CPU isolation related functionality into its separate
     kernel/sched/isolation.c file, with related 'housekeeping_*()'
     namespace and nomenclature et al. (Frederic Weisbecker)

   - Improve the interactive/cpu-intense fairness calculation (Josef Bacik)

   - Improve the PELT code and related cleanups (Peter Zijlstra)

   - Improve the logic of pick_next_task_fair() (Uladzislau Rezki)

   - Improve the RT IPI based balancing logic (Steven Rostedt)

   - Various micro-optimizations:

      - better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi)

      - better idle loop (Cheng Jian)

   - ... plus misc fixes, cleanups and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds
  sched/sysctl: Fix attributes of some extern declarations
  sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated
  sched/isolation: Add basic isolcpus flags
  sched/isolation: Move isolcpus= handling to the housekeeping code
  sched/isolation: Handle the nohz_full= parameter
  sched/isolation: Introduce housekeeping flags
  sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL
  sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()
  sched/isolation: Use its own static key
  sched/isolation: Make the housekeeping cpumask private
  sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()
  sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version
  sched/isolation: Move housekeeping related code to its own file
  sched/idle: Micro-optimize the idle loop
  sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
  x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter
  sched/rt: Simplify the IPI based RT balancing logic
  block/ioprio: Use a helper to check for RT prio
  sched/rt: Add a helper to test for a RT task
  ...
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 40
-rw-r--r--  drivers/base/cpu.c | 11
-rw-r--r--  drivers/net/ethernet/tile/tilegx.c | 6
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  include/linux/cpumask.h | 16
-rw-r--r--  include/linux/ioprio.h | 3
-rw-r--r--  include/linux/sched.h | 19
-rw-r--r--  include/linux/sched/isolation.h | 51
-rw-r--r--  include/linux/sched/rt.h | 11
-rw-r--r--  include/linux/sched/sysctl.h | 6
-rw-r--r--  include/linux/tick.h | 39
-rw-r--r--  include/trace/events/sched.h | 2
-rw-r--r--  init/Kconfig | 7
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cgroup/cpuset.c | 15
-rw-r--r--  kernel/rcu/tree_plugin.h | 3
-rw-r--r--  kernel/rcu/update.c | 3
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/core.c | 56
-rw-r--r--  kernel/sched/deadline.c | 21
-rw-r--r--  kernel/sched/debug.c | 18
-rw-r--r--  kernel/sched/fair.c | 1049
-rw-r--r--  kernel/sched/idle.c | 4
-rw-r--r--  kernel/sched/isolation.c | 155
-rw-r--r--  kernel/sched/rt.c | 316
-rw-r--r--  kernel/sched/sched.h | 73
-rw-r--r--  kernel/sched/topology.c | 49
-rw-r--r--  kernel/time/tick-sched.c | 33
-rw-r--r--  kernel/trace/trace_output.c | 12
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 8
-rw-r--r--  kernel/watchdog.c | 13
31 files changed, 1270 insertions(+), 774 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 116e798b61e6..38ed8787261b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1730,20 +1730,33 @@
1730 isapnp= [ISAPNP] 1730 isapnp= [ISAPNP]
1731 Format: <RDP>,<reset>,<pci_scan>,<verbosity> 1731 Format: <RDP>,<reset>,<pci_scan>,<verbosity>
1732 1732
1733 isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler. 1733 isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
1734 The argument is a cpu list, as described above. 1734 [Deprecated - use cpusets instead]
1735 Format: [flag-list,]<cpu-list>
1736
1737 Specify one or more CPUs to isolate from disturbances
1738 specified in the flag list (default: domain):
1739
1740 nohz
1741 Disable the tick when a single task runs.
1742 domain
1743 Isolate from the general SMP balancing and scheduling
1744 algorithms. Note that performing domain isolation this way
1745 is irreversible: it's not possible to bring back a CPU to
1746 the domains once isolated through isolcpus. It's strongly
1747 advised to use cpusets instead to disable scheduler load
1748 balancing through the "cpuset.sched_load_balance" file.
1749 It offers a much more flexible interface where CPUs can
1750 move in and out of an isolated set anytime.
1751
1752 You can move a process onto or off an "isolated" CPU via
1753 the CPU affinity syscalls or cpuset.
1754 <cpu number> begins at 0 and the maximum value is
1755 "number of CPUs in system - 1".
1756
1757 The format of <cpu-list> is described above.
1735 1758
1736 This option can be used to specify one or more CPUs
1737 to isolate from the general SMP balancing and scheduling
1738 algorithms. You can move a process onto or off an
1739 "isolated" CPU via the CPU affinity syscalls or cpuset.
1740 <cpu number> begins at 0 and the maximum value is
1741 "number of CPUs in system - 1".
1742 1759
1743 This option is the preferred way to isolate CPUs. The
1744 alternative -- manually setting the CPU mask of all
1745 tasks in the system -- can cause problems and
1746 suboptimal load balancer performance.
1747 1760
1748 iucv= [HW,NET] 1761 iucv= [HW,NET]
1749 1762
@@ -4209,6 +4222,9 @@
4209 Used to run time disable IRQ_TIME_ACCOUNTING on any 4222 Used to run time disable IRQ_TIME_ACCOUNTING on any
4210 platforms where RDTSC is slow and this accounting 4223 platforms where RDTSC is slow and this accounting
4211 can add overhead. 4224 can add overhead.
4225 [x86] unstable: mark the TSC clocksource as unstable, this
4226 marks the TSC unconditionally unstable at bootup and
4227 avoids any further wobbles once the TSC watchdog notices.
4212 4228
4213 turbografx.map[2|3]= [HW,JOY] 4229 turbografx.map[2|3]= [HW,JOY]
4214 TurboGraFX parallel port interface 4230 TurboGraFX parallel port interface
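
As an aside to the isolcpus= documentation above: with the new flag-list format, a command line such as

    isolcpus=nohz,domain,2-7

(the CPU range here is made up purely for illustration) requests both the "nohz" and the "domain" isolation modes described above for CPUs 2-7, while a bare "isolcpus=2-7" keeps the historical behaviour of domain-only isolation, "domain" being the default flag.
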
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b4d817..a73ab95558f5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -18,6 +18,7 @@
18#include <linux/cpufeature.h> 18#include <linux/cpufeature.h>
19#include <linux/tick.h> 19#include <linux/tick.h>
20#include <linux/pm_qos.h> 20#include <linux/pm_qos.h>
21#include <linux/sched/isolation.h>
21 22
22#include "base.h" 23#include "base.h"
23 24
@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
271 struct device_attribute *attr, char *buf) 272 struct device_attribute *attr, char *buf)
272{ 273{
273 int n = 0, len = PAGE_SIZE-2; 274 int n = 0, len = PAGE_SIZE-2;
275 cpumask_var_t isolated;
274 276
275 n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map)); 277 if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
278 return -ENOMEM;
279
280 cpumask_andnot(isolated, cpu_possible_mask,
281 housekeeping_cpumask(HK_FLAG_DOMAIN));
282 n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
283
284 free_cpumask_var(isolated);
276 285
277 return n; 286 return n;
278} 287}
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index c00102b8145a..b3e5816a4678 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -40,7 +40,7 @@
40#include <linux/tcp.h> 40#include <linux/tcp.h>
41#include <linux/net_tstamp.h> 41#include <linux/net_tstamp.h>
42#include <linux/ptp_clock_kernel.h> 42#include <linux/ptp_clock_kernel.h>
43#include <linux/tick.h> 43#include <linux/sched/isolation.h>
44 44
45#include <asm/checksum.h> 45#include <asm/checksum.h>
46#include <asm/homecache.h> 46#include <asm/homecache.h>
@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
2270 tile_net_dev_init(name, mac); 2270 tile_net_dev_init(name, mac);
2271 2271
2272 if (!network_cpus_init()) 2272 if (!network_cpus_init())
2273 cpumask_and(&network_cpus_map, housekeeping_cpumask(), 2273 cpumask_and(&network_cpus_map,
2274 cpu_online_mask); 2274 housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
2275 2275
2276 return 0; 2276 return 0;
2277} 2277}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d82549e80402..6f6fc1672ad1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,7 +138,7 @@ static const char * const task_state_array[] = {
138static inline const char *get_task_state(struct task_struct *tsk) 138static inline const char *get_task_state(struct task_struct *tsk)
139{ 139{
140 BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); 140 BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
141 return task_state_array[__get_task_state(tsk)]; 141 return task_state_array[task_state_index(tsk)];
142} 142}
143 143
144static inline int get_task_umask(struct task_struct *tsk) 144static inline int get_task_umask(struct task_struct *tsk)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 8d3125c493b2..75b565194437 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
131 return 0; 131 return 0;
132} 132}
133 133
134static inline unsigned int cpumask_last(const struct cpumask *srcp)
135{
136 return 0;
137}
138
134/* Valid inputs for n are -1 and 0. */ 139/* Valid inputs for n are -1 and 0. */
135static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) 140static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
136{ 141{
@@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
179 return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); 184 return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
180} 185}
181 186
187/**
188 * cpumask_last - get the last CPU in a cpumask
189 * @srcp: - the cpumask pointer
190 *
191 * Returns >= nr_cpumask_bits if no CPUs set.
192 */
193static inline unsigned int cpumask_last(const struct cpumask *srcp)
194{
195 return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
196}
197
182unsigned int cpumask_next(int n, const struct cpumask *srcp); 198unsigned int cpumask_next(int n, const struct cpumask *srcp);
183 199
184/** 200/**
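
A minimal, hypothetical caller of the new cpumask_last() helper follows; it relies only on the return convention documented in the hunk above (>= nr_cpumask_bits for an empty mask), and the function name plus the pr_info() reporting are invented for the sketch:

    #include <linux/cpumask.h>
    #include <linux/printk.h>

    /* Illustrative sketch only: report the highest-numbered possible CPU. */
    static void report_last_possible_cpu(void)
    {
            unsigned int last = cpumask_last(cpu_possible_mask);

            /* cpumask_last() returns >= nr_cpumask_bits if no CPUs are set. */
            if (last >= nr_cpumask_bits)
                    pr_info("no possible CPUs set\n");
            else
                    pr_info("last possible CPU: %u\n", last);
    }
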
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 2cdd74809899..627efac73e6d 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -3,6 +3,7 @@
3#define IOPRIO_H 3#define IOPRIO_H
4 4
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/sched/rt.h>
6#include <linux/iocontext.h> 7#include <linux/iocontext.h>
7 8
8/* 9/*
@@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
63{ 64{
64 if (task->policy == SCHED_IDLE) 65 if (task->policy == SCHED_IDLE)
65 return IOPRIO_CLASS_IDLE; 66 return IOPRIO_CLASS_IDLE;
66 else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) 67 else if (task_is_realtime(task))
67 return IOPRIO_CLASS_RT; 68 return IOPRIO_CLASS_RT;
68 else 69 else
69 return IOPRIO_CLASS_BE; 70 return IOPRIO_CLASS_BE;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..a5dc7c98b0a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -166,8 +166,6 @@ struct task_group;
166/* Task command name length: */ 166/* Task command name length: */
167#define TASK_COMM_LEN 16 167#define TASK_COMM_LEN 16
168 168
169extern cpumask_var_t cpu_isolated_map;
170
171extern void scheduler_tick(void); 169extern void scheduler_tick(void);
172 170
173#define MAX_SCHEDULE_TIMEOUT LONG_MAX 171#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@@ -332,9 +330,11 @@ struct load_weight {
332struct sched_avg { 330struct sched_avg {
333 u64 last_update_time; 331 u64 last_update_time;
334 u64 load_sum; 332 u64 load_sum;
333 u64 runnable_load_sum;
335 u32 util_sum; 334 u32 util_sum;
336 u32 period_contrib; 335 u32 period_contrib;
337 unsigned long load_avg; 336 unsigned long load_avg;
337 unsigned long runnable_load_avg;
338 unsigned long util_avg; 338 unsigned long util_avg;
339}; 339};
340 340
@@ -377,6 +377,7 @@ struct sched_statistics {
377struct sched_entity { 377struct sched_entity {
378 /* For load-balancing: */ 378 /* For load-balancing: */
379 struct load_weight load; 379 struct load_weight load;
380 unsigned long runnable_weight;
380 struct rb_node run_node; 381 struct rb_node run_node;
381 struct list_head group_node; 382 struct list_head group_node;
382 unsigned int on_rq; 383 unsigned int on_rq;
@@ -472,10 +473,10 @@ struct sched_dl_entity {
472 * conditions between the inactive timer handler and the wakeup 473 * conditions between the inactive timer handler and the wakeup
473 * code. 474 * code.
474 */ 475 */
475 int dl_throttled; 476 int dl_throttled : 1;
476 int dl_boosted; 477 int dl_boosted : 1;
477 int dl_yielded; 478 int dl_yielded : 1;
478 int dl_non_contending; 479 int dl_non_contending : 1;
479 480
480 /* 481 /*
481 * Bandwidth enforcement timer. Each -deadline task has its 482 * Bandwidth enforcement timer. Each -deadline task has its
@@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
1246#define TASK_REPORT_IDLE (TASK_REPORT + 1) 1247#define TASK_REPORT_IDLE (TASK_REPORT + 1)
1247#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) 1248#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
1248 1249
1249static inline unsigned int __get_task_state(struct task_struct *tsk) 1250static inline unsigned int task_state_index(struct task_struct *tsk)
1250{ 1251{
1251 unsigned int tsk_state = READ_ONCE(tsk->state); 1252 unsigned int tsk_state = READ_ONCE(tsk->state);
1252 unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; 1253 unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
@@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
1259 return fls(state); 1260 return fls(state);
1260} 1261}
1261 1262
1262static inline char __task_state_to_char(unsigned int state) 1263static inline char task_index_to_char(unsigned int state)
1263{ 1264{
1264 static const char state_char[] = "RSDTtXZPI"; 1265 static const char state_char[] = "RSDTtXZPI";
1265 1266
@@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state)
1270 1271
1271static inline char task_state_to_char(struct task_struct *tsk) 1272static inline char task_state_to_char(struct task_struct *tsk)
1272{ 1273{
1273 return __task_state_to_char(__get_task_state(tsk)); 1274 return task_index_to_char(task_state_index(tsk));
1274} 1275}
1275 1276
1276/** 1277/**
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
new file mode 100644
index 000000000000..d849431c8060
--- /dev/null
+++ b/include/linux/sched/isolation.h
@@ -0,0 +1,51 @@
1#ifndef _LINUX_SCHED_ISOLATION_H
2#define _LINUX_SCHED_ISOLATION_H
3
4#include <linux/cpumask.h>
5#include <linux/init.h>
6#include <linux/tick.h>
7
8enum hk_flags {
9 HK_FLAG_TIMER = 1,
10 HK_FLAG_RCU = (1 << 1),
11 HK_FLAG_MISC = (1 << 2),
12 HK_FLAG_SCHED = (1 << 3),
13 HK_FLAG_TICK = (1 << 4),
14 HK_FLAG_DOMAIN = (1 << 5),
15};
16
17#ifdef CONFIG_CPU_ISOLATION
18DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
19extern int housekeeping_any_cpu(enum hk_flags flags);
20extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
21extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
22extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
23extern void __init housekeeping_init(void);
24
25#else
26
27static inline int housekeeping_any_cpu(enum hk_flags flags)
28{
29 return smp_processor_id();
30}
31
32static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
33{
34 return cpu_possible_mask;
35}
36
37static inline void housekeeping_affine(struct task_struct *t,
38 enum hk_flags flags) { }
39static inline void housekeeping_init(void) { }
40#endif /* CONFIG_CPU_ISOLATION */
41
42static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
43{
44#ifdef CONFIG_CPU_ISOLATION
45 if (static_branch_unlikely(&housekeeping_overriden))
46 return housekeeping_test_cpu(cpu, flags);
47#endif
48 return true;
49}
50
51#endif /* _LINUX_SCHED_ISOLATION_H */
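
The consumption pattern intended for this new header (the same shape get_nohz_timer_target() adopts in the kernel/sched/core.c hunk later in this diff) is roughly the sketch below; the wrapper function itself is hypothetical and only illustrates the API:

    #include <linux/sched/isolation.h>

    /*
     * Illustrative sketch only: prefer the local CPU when it may run
     * housekeeping timers, otherwise fall back to any CPU that handles
     * timer housekeeping.
     */
    static int pick_timer_cpu(int cpu)
    {
            if (housekeeping_cpu(cpu, HK_FLAG_TIMER))
                    return cpu;

            return housekeeping_any_cpu(HK_FLAG_TIMER);
    }
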
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index db865ed25ef3..e5af028c08b4 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p)
18 return rt_prio(p->prio); 18 return rt_prio(p->prio);
19} 19}
20 20
21static inline bool task_is_realtime(struct task_struct *tsk)
22{
23 int policy = tsk->policy;
24
25 if (policy == SCHED_FIFO || policy == SCHED_RR)
26 return true;
27 if (policy == SCHED_DEADLINE)
28 return true;
29 return false;
30}
31
21#ifdef CONFIG_RT_MUTEXES 32#ifdef CONFIG_RT_MUTEXES
22/* 33/*
23 * Must hold either p->pi_lock or task_rq(p)->lock. 34 * Must hold either p->pi_lock or task_rq(p)->lock.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d6a18a3839cc..1c1a1512ec55 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
38extern unsigned int sysctl_numa_balancing_scan_size; 38extern unsigned int sysctl_numa_balancing_scan_size;
39 39
40#ifdef CONFIG_SCHED_DEBUG 40#ifdef CONFIG_SCHED_DEBUG
41extern unsigned int sysctl_sched_migration_cost; 41extern __read_mostly unsigned int sysctl_sched_migration_cost;
42extern unsigned int sysctl_sched_nr_migrate; 42extern __read_mostly unsigned int sysctl_sched_nr_migrate;
43extern unsigned int sysctl_sched_time_avg; 43extern __read_mostly unsigned int sysctl_sched_time_avg;
44 44
45int sched_proc_update_handler(struct ctl_table *table, int write, 45int sched_proc_update_handler(struct ctl_table *table, int write,
46 void __user *buffer, size_t *length, 46 void __user *buffer, size_t *length,
diff --git a/include/linux/tick.h b/include/linux/tick.h
index cf413b344ddb..f442d1a42025 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
138#ifdef CONFIG_NO_HZ_FULL 138#ifdef CONFIG_NO_HZ_FULL
139extern bool tick_nohz_full_running; 139extern bool tick_nohz_full_running;
140extern cpumask_var_t tick_nohz_full_mask; 140extern cpumask_var_t tick_nohz_full_mask;
141extern cpumask_var_t housekeeping_mask;
142 141
143static inline bool tick_nohz_full_enabled(void) 142static inline bool tick_nohz_full_enabled(void)
144{ 143{
@@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
162 cpumask_or(mask, mask, tick_nohz_full_mask); 161 cpumask_or(mask, mask, tick_nohz_full_mask);
163} 162}
164 163
165static inline int housekeeping_any_cpu(void)
166{
167 return cpumask_any_and(housekeeping_mask, cpu_online_mask);
168}
169
170extern void tick_nohz_dep_set(enum tick_dep_bits bit); 164extern void tick_nohz_dep_set(enum tick_dep_bits bit);
171extern void tick_nohz_dep_clear(enum tick_dep_bits bit); 165extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
172extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); 166extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
@@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
235 229
236extern void tick_nohz_full_kick_cpu(int cpu); 230extern void tick_nohz_full_kick_cpu(int cpu);
237extern void __tick_nohz_task_switch(void); 231extern void __tick_nohz_task_switch(void);
232extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
238#else 233#else
239static inline int housekeeping_any_cpu(void)
240{
241 return smp_processor_id();
242}
243static inline bool tick_nohz_full_enabled(void) { return false; } 234static inline bool tick_nohz_full_enabled(void) { return false; }
244static inline bool tick_nohz_full_cpu(int cpu) { return false; } 235static inline bool tick_nohz_full_cpu(int cpu) { return false; }
245static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } 236static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
@@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
259 250
260static inline void tick_nohz_full_kick_cpu(int cpu) { } 251static inline void tick_nohz_full_kick_cpu(int cpu) { }
261static inline void __tick_nohz_task_switch(void) { } 252static inline void __tick_nohz_task_switch(void) { }
253static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
262#endif 254#endif
263 255
264static inline const struct cpumask *housekeeping_cpumask(void)
265{
266#ifdef CONFIG_NO_HZ_FULL
267 if (tick_nohz_full_enabled())
268 return housekeeping_mask;
269#endif
270 return cpu_possible_mask;
271}
272
273static inline bool is_housekeeping_cpu(int cpu)
274{
275#ifdef CONFIG_NO_HZ_FULL
276 if (tick_nohz_full_enabled())
277 return cpumask_test_cpu(cpu, housekeeping_mask);
278#endif
279 return true;
280}
281
282static inline void housekeeping_affine(struct task_struct *t)
283{
284#ifdef CONFIG_NO_HZ_FULL
285 if (tick_nohz_full_enabled())
286 set_cpus_allowed_ptr(t, housekeeping_mask);
287
288#endif
289}
290
291static inline void tick_nohz_task_switch(void) 256static inline void tick_nohz_task_switch(void)
292{ 257{
293 if (tick_nohz_full_enabled()) 258 if (tick_nohz_full_enabled())
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index da10aa21bebc..306b31de5194 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
118 if (preempt) 118 if (preempt)
119 return TASK_STATE_MAX; 119 return TASK_STATE_MAX;
120 120
121 return __get_task_state(p); 121 return task_state_index(p);
122} 122}
123#endif /* CREATE_TRACE_POINTS */ 123#endif /* CREATE_TRACE_POINTS */
124 124
diff --git a/init/Kconfig b/init/Kconfig
index 3c1faaa2af4a..c1fd2863d4ba 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING
472 472
473endmenu # "CPU/Task time and stats accounting" 473endmenu # "CPU/Task time and stats accounting"
474 474
475config CPU_ISOLATION
476 bool "CPU isolation"
477 help
478 Make sure that CPUs running critical tasks are not disturbed by
479 any source of "noise" such as unbound workqueues, timers, kthreads...
480 Unbound jobs get offloaded to housekeeping CPUs.
481
475source "kernel/rcu/Kconfig" 482source "kernel/rcu/Kconfig"
476 483
477config BUILD_BIN2C 484config BUILD_BIN2C
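
Assuming nothing else in a given configuration already enables it, the new option has to be turned on explicitly for the kernel/sched/isolation.o object added further down to be built; an illustrative .config fragment:

    CONFIG_CPU_ISOLATION=y
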
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..4610c99ae306 100644
--- a/init/main.c
+++ b/init/main.c
@@ -46,6 +46,7 @@
46#include <linux/cgroup.h> 46#include <linux/cgroup.h>
47#include <linux/efi.h> 47#include <linux/efi.h>
48#include <linux/tick.h> 48#include <linux/tick.h>
49#include <linux/sched/isolation.h>
49#include <linux/interrupt.h> 50#include <linux/interrupt.h>
50#include <linux/taskstats_kern.h> 51#include <linux/taskstats_kern.h>
51#include <linux/delayacct.h> 52#include <linux/delayacct.h>
@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
606 early_irq_init(); 607 early_irq_init();
607 init_IRQ(); 608 init_IRQ();
608 tick_init(); 609 tick_init();
610 housekeeping_init();
609 rcu_init_nohz(); 611 rcu_init_nohz();
610 init_timers(); 612 init_timers();
611 hrtimers_init(); 613 hrtimers_init();
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
57#include <linux/backing-dev.h> 57#include <linux/backing-dev.h>
58#include <linux/sort.h> 58#include <linux/sort.h>
59#include <linux/oom.h> 59#include <linux/oom.h>
60 60#include <linux/sched/isolation.h>
61#include <linux/uaccess.h> 61#include <linux/uaccess.h>
62#include <linux/atomic.h> 62#include <linux/atomic.h>
63#include <linux/mutex.h> 63#include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
656 int csn; /* how many cpuset ptrs in csa so far */ 656 int csn; /* how many cpuset ptrs in csa so far */
657 int i, j, k; /* indices for partition finding loops */ 657 int i, j, k; /* indices for partition finding loops */
658 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ 658 cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
659 cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
660 struct sched_domain_attr *dattr; /* attributes for custom domains */ 659 struct sched_domain_attr *dattr; /* attributes for custom domains */
661 int ndoms = 0; /* number of sched domains in result */ 660 int ndoms = 0; /* number of sched domains in result */
662 int nslot; /* next empty doms[] struct cpumask slot */ 661 int nslot; /* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
666 dattr = NULL; 665 dattr = NULL;
667 csa = NULL; 666 csa = NULL;
668 667
669 if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
670 goto done;
671 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
672
673 /* Special case for the 99% of systems with one, full, sched domain */ 668 /* Special case for the 99% of systems with one, full, sched domain */
674 if (is_sched_load_balance(&top_cpuset)) { 669 if (is_sched_load_balance(&top_cpuset)) {
675 ndoms = 1; 670 ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
683 update_domain_attr_tree(dattr, &top_cpuset); 678 update_domain_attr_tree(dattr, &top_cpuset);
684 } 679 }
685 cpumask_and(doms[0], top_cpuset.effective_cpus, 680 cpumask_and(doms[0], top_cpuset.effective_cpus,
686 non_isolated_cpus); 681 housekeeping_cpumask(HK_FLAG_DOMAIN));
687 682
688 goto done; 683 goto done;
689 } 684 }
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
707 */ 702 */
708 if (!cpumask_empty(cp->cpus_allowed) && 703 if (!cpumask_empty(cp->cpus_allowed) &&
709 !(is_sched_load_balance(cp) && 704 !(is_sched_load_balance(cp) &&
710 cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) 705 cpumask_intersects(cp->cpus_allowed,
706 housekeeping_cpumask(HK_FLAG_DOMAIN))))
711 continue; 707 continue;
712 708
713 if (is_sched_load_balance(cp)) 709 if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
789 785
790 if (apn == b->pn) { 786 if (apn == b->pn) {
791 cpumask_or(dp, dp, b->effective_cpus); 787 cpumask_or(dp, dp, b->effective_cpus);
792 cpumask_and(dp, dp, non_isolated_cpus); 788 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
793 if (dattr) 789 if (dattr)
794 update_domain_attr_tree(dattr + nslot, b); 790 update_domain_attr_tree(dattr + nslot, b);
795 791
@@ -802,7 +798,6 @@ restart:
802 BUG_ON(nslot != ndoms); 798 BUG_ON(nslot != ndoms);
803 799
804done: 800done:
805 free_cpumask_var(non_isolated_cpus);
806 kfree(csa); 801 kfree(csa);
807 802
808 /* 803 /*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dd4d0d390e5b..910405dc6e5c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/sched/debug.h> 30#include <linux/sched/debug.h>
31#include <linux/smpboot.h> 31#include <linux/smpboot.h>
32#include <linux/sched/isolation.h>
32#include <uapi/linux/sched/types.h> 33#include <uapi/linux/sched/types.h>
33#include "../time/tick-internal.h" 34#include "../time/tick-internal.h"
34 35
@@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void)
2587 2588
2588 if (!tick_nohz_full_enabled()) 2589 if (!tick_nohz_full_enabled())
2589 return; 2590 return;
2590 housekeeping_affine(current); 2591 housekeeping_affine(current, HK_FLAG_RCU);
2591} 2592}
2592 2593
2593/* Record the current task on dyntick-idle entry. */ 2594/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 27694561f769..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/tick.h> 52#include <linux/tick.h>
53#include <linux/rcupdate_wait.h> 53#include <linux/rcupdate_wait.h>
54#include <linux/sched/isolation.h>
54 55
55#define CREATE_TRACE_POINTS 56#define CREATE_TRACE_POINTS
56 57
@@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
714 LIST_HEAD(rcu_tasks_holdouts); 715 LIST_HEAD(rcu_tasks_holdouts);
715 716
716 /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ 717 /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
717 housekeeping_affine(current); 718 housekeeping_affine(current, HK_FLAG_RCU);
718 719
719 /* 720 /*
720 * Each pass through the following loop makes one check for 721 * Each pass through the following loop makes one check for
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index a9ee16bbc693..e2f9d4feff40 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
27obj-$(CONFIG_CPU_FREQ) += cpufreq.o 27obj-$(CONFIG_CPU_FREQ) += cpufreq.o
28obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o 28obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
29obj-$(CONFIG_MEMBARRIER) += membarrier.o 29obj-$(CONFIG_MEMBARRIER) += membarrier.o
30obj-$(CONFIG_CPU_ISOLATION) += isolation.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9446b2e5eac5..5b82a0073532 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
26#include <linux/profile.h> 26#include <linux/profile.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/sched/isolation.h>
29 30
30#include <asm/switch_to.h> 31#include <asm/switch_to.h>
31#include <asm/tlb.h> 32#include <asm/tlb.h>
@@ -42,18 +43,21 @@
42 43
43DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 44DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
44 45
46#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
45/* 47/*
46 * Debugging: various feature bits 48 * Debugging: various feature bits
49 *
50 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
51 * sysctl_sched_features, defined in sched.h, to allow constants propagation
52 * at compile time and compiler optimization based on features default.
47 */ 53 */
48
49#define SCHED_FEAT(name, enabled) \ 54#define SCHED_FEAT(name, enabled) \
50 (1UL << __SCHED_FEAT_##name) * enabled | 55 (1UL << __SCHED_FEAT_##name) * enabled |
51
52const_debug unsigned int sysctl_sched_features = 56const_debug unsigned int sysctl_sched_features =
53#include "features.h" 57#include "features.h"
54 0; 58 0;
55
56#undef SCHED_FEAT 59#undef SCHED_FEAT
60#endif
57 61
58/* 62/*
59 * Number of tasks to iterate in a single balance run. 63 * Number of tasks to iterate in a single balance run.
@@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
83 */ 87 */
84int sysctl_sched_rt_runtime = 950000; 88int sysctl_sched_rt_runtime = 950000;
85 89
86/* CPUs with isolated domains */
87cpumask_var_t cpu_isolated_map;
88
89/* 90/*
90 * __task_rq_lock - lock the rq @p resides on. 91 * __task_rq_lock - lock the rq @p resides on.
91 */ 92 */
@@ -525,7 +526,7 @@ int get_nohz_timer_target(void)
525 int i, cpu = smp_processor_id(); 526 int i, cpu = smp_processor_id();
526 struct sched_domain *sd; 527 struct sched_domain *sd;
527 528
528 if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) 529 if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
529 return cpu; 530 return cpu;
530 531
531 rcu_read_lock(); 532 rcu_read_lock();
@@ -534,15 +535,15 @@ int get_nohz_timer_target(void)
534 if (cpu == i) 535 if (cpu == i)
535 continue; 536 continue;
536 537
537 if (!idle_cpu(i) && is_housekeeping_cpu(i)) { 538 if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
538 cpu = i; 539 cpu = i;
539 goto unlock; 540 goto unlock;
540 } 541 }
541 } 542 }
542 } 543 }
543 544
544 if (!is_housekeeping_cpu(cpu)) 545 if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
545 cpu = housekeeping_any_cpu(); 546 cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
546unlock: 547unlock:
547 rcu_read_unlock(); 548 rcu_read_unlock();
548 return cpu; 549 return cpu;
@@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
732} 733}
733#endif 734#endif
734 735
735static void set_load_weight(struct task_struct *p) 736static void set_load_weight(struct task_struct *p, bool update_load)
736{ 737{
737 int prio = p->static_prio - MAX_RT_PRIO; 738 int prio = p->static_prio - MAX_RT_PRIO;
738 struct load_weight *load = &p->se.load; 739 struct load_weight *load = &p->se.load;
@@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
746 return; 747 return;
747 } 748 }
748 749
749 load->weight = scale_load(sched_prio_to_weight[prio]); 750 /*
750 load->inv_weight = sched_prio_to_wmult[prio]; 751 * SCHED_OTHER tasks have to update their load when changing their
752 * weight
753 */
754 if (update_load && p->sched_class == &fair_sched_class) {
755 reweight_task(p, prio);
756 } else {
757 load->weight = scale_load(sched_prio_to_weight[prio]);
758 load->inv_weight = sched_prio_to_wmult[prio];
759 }
751} 760}
752 761
753static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 762static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2357 p->static_prio = NICE_TO_PRIO(0); 2366 p->static_prio = NICE_TO_PRIO(0);
2358 2367
2359 p->prio = p->normal_prio = __normal_prio(p); 2368 p->prio = p->normal_prio = __normal_prio(p);
2360 set_load_weight(p); 2369 set_load_weight(p, false);
2361 2370
2362 /* 2371 /*
2363 * We don't need the reset flag anymore after the fork. It has 2372 * We don't need the reset flag anymore after the fork. It has
@@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
3804 put_prev_task(rq, p); 3813 put_prev_task(rq, p);
3805 3814
3806 p->static_prio = NICE_TO_PRIO(nice); 3815 p->static_prio = NICE_TO_PRIO(nice);
3807 set_load_weight(p); 3816 set_load_weight(p, true);
3808 old_prio = p->prio; 3817 old_prio = p->prio;
3809 p->prio = effective_prio(p); 3818 p->prio = effective_prio(p);
3810 delta = p->prio - old_prio; 3819 delta = p->prio - old_prio;
@@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
3961 */ 3970 */
3962 p->rt_priority = attr->sched_priority; 3971 p->rt_priority = attr->sched_priority;
3963 p->normal_prio = normal_prio(p); 3972 p->normal_prio = normal_prio(p);
3964 set_load_weight(p); 3973 set_load_weight(p, true);
3965} 3974}
3966 3975
3967/* Actually do priority change: must hold pi & rq lock. */ 3976/* Actually do priority change: must hold pi & rq lock. */
@@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { }
5727 5736
5728void __init sched_init_smp(void) 5737void __init sched_init_smp(void)
5729{ 5738{
5730 cpumask_var_t non_isolated_cpus;
5731
5732 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
5733
5734 sched_init_numa(); 5739 sched_init_numa();
5735 5740
5736 /* 5741 /*
@@ -5740,16 +5745,12 @@ void __init sched_init_smp(void)
5740 */ 5745 */
5741 mutex_lock(&sched_domains_mutex); 5746 mutex_lock(&sched_domains_mutex);
5742 sched_init_domains(cpu_active_mask); 5747 sched_init_domains(cpu_active_mask);
5743 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
5744 if (cpumask_empty(non_isolated_cpus))
5745 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
5746 mutex_unlock(&sched_domains_mutex); 5748 mutex_unlock(&sched_domains_mutex);
5747 5749
5748 /* Move init over to a non-isolated CPU */ 5750 /* Move init over to a non-isolated CPU */
5749 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 5751 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
5750 BUG(); 5752 BUG();
5751 sched_init_granularity(); 5753 sched_init_granularity();
5752 free_cpumask_var(non_isolated_cpus);
5753 5754
5754 init_sched_rt_class(); 5755 init_sched_rt_class();
5755 init_sched_dl_class(); 5756 init_sched_dl_class();
@@ -5934,7 +5935,7 @@ void __init sched_init(void)
5934 atomic_set(&rq->nr_iowait, 0); 5935 atomic_set(&rq->nr_iowait, 0);
5935 } 5936 }
5936 5937
5937 set_load_weight(&init_task); 5938 set_load_weight(&init_task, false);
5938 5939
5939 /* 5940 /*
5940 * The boot idle thread does lazy MMU switching as well: 5941 * The boot idle thread does lazy MMU switching as well:
@@ -5953,9 +5954,6 @@ void __init sched_init(void)
5953 calc_load_update = jiffies + LOAD_FREQ; 5954 calc_load_update = jiffies + LOAD_FREQ;
5954 5955
5955#ifdef CONFIG_SMP 5956#ifdef CONFIG_SMP
5956 /* May be allocated at isolcpus cmdline parse time */
5957 if (cpu_isolated_map == NULL)
5958 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
5959 idle_thread_set_boot_cpu(); 5957 idle_thread_set_boot_cpu();
5960 set_cpu_rq_start_time(smp_processor_id()); 5958 set_cpu_rq_start_time(smp_processor_id());
5961#endif 5959#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4ae5c1ea90e2..f349f7e98dec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
243 if (p->state == TASK_DEAD) 243 if (p->state == TASK_DEAD)
244 sub_rq_bw(p->dl.dl_bw, &rq->dl); 244 sub_rq_bw(p->dl.dl_bw, &rq->dl);
245 raw_spin_lock(&dl_b->lock); 245 raw_spin_lock(&dl_b->lock);
246 __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 246 __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
247 __dl_clear_params(p); 247 __dl_clear_params(p);
248 raw_spin_unlock(&dl_b->lock); 248 raw_spin_unlock(&dl_b->lock);
249 } 249 }
@@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
1210 } 1210 }
1211 1211
1212 raw_spin_lock(&dl_b->lock); 1212 raw_spin_lock(&dl_b->lock);
1213 __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 1213 __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
1214 raw_spin_unlock(&dl_b->lock); 1214 raw_spin_unlock(&dl_b->lock);
1215 __dl_clear_params(p); 1215 __dl_clear_params(p);
1216 1216
@@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
1365 update_dl_entity(dl_se, pi_se); 1365 update_dl_entity(dl_se, pi_se);
1366 } else if (flags & ENQUEUE_REPLENISH) { 1366 } else if (flags & ENQUEUE_REPLENISH) {
1367 replenish_dl_entity(dl_se, pi_se); 1367 replenish_dl_entity(dl_se, pi_se);
1368 } else if ((flags & ENQUEUE_RESTORE) &&
1369 dl_time_before(dl_se->deadline,
1370 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
1371 setup_new_dl_entity(dl_se);
1368 } 1372 }
1369 1373
1370 __enqueue_dl_entity(dl_se); 1374 __enqueue_dl_entity(dl_se);
@@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
2167 * until we complete the update. 2171 * until we complete the update.
2168 */ 2172 */
2169 raw_spin_lock(&src_dl_b->lock); 2173 raw_spin_lock(&src_dl_b->lock);
2170 __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 2174 __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
2171 raw_spin_unlock(&src_dl_b->lock); 2175 raw_spin_unlock(&src_dl_b->lock);
2172 } 2176 }
2173 2177
@@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
2256 2260
2257 return; 2261 return;
2258 } 2262 }
2259 /*
2260 * If p is boosted we already updated its params in
2261 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
2262 * p's deadline being now already after rq_clock(rq).
2263 */
2264 if (dl_time_before(p->dl.deadline, rq_clock(rq)))
2265 setup_new_dl_entity(&p->dl);
2266 2263
2267 if (rq->curr != p) { 2264 if (rq->curr != p) {
2268#ifdef CONFIG_SMP 2265#ifdef CONFIG_SMP
@@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
2452 if (dl_policy(policy) && !task_has_dl_policy(p) && 2449 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2453 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2450 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2454 if (hrtimer_active(&p->dl.inactive_timer)) 2451 if (hrtimer_active(&p->dl.inactive_timer))
2455 __dl_clear(dl_b, p->dl.dl_bw, cpus); 2452 __dl_sub(dl_b, p->dl.dl_bw, cpus);
2456 __dl_add(dl_b, new_bw, cpus); 2453 __dl_add(dl_b, new_bw, cpus);
2457 err = 0; 2454 err = 0;
2458 } else if (dl_policy(policy) && task_has_dl_policy(p) && 2455 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
2464 * But this would require to set the task's "inactive 2461 * But this would require to set the task's "inactive
2465 * timer" when the task is not inactive. 2462 * timer" when the task is not inactive.
2466 */ 2463 */
2467 __dl_clear(dl_b, p->dl.dl_bw, cpus); 2464 __dl_sub(dl_b, p->dl.dl_bw, cpus);
2468 __dl_add(dl_b, new_bw, cpus); 2465 __dl_add(dl_b, new_bw, cpus);
2469 dl_change_utilization(p, new_bw); 2466 dl_change_utilization(p, new_bw);
2470 err = 0; 2467 err = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
441 P_SCHEDSTAT(se->statistics.wait_count); 441 P_SCHEDSTAT(se->statistics.wait_count);
442 } 442 }
443 P(se->load.weight); 443 P(se->load.weight);
444 P(se->runnable_weight);
444#ifdef CONFIG_SMP 445#ifdef CONFIG_SMP
445 P(se->avg.load_avg); 446 P(se->avg.load_avg);
446 P(se->avg.util_avg); 447 P(se->avg.util_avg);
448 P(se->avg.runnable_load_avg);
447#endif 449#endif
448 450
449#undef PN_SCHEDSTAT 451#undef PN_SCHEDSTAT
@@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
558 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 560 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
559 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 561 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
560#ifdef CONFIG_SMP 562#ifdef CONFIG_SMP
563 SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
561 SEQ_printf(m, " .%-30s: %lu\n", "load_avg", 564 SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
562 cfs_rq->avg.load_avg); 565 cfs_rq->avg.load_avg);
563 SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", 566 SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
564 cfs_rq->runnable_load_avg); 567 cfs_rq->avg.runnable_load_avg);
565 SEQ_printf(m, " .%-30s: %lu\n", "util_avg", 568 SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
566 cfs_rq->avg.util_avg); 569 cfs_rq->avg.util_avg);
567 SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", 570 SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
568 atomic_long_read(&cfs_rq->removed_load_avg)); 571 cfs_rq->removed.load_avg);
569 SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", 572 SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
570 atomic_long_read(&cfs_rq->removed_util_avg)); 573 cfs_rq->removed.util_avg);
574 SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
575 cfs_rq->removed.runnable_sum);
571#ifdef CONFIG_FAIR_GROUP_SCHED 576#ifdef CONFIG_FAIR_GROUP_SCHED
572 SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", 577 SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
573 cfs_rq->tg_load_avg_contrib); 578 cfs_rq->tg_load_avg_contrib);
@@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
1004 "nr_involuntary_switches", (long long)p->nivcsw); 1009 "nr_involuntary_switches", (long long)p->nivcsw);
1005 1010
1006 P(se.load.weight); 1011 P(se.load.weight);
1012 P(se.runnable_weight);
1007#ifdef CONFIG_SMP 1013#ifdef CONFIG_SMP
1008 P(se.avg.load_sum); 1014 P(se.avg.load_sum);
1015 P(se.avg.runnable_load_sum);
1009 P(se.avg.util_sum); 1016 P(se.avg.util_sum);
1010 P(se.avg.load_avg); 1017 P(se.avg.load_avg);
1018 P(se.avg.runnable_load_avg);
1011 P(se.avg.util_avg); 1019 P(se.avg.util_avg);
1012 P(se.avg.last_update_time); 1020 P(se.avg.last_update_time);
1013#endif 1021#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c09ddf8c832..0989676c50e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -33,6 +33,7 @@
33#include <linux/mempolicy.h> 33#include <linux/mempolicy.h>
34#include <linux/migrate.h> 34#include <linux/migrate.h>
35#include <linux/task_work.h> 35#include <linux/task_work.h>
36#include <linux/sched/isolation.h>
36 37
37#include <trace/events/sched.h> 38#include <trace/events/sched.h>
38 39
@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
717{ 718{
718 struct sched_avg *sa = &se->avg; 719 struct sched_avg *sa = &se->avg;
719 720
720 sa->last_update_time = 0; 721 memset(sa, 0, sizeof(*sa));
721 /* 722
722 * sched_avg's period_contrib should be strictly less then 1024, so
723 * we give it 1023 to make sure it is almost a period (1024us), and
724 * will definitely be update (after enqueue).
725 */
726 sa->period_contrib = 1023;
727 /* 723 /*
728 * Tasks are intialized with full load to be seen as heavy tasks until 724 * Tasks are intialized with full load to be seen as heavy tasks until
729 * they get a chance to stabilize to their real load level. 725 * they get a chance to stabilize to their real load level.
@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
731 * nothing has been attached to the task group yet. 727 * nothing has been attached to the task group yet.
732 */ 728 */
733 if (entity_is_task(se)) 729 if (entity_is_task(se))
734 sa->load_avg = scale_load_down(se->load.weight); 730 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
735 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 731
736 /* 732 se->runnable_weight = se->load.weight;
737 * At this point, util_avg won't be used in select_task_rq_fair anyway 733
738 */
739 sa->util_avg = 0;
740 sa->util_sum = 0;
741 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 734 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
742} 735}
743 736
@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
785 } else { 778 } else {
786 sa->util_avg = cap; 779 sa->util_avg = cap;
787 } 780 }
788 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
789 } 781 }
790 782
791 if (entity_is_task(se)) { 783 if (entity_is_task(se)) {
@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2026 delta = runtime - p->last_sum_exec_runtime; 2018 delta = runtime - p->last_sum_exec_runtime;
2027 *period = now - p->last_task_numa_placement; 2019 *period = now - p->last_task_numa_placement;
2028 } else { 2020 } else {
2029 delta = p->se.avg.load_sum / p->se.load.weight; 2021 delta = p->se.avg.load_sum;
2030 *period = LOAD_AVG_MAX; 2022 *period = LOAD_AVG_MAX;
2031 } 2023 }
2032 2024
@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2693 cfs_rq->nr_running--; 2685 cfs_rq->nr_running--;
2694} 2686}
2695 2687
2688/*
2689 * Signed add and clamp on underflow.
2690 *
2691 * Explicitly do a load-store to ensure the intermediate value never hits
2692 * memory. This allows lockless observations without ever seeing the negative
2693 * values.
2694 */
2695#define add_positive(_ptr, _val) do { \
2696 typeof(_ptr) ptr = (_ptr); \
2697 typeof(_val) val = (_val); \
2698 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2699 \
2700 res = var + val; \
2701 \
2702 if (val < 0 && res > var) \
2703 res = 0; \
2704 \
2705 WRITE_ONCE(*ptr, res); \
2706} while (0)
2707
2708/*
2709 * Unsigned subtract and clamp on underflow.
2710 *
2711 * Explicitly do a load-store to ensure the intermediate value never hits
2712 * memory. This allows lockless observations without ever seeing the negative
2713 * values.
2714 */
2715#define sub_positive(_ptr, _val) do { \
2716 typeof(_ptr) ptr = (_ptr); \
2717 typeof(*ptr) val = (_val); \
2718 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2719 res = var - val; \
2720 if (res > var) \
2721 res = 0; \
2722 WRITE_ONCE(*ptr, res); \
2723} while (0)
2724
2725#ifdef CONFIG_SMP
2726/*
2727 * XXX we want to get rid of these helpers and use the full load resolution.
2728 */
2729static inline long se_weight(struct sched_entity *se)
2730{
2731 return scale_load_down(se->load.weight);
2732}
2733
2734static inline long se_runnable(struct sched_entity *se)
2735{
2736 return scale_load_down(se->runnable_weight);
2737}
2738
2739static inline void
2740enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2741{
2742 cfs_rq->runnable_weight += se->runnable_weight;
2743
2744 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2745 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2746}
2747
2748static inline void
2749dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2750{
2751 cfs_rq->runnable_weight -= se->runnable_weight;
2752
2753 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2754 sub_positive(&cfs_rq->avg.runnable_load_sum,
2755 se_runnable(se) * se->avg.runnable_load_sum);
2756}
2757
2758static inline void
2759enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2760{
2761 cfs_rq->avg.load_avg += se->avg.load_avg;
2762 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2763}
2764
2765static inline void
2766dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2767{
2768 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2769 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2770}
2771#else
2772static inline void
2773enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2774static inline void
2775dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2776static inline void
2777enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2778static inline void
2779dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2780#endif
2781
2782static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2783 unsigned long weight, unsigned long runnable)
2784{
2785 if (se->on_rq) {
2786 /* commit outstanding execution time */
2787 if (cfs_rq->curr == se)
2788 update_curr(cfs_rq);
2789 account_entity_dequeue(cfs_rq, se);
2790 dequeue_runnable_load_avg(cfs_rq, se);
2791 }
2792 dequeue_load_avg(cfs_rq, se);
2793
2794 se->runnable_weight = runnable;
2795 update_load_set(&se->load, weight);
2796
2797#ifdef CONFIG_SMP
2798 do {
2799 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2800
2801 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2802 se->avg.runnable_load_avg =
2803 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2804 } while (0);
2805#endif
2806
2807 enqueue_load_avg(cfs_rq, se);
2808 if (se->on_rq) {
2809 account_entity_enqueue(cfs_rq, se);
2810 enqueue_runnable_load_avg(cfs_rq, se);
2811 }
2812}
2813
2814void reweight_task(struct task_struct *p, int prio)
2815{
2816 struct sched_entity *se = &p->se;
2817 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2818 struct load_weight *load = &se->load;
2819 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2820
2821 reweight_entity(cfs_rq, se, weight, weight);
2822 load->inv_weight = sched_prio_to_wmult[prio];
2823}
2824
2696#ifdef CONFIG_FAIR_GROUP_SCHED 2825#ifdef CONFIG_FAIR_GROUP_SCHED
2697# ifdef CONFIG_SMP 2826# ifdef CONFIG_SMP
2698static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 2827/*
2828 * All this does is approximate the hierarchical proportion which includes that
2829 * global sum we all love to hate.
2830 *
2831 * That is, the weight of a group entity, is the proportional share of the
2832 * group weight based on the group runqueue weights. That is:
2833 *
2834 * tg->weight * grq->load.weight
2835 * ge->load.weight = ----------------------------- (1)
2836 * \Sum grq->load.weight
2837 *
2838 * Now, because computing that sum is prohibitively expensive to compute (been
2839 * there, done that) we approximate it with this average stuff. The average
2840 * moves slower and therefore the approximation is cheaper and more stable.
2841 *
2842 * So instead of the above, we substitute:
2843 *
2844 * grq->load.weight -> grq->avg.load_avg (2)
2845 *
2846 * which yields the following:
2847 *
2848 * tg->weight * grq->avg.load_avg
2849 * ge->load.weight = ------------------------------ (3)
2850 * tg->load_avg
2851 *
2852 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
2853 *
2854 * That is shares_avg, and it is right (given the approximation (2)).
2855 *
2856 * The problem with it is that because the average is slow -- it was designed
2857 * to be exactly that of course -- this leads to transients in boundary
2858 * conditions. In specific, the case where the group was idle and we start the
2859 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
2860 * yielding bad latency etc..
2861 *
2862 * Now, in that special case (1) reduces to:
2863 *
2864 * tg->weight * grq->load.weight
2865 * ge->load.weight = ----------------------------- = tg->weight (4)
2866 * grp->load.weight
2867 *
2868 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2869 *
2870 * So what we do is modify our approximation (3) to approach (4) in the (near)
2871 * UP case, like:
2872 *
2873 * ge->load.weight =
2874 *
2875 * tg->weight * grq->load.weight
2876 * --------------------------------------------------- (5)
2877 * tg->load_avg - grq->avg.load_avg + grq->load.weight
2878 *
2879 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
2880 * we need to use grq->avg.load_avg as its lower bound, which then gives:
2881 *
2882 *
2883 * tg->weight * grq->load.weight
2884 * ge->load.weight = ----------------------------- (6)
2885 * tg_load_avg'
2886 *
2887 * Where:
2888 *
2889 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
2890 * max(grq->load.weight, grq->avg.load_avg)
2891 *
2892 * And that is shares_weight and is icky. In the (near) UP case it approaches
2893 * (4) while in the normal case it approaches (3). It consistently
2894 * overestimates the ge->load.weight and therefore:
2895 *
2896 * \Sum ge->load.weight >= tg->weight
2897 *
2898 * hence icky!
2899 */
2900static long calc_group_shares(struct cfs_rq *cfs_rq)
2699{ 2901{
2700 long tg_weight, load, shares; 2902 long tg_weight, tg_shares, load, shares;
2903 struct task_group *tg = cfs_rq->tg;
2701 2904
2702 /* 2905 tg_shares = READ_ONCE(tg->shares);
2703 * This really should be: cfs_rq->avg.load_avg, but instead we use 2906
2704 * cfs_rq->load.weight, which is its upper bound. This helps ramp up 2907 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
2705 * the shares for small weight interactive tasks.
2706 */
2707 load = scale_load_down(cfs_rq->load.weight);
2708 2908
2709 tg_weight = atomic_long_read(&tg->load_avg); 2909 tg_weight = atomic_long_read(&tg->load_avg);
2710 2910
@@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2712 tg_weight -= cfs_rq->tg_load_avg_contrib; 2912 tg_weight -= cfs_rq->tg_load_avg_contrib;
2713 tg_weight += load; 2913 tg_weight += load;
2714 2914
2715 shares = (tg->shares * load); 2915 shares = (tg_shares * load);
2716 if (tg_weight) 2916 if (tg_weight)
2717 shares /= tg_weight; 2917 shares /= tg_weight;
2718 2918
@@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2728 * case no task is runnable on a CPU MIN_SHARES=2 should be returned 2928 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
2729 * instead of 0. 2929 * instead of 0.
2730 */ 2930 */
2731 if (shares < MIN_SHARES) 2931 return clamp_t(long, shares, MIN_SHARES, tg_shares);
2732 shares = MIN_SHARES;
2733 if (shares > tg->shares)
2734 shares = tg->shares;
2735
2736 return shares;
2737}
2738# else /* CONFIG_SMP */
2739static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2740{
2741 return tg->shares;
2742} 2932}
2743# endif /* CONFIG_SMP */
2744 2933
2745static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 2934/*
2746 unsigned long weight) 2935 * This calculates the effective runnable weight for a group entity based on
2936 * the group entity weight calculated above.
2937 *
2938 * Because of the above approximation (2), our group entity weight is
2939 * a load_avg-based ratio (3). This means that it includes blocked load and
2940 * does not represent the runnable weight.
2941 *
2942 * Approximate the group entity's runnable weight per ratio from the group
2943 * runqueue:
2944 *
2945 * grq->avg.runnable_load_avg
2946 * ge->runnable_weight = ge->load.weight * -------------------------- (7)
2947 * grq->avg.load_avg
2948 *
2949 * However, analogous to above, since the avg numbers are slow, this leads to
2950 * transients in the from-idle case. Instead we use:
2951 *
2952 * ge->runnable_weight = ge->load.weight *
2953 *
2954 * max(grq->avg.runnable_load_avg, grq->runnable_weight)
2955 * ----------------------------------------------------- (8)
2956 * max(grq->avg.load_avg, grq->load.weight)
2957 *
2958 * Where these max() serve both to use the 'instant' values to fix the slow
2959 * from-idle and avoid the /0 on to-idle, similar to (6).
2960 */
2961static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2747{ 2962{
2748 if (se->on_rq) { 2963 long runnable, load_avg;
2749 /* commit outstanding execution time */
2750 if (cfs_rq->curr == se)
2751 update_curr(cfs_rq);
2752 account_entity_dequeue(cfs_rq, se);
2753 }
2754 2964
2755 update_load_set(&se->load, weight); 2965 load_avg = max(cfs_rq->avg.load_avg,
2966 scale_load_down(cfs_rq->load.weight));
2756 2967
2757 if (se->on_rq) 2968 runnable = max(cfs_rq->avg.runnable_load_avg,
2758 account_entity_enqueue(cfs_rq, se); 2969 scale_load_down(cfs_rq->runnable_weight));
2970
2971 runnable *= shares;
2972 if (load_avg)
2973 runnable /= load_avg;
2974
2975 return clamp_t(long, runnable, MIN_SHARES, shares);
2759} 2976}
2977# endif /* CONFIG_SMP */
2760 2978
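A matching standalone sketch of (8) and calc_group_runnable() above, again with invented numbers: the shares value computed in the previous sketch is scaled by the runnable fraction of the group runqueue, with the same "instant value" maxes guarding the from-idle and to-idle transients.

#include <stdio.h>

static long lmax(long a, long b) { return a > b ? a : b; }

int main(void)
{
	long shares = 682;		/* ge->load.weight from the previous sketch */
	long grq_runnable_weight = 512;	/* half of the queued weight is runnable    */
	long grq_runnable_load_avg = 100;
	long grq_load_weight = 1024;
	long grq_load_avg = 300;

	long runnable = shares * lmax(grq_runnable_load_avg, grq_runnable_weight)
			/ lmax(grq_load_avg, grq_load_weight);

	printf("ge->runnable_weight ~= %ld\n", runnable);	/* 682*512/1024 = 341 */
	return 0;
}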
2761static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2762 2980
2763static void update_cfs_shares(struct sched_entity *se) 2981/*
2982 * Recomputes the group entity based on the current state of its group
2983 * runqueue.
2984 */
2985static void update_cfs_group(struct sched_entity *se)
2764{ 2986{
2765 struct cfs_rq *cfs_rq = group_cfs_rq(se); 2987 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2766 struct task_group *tg; 2988 long shares, runnable;
2767 long shares;
2768 2989
2769 if (!cfs_rq) 2990 if (!gcfs_rq)
2770 return; 2991 return;
2771 2992
2772 if (throttled_hierarchy(cfs_rq)) 2993 if (throttled_hierarchy(gcfs_rq))
2773 return; 2994 return;
2774 2995
2775 tg = cfs_rq->tg;
2776
2777#ifndef CONFIG_SMP 2996#ifndef CONFIG_SMP
2778 if (likely(se->load.weight == tg->shares)) 2997 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
2998
2999 if (likely(se->load.weight == shares))
2779 return; 3000 return;
3001#else
3002 shares = calc_group_shares(gcfs_rq);
3003 runnable = calc_group_runnable(gcfs_rq, shares);
2780#endif 3004#endif
2781 shares = calc_cfs_shares(cfs_rq, tg);
2782 3005
2783 reweight_entity(cfs_rq_of(se), se, shares); 3006 reweight_entity(cfs_rq_of(se), se, shares, runnable);
2784} 3007}
2785 3008
2786#else /* CONFIG_FAIR_GROUP_SCHED */ 3009#else /* CONFIG_FAIR_GROUP_SCHED */
2787static inline void update_cfs_shares(struct sched_entity *se) 3010static inline void update_cfs_group(struct sched_entity *se)
2788{ 3011{
2789} 3012}
2790#endif /* CONFIG_FAIR_GROUP_SCHED */ 3013#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
2893 */ 3116 */
2894static __always_inline u32 3117static __always_inline u32
2895accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, 3118accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2896 unsigned long weight, int running, struct cfs_rq *cfs_rq) 3119 unsigned long load, unsigned long runnable, int running)
2897{ 3120{
2898 unsigned long scale_freq, scale_cpu; 3121 unsigned long scale_freq, scale_cpu;
2899 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ 3122 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2910 */ 3133 */
2911 if (periods) { 3134 if (periods) {
2912 sa->load_sum = decay_load(sa->load_sum, periods); 3135 sa->load_sum = decay_load(sa->load_sum, periods);
2913 if (cfs_rq) { 3136 sa->runnable_load_sum =
2914 cfs_rq->runnable_load_sum = 3137 decay_load(sa->runnable_load_sum, periods);
2915 decay_load(cfs_rq->runnable_load_sum, periods);
2916 }
2917 sa->util_sum = decay_load((u64)(sa->util_sum), periods); 3138 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
2918 3139
2919 /* 3140 /*
@@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2926 sa->period_contrib = delta; 3147 sa->period_contrib = delta;
2927 3148
2928 contrib = cap_scale(contrib, scale_freq); 3149 contrib = cap_scale(contrib, scale_freq);
2929 if (weight) { 3150 if (load)
2930 sa->load_sum += weight * contrib; 3151 sa->load_sum += load * contrib;
2931 if (cfs_rq) 3152 if (runnable)
2932 cfs_rq->runnable_load_sum += weight * contrib; 3153 sa->runnable_load_sum += runnable * contrib;
2933 }
2934 if (running) 3154 if (running)
2935 sa->util_sum += contrib * scale_cpu; 3155 sa->util_sum += contrib * scale_cpu;
2936 3156
@@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
2966 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 3186 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2967 */ 3187 */
2968static __always_inline int 3188static __always_inline int
2969___update_load_avg(u64 now, int cpu, struct sched_avg *sa, 3189___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
2970 unsigned long weight, int running, struct cfs_rq *cfs_rq) 3190 unsigned long load, unsigned long runnable, int running)
2971{ 3191{
2972 u64 delta; 3192 u64 delta;
2973 3193
@@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
3000 * this happens during idle_balance() which calls 3220 * this happens during idle_balance() which calls
3001 * update_blocked_averages() 3221 * update_blocked_averages()
3002 */ 3222 */
3003 if (!weight) 3223 if (!load)
3004 running = 0; 3224 runnable = running = 0;
3005 3225
3006 /* 3226 /*
3007 * Now we know we crossed measurement unit boundaries. The *_avg 3227 * Now we know we crossed measurement unit boundaries. The *_avg
@@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
3010 * Step 1: accumulate *_sum since last_update_time. If we haven't 3230 * Step 1: accumulate *_sum since last_update_time. If we haven't
3011 * crossed period boundaries, finish. 3231 * crossed period boundaries, finish.
3012 */ 3232 */
3013 if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) 3233 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
3014 return 0; 3234 return 0;
3015 3235
3236 return 1;
3237}
3238
3239static __always_inline void
3240___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
3241{
3242 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3243
3016 /* 3244 /*
3017 * Step 2: update *_avg. 3245 * Step 2: update *_avg.
3018 */ 3246 */
3019 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); 3247 sa->load_avg = div_u64(load * sa->load_sum, divider);
3020 if (cfs_rq) { 3248 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
3021 cfs_rq->runnable_load_avg = 3249 sa->util_avg = sa->util_sum / divider;
3022 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
3023 }
3024 sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
3025
3026 return 1;
3027} 3250}
3028 3251
3252/*
3253 * sched_entity:
3254 *
3255 * task:
3256 * se_runnable() == se_weight()
3257 *
3258 * group: [ see update_cfs_group() ]
3259 * se_weight() = tg->weight * grq->load_avg / tg->load_avg
3260 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
3261 *
3262 * load_sum := runnable_sum
3263 * load_avg = se_weight(se) * runnable_avg
3264 *
3265 * runnable_load_sum := runnable_sum
3266 * runnable_load_avg = se_runnable(se) * runnable_avg
3267 *
3268 * XXX collapse load_sum and runnable_load_sum
3269 *
3270 * cfs_rq:
3271 *
3272 * load_sum = \Sum se_weight(se) * se->avg.load_sum
3273 * load_avg = \Sum se->avg.load_avg
3274 *
3275 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
3276 * runnable_load_avg = \Sum se->avg.runnable_load_avg
3277 */
3278
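A standalone sketch of Step 2 in ___update_load_avg() above, using the kernel's LOAD_AVG_MAX of 47742 and otherwise invented numbers: with the rewrite, the entity weight is applied when the sums are turned into averages, matching the se_weight()/se_runnable() relations listed above.

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX 47742

int main(void)
{
	uint64_t load_sum = 23871;	/* entity was runnable about half the time */
	uint32_t period_contrib = 512;	/* partially elapsed current period        */
	unsigned long weight = 1024;	/* se_weight(se), NICE_0 scaled down       */

	uint32_t divider = LOAD_AVG_MAX - 1024 + period_contrib;
	uint64_t load_avg = weight * load_sum / divider;	/* ~517 */

	printf("load_avg ~= %llu\n", (unsigned long long)load_avg);
	return 0;
}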
3029static int 3279static int
3030__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) 3280__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3031{ 3281{
3032 return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); 3282 if (entity_is_task(se))
3283 se->runnable_weight = se->load.weight;
3284
3285 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
3286 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3287 return 1;
3288 }
3289
3290 return 0;
3033} 3291}
3034 3292
3035static int 3293static int
3036__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) 3294__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3037{ 3295{
3038 return ___update_load_avg(now, cpu, &se->avg, 3296 if (entity_is_task(se))
3039 se->on_rq * scale_load_down(se->load.weight), 3297 se->runnable_weight = se->load.weight;
3040 cfs_rq->curr == se, NULL); 3298
3299 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
3300 cfs_rq->curr == se)) {
3301
3302 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3303 return 1;
3304 }
3305
3306 return 0;
3041} 3307}
3042 3308
3043static int 3309static int
3044__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) 3310__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3045{ 3311{
3046 return ___update_load_avg(now, cpu, &cfs_rq->avg, 3312 if (___update_load_sum(now, cpu, &cfs_rq->avg,
3047 scale_load_down(cfs_rq->load.weight), 3313 scale_load_down(cfs_rq->load.weight),
3048 cfs_rq->curr != NULL, cfs_rq); 3314 scale_load_down(cfs_rq->runnable_weight),
3049} 3315 cfs_rq->curr != NULL)) {
3050 3316
3051/* 3317 ___update_load_avg(&cfs_rq->avg, 1, 1);
3052 * Signed add and clamp on underflow. 3318 return 1;
3053 * 3319 }
3054 * Explicitly do a load-store to ensure the intermediate value never hits 3320
3055 * memory. This allows lockless observations without ever seeing the negative 3321 return 0;
3056 * values. 3322}
3057 */
3058#define add_positive(_ptr, _val) do { \
3059 typeof(_ptr) ptr = (_ptr); \
3060 typeof(_val) val = (_val); \
3061 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3062 \
3063 res = var + val; \
3064 \
3065 if (val < 0 && res > var) \
3066 res = 0; \
3067 \
3068 WRITE_ONCE(*ptr, res); \
3069} while (0)
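A standalone illustration of the clamp semantics (not the kernel macro, which also uses READ_ONCE()/WRITE_ONCE()): adding a negative delta that would underflow leaves 0 behind rather than a wrapped-around value.

#include <stdio.h>

int main(void)
{
	unsigned long avg = 5;
	long val = -9;

	unsigned long res = avg + val;	/* wraps around on unsigned underflow      */
	if (val < 0 && res > avg)	/* detect the wrap, as add_positive() does */
		res = 0;

	printf("%lu\n", res);		/* prints 0, not a huge wrapped value      */
	return 0;
}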
3070 3323
3071#ifdef CONFIG_FAIR_GROUP_SCHED 3324#ifdef CONFIG_FAIR_GROUP_SCHED
3072/** 3325/**
@@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
3149 se->avg.last_update_time = n_last_update_time; 3402 se->avg.last_update_time = n_last_update_time;
3150} 3403}
3151 3404
3152/* Take into account change of utilization of a child task group */ 3405
3406/*
3407 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3408 * propagate its contribution. The key to this propagation is the invariant
3409 * that for each group:
3410 *
3411 * ge->avg == grq->avg (1)
3412 *
3413 * _IFF_ we look at the pure running and runnable sums. Because they
3414 * represent the very same entity, just at different points in the hierarchy.
3415 *
3416 *
3417 * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
3418 * simply copies the running sum over.
3419 *
3420 * However, update_tg_cfs_runnable() is more complex. So we have:
3421 *
3422 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
3423 *
3424 * And since, like util, the runnable part should be directly transferable,
3425 * the following would _appear_ to be the straightforward approach:
3426 *
3427 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
3428 *
3429 * And per (1) we have:
3430 *
3431 * ge->avg.runnable_avg == grq->avg.runnable_avg
3432 *
3433 * Which gives:
3434 *
3435 * ge->load.weight * grq->avg.load_avg
3436 * ge->avg.load_avg = ----------------------------------- (4)
3437 * grq->load.weight
3438 *
3439 * Except that is wrong!
3440 *
3441 * Because while for entities historical weight is not important and we
3442 * really only care about our future and therefore can consider a pure
3443 * runnable sum, runqueues can NOT do this.
3444 *
3445 * We specifically want runqueues to have a load_avg that includes
3446 * historical weights. Those represent the blocked load, the load we expect
3447 * to (shortly) return to us. This only works by keeping the weights as
3448 * integral part of the sum. We therefore cannot decompose as per (3).
3449 *
3450 * OK, so what then?
3451 *
3452 *
3453 * Another way to look at things is:
3454 *
3455 * grq->avg.load_avg = \Sum se->avg.load_avg
3456 *
3457 * Therefore, per (2):
3458 *
3459 * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
3460 *
3461 * And the very thing we're propagating is a change in that sum (someone
3462 * joined/left). So we can easily know the runnable change, which would be, per
3463 * (2) the already tracked se->load_avg divided by the corresponding
3464 * se->weight.
3465 *
3466 * Basically (4) but in differential form:
3467 *
3468 * d(runnable_avg) += se->avg.load_avg / se->load.weight
3469 * (5)
3470 * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
3471 */
3472
3153static inline void 3473static inline void
3154update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) 3474update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3155{ 3475{
3156 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3157 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; 3476 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3158 3477
3159 /* Nothing to update */ 3478 /* Nothing to update */
@@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
3169 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; 3488 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3170} 3489}
3171 3490
3172/* Take into account change of load of a child task group */
3173static inline void 3491static inline void
3174update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) 3492update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3175{ 3493{
3176 struct cfs_rq *gcfs_rq = group_cfs_rq(se); 3494 long runnable_sum = gcfs_rq->prop_runnable_sum;
3177 long delta, load = gcfs_rq->avg.load_avg; 3495 long runnable_load_avg, load_avg;
3496 s64 runnable_load_sum, load_sum;
3178 3497
3179 /* 3498 if (!runnable_sum)
3180 * If the load of group cfs_rq is null, the load of the 3499 return;
3181 * sched_entity will also be null so we can skip the formula
3182 */
3183 if (load) {
3184 long tg_load;
3185 3500
3186 /* Get tg's load and ensure tg_load > 0 */ 3501 gcfs_rq->prop_runnable_sum = 0;
3187 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
3188 3502
3189 /* Ensure tg_load >= load and updated with current load*/ 3503 load_sum = (s64)se_weight(se) * runnable_sum;
3190 tg_load -= gcfs_rq->tg_load_avg_contrib; 3504 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3191 tg_load += load;
3192 3505
3193 /* 3506 add_positive(&se->avg.load_sum, runnable_sum);
3194 * We need to compute a correction term in the case that the 3507 add_positive(&se->avg.load_avg, load_avg);
3195 * task group is consuming more CPU than a task of equal
3196 * weight. A task with a weight equals to tg->shares will have
3197 * a load less or equal to scale_load_down(tg->shares).
3198 * Similarly, the sched_entities that represent the task group
3199 * at parent level, can't have a load higher than
3200 * scale_load_down(tg->shares). And the Sum of sched_entities'
3201 * load must be <= scale_load_down(tg->shares).
3202 */
3203 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
3204 /* scale gcfs_rq's load into tg's shares*/
3205 load *= scale_load_down(gcfs_rq->tg->shares);
3206 load /= tg_load;
3207 }
3208 }
3209 3508
3210 delta = load - se->avg.load_avg; 3509 add_positive(&cfs_rq->avg.load_avg, load_avg);
3510 add_positive(&cfs_rq->avg.load_sum, load_sum);
3211 3511
3212 /* Nothing to update */ 3512 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3213 if (!delta) 3513 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3214 return;
3215
3216 /* Set new sched_entity's load */
3217 se->avg.load_avg = load;
3218 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
3219 3514
3220 /* Update parent cfs_rq load */ 3515 add_positive(&se->avg.runnable_load_sum, runnable_sum);
3221 add_positive(&cfs_rq->avg.load_avg, delta); 3516 add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
3222 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
3223 3517
3224 /*
3225 * If the sched_entity is already enqueued, we also have to update the
3226 * runnable load avg.
3227 */
3228 if (se->on_rq) { 3518 if (se->on_rq) {
3229 /* Update parent cfs_rq runnable_load_avg */ 3519 add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
3230 add_positive(&cfs_rq->runnable_load_avg, delta); 3520 add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
3231 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
3232 } 3521 }
3233} 3522}
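A standalone sketch of the differential propagation (5) as update_tg_cfs_runnable() above applies it, with invented numbers: the child's accumulated runnable_sum change is scaled by the group entity's weight and runnable weight before being added at the parent level.

#include <stdio.h>

#define LOAD_AVG_MAX 47742

int main(void)
{
	long runnable_sum = 20000;	/* gcfs_rq->prop_runnable_sum              */
	long se_weight = 512;		/* weight of the group entity              */
	long se_runnable = 256;		/* runnable weight of the group entity     */

	long load_sum = se_weight * runnable_sum;
	long load_avg = load_sum / LOAD_AVG_MAX;	/* ~214, added to se and cfs_rq */

	long runnable_load_sum = se_runnable * runnable_sum;
	long runnable_load_avg = runnable_load_sum / LOAD_AVG_MAX;	/* ~107 */

	printf("load_avg += %ld, runnable_load_avg += %ld\n",
	       load_avg, runnable_load_avg);
	return 0;
}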
3234 3523
3235static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) 3524static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3236{
3237 cfs_rq->propagate_avg = 1;
3238}
3239
3240static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
3241{ 3525{
3242 struct cfs_rq *cfs_rq = group_cfs_rq(se); 3526 cfs_rq->propagate = 1;
3243 3527 cfs_rq->prop_runnable_sum += runnable_sum;
3244 if (!cfs_rq->propagate_avg)
3245 return 0;
3246
3247 cfs_rq->propagate_avg = 0;
3248 return 1;
3249} 3528}
3250 3529
3251/* Update task and its cfs_rq load average */ 3530/* Update task and its cfs_rq load average */
3252static inline int propagate_entity_load_avg(struct sched_entity *se) 3531static inline int propagate_entity_load_avg(struct sched_entity *se)
3253{ 3532{
3254 struct cfs_rq *cfs_rq; 3533 struct cfs_rq *cfs_rq, *gcfs_rq;
3255 3534
3256 if (entity_is_task(se)) 3535 if (entity_is_task(se))
3257 return 0; 3536 return 0;
3258 3537
3259 if (!test_and_clear_tg_cfs_propagate(se)) 3538 gcfs_rq = group_cfs_rq(se);
3539 if (!gcfs_rq->propagate)
3260 return 0; 3540 return 0;
3261 3541
3542 gcfs_rq->propagate = 0;
3543
3262 cfs_rq = cfs_rq_of(se); 3544 cfs_rq = cfs_rq_of(se);
3263 3545
3264 set_tg_cfs_propagate(cfs_rq); 3546 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3265 3547
3266 update_tg_cfs_util(cfs_rq, se); 3548 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3267 update_tg_cfs_load(cfs_rq, se); 3549 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3268 3550
3269 return 1; 3551 return 1;
3270} 3552}
@@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
3288 * If there is a pending propagation, we have to update the load and 3570 * If there is a pending propagation, we have to update the load and
3289 * the utilization of the sched_entity: 3571 * the utilization of the sched_entity:
3290 */ 3572 */
3291 if (gcfs_rq->propagate_avg) 3573 if (gcfs_rq->propagate)
3292 return false; 3574 return false;
3293 3575
3294 /* 3576 /*
@@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3308 return 0; 3590 return 0;
3309} 3591}
3310 3592
3311static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} 3593static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3312 3594
3313#endif /* CONFIG_FAIR_GROUP_SCHED */ 3595#endif /* CONFIG_FAIR_GROUP_SCHED */
3314 3596
3315/*
3316 * Unsigned subtract and clamp on underflow.
3317 *
3318 * Explicitly do a load-store to ensure the intermediate value never hits
3319 * memory. This allows lockless observations without ever seeing the negative
3320 * values.
3321 */
3322#define sub_positive(_ptr, _val) do { \
3323 typeof(_ptr) ptr = (_ptr); \
3324 typeof(*ptr) val = (_val); \
3325 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3326 res = var - val; \
3327 if (res > var) \
3328 res = 0; \
3329 WRITE_ONCE(*ptr, res); \
3330} while (0)
3331
3332/** 3597/**
3333 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages 3598 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3334 * @now: current time, as per cfs_rq_clock_task() 3599 * @now: current time, as per cfs_rq_clock_task()
@@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
3348static inline int 3613static inline int
3349update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 3614update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3350{ 3615{
3616 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3351 struct sched_avg *sa = &cfs_rq->avg; 3617 struct sched_avg *sa = &cfs_rq->avg;
3352 int decayed, removed_load = 0, removed_util = 0; 3618 int decayed = 0;
3353 3619
3354 if (atomic_long_read(&cfs_rq->removed_load_avg)) { 3620 if (cfs_rq->removed.nr) {
3355 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); 3621 unsigned long r;
3622 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3623
3624 raw_spin_lock(&cfs_rq->removed.lock);
3625 swap(cfs_rq->removed.util_avg, removed_util);
3626 swap(cfs_rq->removed.load_avg, removed_load);
3627 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3628 cfs_rq->removed.nr = 0;
3629 raw_spin_unlock(&cfs_rq->removed.lock);
3630
3631 r = removed_load;
3356 sub_positive(&sa->load_avg, r); 3632 sub_positive(&sa->load_avg, r);
3357 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); 3633 sub_positive(&sa->load_sum, r * divider);
3358 removed_load = 1;
3359 set_tg_cfs_propagate(cfs_rq);
3360 }
3361 3634
3362 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 3635 r = removed_util;
3363 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
3364 sub_positive(&sa->util_avg, r); 3636 sub_positive(&sa->util_avg, r);
3365 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); 3637 sub_positive(&sa->util_sum, r * divider);
3366 removed_util = 1; 3638
3367 set_tg_cfs_propagate(cfs_rq); 3639 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3640
3641 decayed = 1;
3368 } 3642 }
3369 3643
3370 decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); 3644 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
3371 3645
3372#ifndef CONFIG_64BIT 3646#ifndef CONFIG_64BIT
3373 smp_wmb(); 3647 smp_wmb();
3374 cfs_rq->load_last_update_time_copy = sa->last_update_time; 3648 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3375#endif 3649#endif
3376 3650
3377 if (decayed || removed_util) 3651 if (decayed)
3378 cfs_rq_util_change(cfs_rq); 3652 cfs_rq_util_change(cfs_rq);
3379 3653
3380 return decayed || removed_load; 3654 return decayed;
3381}
3382
3383/*
3384 * Optional action to be done while updating the load average
3385 */
3386#define UPDATE_TG 0x1
3387#define SKIP_AGE_LOAD 0x2
3388
3389/* Update task and its cfs_rq load average */
3390static inline void update_load_avg(struct sched_entity *se, int flags)
3391{
3392 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3393 u64 now = cfs_rq_clock_task(cfs_rq);
3394 struct rq *rq = rq_of(cfs_rq);
3395 int cpu = cpu_of(rq);
3396 int decayed;
3397
3398 /*
3399 * Track task load average for carrying it to new CPU after migrated, and
3400 * track group sched_entity load average for task_h_load calc in migration
3401 */
3402 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3403 __update_load_avg_se(now, cpu, cfs_rq, se);
3404
3405 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3406 decayed |= propagate_entity_load_avg(se);
3407
3408 if (decayed && (flags & UPDATE_TG))
3409 update_tg_load_avg(cfs_rq, 0);
3410} 3655}
3411 3656
3412/** 3657/**
@@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
3419 */ 3664 */
3420static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3665static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3421{ 3666{
3667 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3668
3669 /*
3670 * When we attach the @se to the @cfs_rq, we must align the decay
3671 * window because without that, really weird and wonderful things can
3672 * happen.
3673 *
3674 * XXX illustrate
3675 */
3422 se->avg.last_update_time = cfs_rq->avg.last_update_time; 3676 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3423 cfs_rq->avg.load_avg += se->avg.load_avg; 3677 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3424 cfs_rq->avg.load_sum += se->avg.load_sum; 3678
3679 /*
3680 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3681 * period_contrib. This isn't strictly correct, but since we're
3682 * entirely outside of the PELT hierarchy, nobody cares if we truncate
3683 * _sum a little.
3684 */
3685 se->avg.util_sum = se->avg.util_avg * divider;
3686
3687 se->avg.load_sum = divider;
3688 if (se_weight(se)) {
3689 se->avg.load_sum =
3690 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3691 }
3692
3693 se->avg.runnable_load_sum = se->avg.load_sum;
3694
3695 enqueue_load_avg(cfs_rq, se);
3425 cfs_rq->avg.util_avg += se->avg.util_avg; 3696 cfs_rq->avg.util_avg += se->avg.util_avg;
3426 cfs_rq->avg.util_sum += se->avg.util_sum; 3697 cfs_rq->avg.util_sum += se->avg.util_sum;
3427 set_tg_cfs_propagate(cfs_rq); 3698
3699 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3428 3700
3429 cfs_rq_util_change(cfs_rq); 3701 cfs_rq_util_change(cfs_rq);
3430} 3702}
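A standalone sketch of the _sum reconstruction above, with invented numbers: load_avg is carried over unchanged and load_sum is rebuilt against the destination cfs_rq's decay window, so that Step 2 later recovers roughly the same average.

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_MAX 47742

int main(void)
{
	unsigned long load_avg = 300;	/* preserved across the attach       */
	unsigned long weight = 1024;	/* se_weight(se)                     */
	uint32_t period_contrib = 200;	/* copied from the cfs_rq            */

	uint32_t divider = LOAD_AVG_MAX - 1024 + period_contrib;
	uint64_t load_sum = (uint64_t)load_avg * divider / weight;

	printf("load_sum = %llu, recovered avg ~= %llu\n",
	       (unsigned long long)load_sum,
	       (unsigned long long)(weight * load_sum / divider));
	return 0;
}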
@@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3439 */ 3711 */
3440static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3712static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3441{ 3713{
3442 3714 dequeue_load_avg(cfs_rq, se);
3443 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3444 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3445 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3715 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3446 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3716 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3447 set_tg_cfs_propagate(cfs_rq); 3717
3718 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3448 3719
3449 cfs_rq_util_change(cfs_rq); 3720 cfs_rq_util_change(cfs_rq);
3450} 3721}
3451 3722
3452/* Add the load generated by se into cfs_rq's load average */ 3723/*
3453static inline void 3724 * Optional action to be done while updating the load average
3454enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3725 */
3726#define UPDATE_TG 0x1
3727#define SKIP_AGE_LOAD 0x2
3728#define DO_ATTACH 0x4
3729
3730/* Update task and its cfs_rq load average */
3731static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3455{ 3732{
3456 struct sched_avg *sa = &se->avg; 3733 u64 now = cfs_rq_clock_task(cfs_rq);
3734 struct rq *rq = rq_of(cfs_rq);
3735 int cpu = cpu_of(rq);
3736 int decayed;
3737
3738 /*
3739 * Track task load average for carrying it to new CPU after migrated, and
3740 * track group sched_entity load average for task_h_load calc in migration
3741 */
3742 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3743 __update_load_avg_se(now, cpu, cfs_rq, se);
3457 3744
3458 cfs_rq->runnable_load_avg += sa->load_avg; 3745 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3459 cfs_rq->runnable_load_sum += sa->load_sum; 3746 decayed |= propagate_entity_load_avg(se);
3747
3748 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3460 3749
3461 if (!sa->last_update_time) {
3462 attach_entity_load_avg(cfs_rq, se); 3750 attach_entity_load_avg(cfs_rq, se);
3463 update_tg_load_avg(cfs_rq, 0); 3751 update_tg_load_avg(cfs_rq, 0);
3464 }
3465}
3466 3752
3467/* Remove the runnable load generated by se from cfs_rq's runnable load average */ 3753 } else if (decayed && (flags & UPDATE_TG))
3468static inline void 3754 update_tg_load_avg(cfs_rq, 0);
3469dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3470{
3471 cfs_rq->runnable_load_avg =
3472 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3473 cfs_rq->runnable_load_sum =
3474 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3475} 3755}
3476 3756
3477#ifndef CONFIG_64BIT 3757#ifndef CONFIG_64BIT
@@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
3515void remove_entity_load_avg(struct sched_entity *se) 3795void remove_entity_load_avg(struct sched_entity *se)
3516{ 3796{
3517 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3797 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3798 unsigned long flags;
3518 3799
3519 /* 3800 /*
3520 * tasks cannot exit without having gone through wake_up_new_task() -> 3801 * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
3527 */ 3808 */
3528 3809
3529 sync_entity_load_avg(se); 3810 sync_entity_load_avg(se);
3530 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); 3811
3531 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); 3812 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3813 ++cfs_rq->removed.nr;
3814 cfs_rq->removed.util_avg += se->avg.util_avg;
3815 cfs_rq->removed.load_avg += se->avg.load_avg;
3816 cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3817 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3532} 3818}
3533 3819
3534static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) 3820static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3535{ 3821{
3536 return cfs_rq->runnable_load_avg; 3822 return cfs_rq->avg.runnable_load_avg;
3537} 3823}
3538 3824
3539static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) 3825static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3553 3839
3554#define UPDATE_TG 0x0 3840#define UPDATE_TG 0x0
3555#define SKIP_AGE_LOAD 0x0 3841#define SKIP_AGE_LOAD 0x0
3842#define DO_ATTACH 0x0
3556 3843
3557static inline void update_load_avg(struct sched_entity *se, int not_used1) 3844static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3558{ 3845{
3559 cfs_rq_util_change(cfs_rq_of(se)); 3846 cfs_rq_util_change(cfs_rq);
3560} 3847}
3561 3848
3562static inline void
3563enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3564static inline void
3565dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3566static inline void remove_entity_load_avg(struct sched_entity *se) {} 3849static inline void remove_entity_load_avg(struct sched_entity *se) {}
3567 3850
3568static inline void 3851static inline void
@@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3707 * its group cfs_rq 3990 * its group cfs_rq
3708 * - Add its new weight to cfs_rq->load.weight 3991 * - Add its new weight to cfs_rq->load.weight
3709 */ 3992 */
3710 update_load_avg(se, UPDATE_TG); 3993 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
3711 enqueue_entity_load_avg(cfs_rq, se); 3994 update_cfs_group(se);
3712 update_cfs_shares(se); 3995 enqueue_runnable_load_avg(cfs_rq, se);
3713 account_entity_enqueue(cfs_rq, se); 3996 account_entity_enqueue(cfs_rq, se);
3714 3997
3715 if (flags & ENQUEUE_WAKEUP) 3998 if (flags & ENQUEUE_WAKEUP)
@@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3791 * - For group entity, update its weight to reflect the new share 4074 * - For group entity, update its weight to reflect the new share
3792 * of its group cfs_rq. 4075 * of its group cfs_rq.
3793 */ 4076 */
3794 update_load_avg(se, UPDATE_TG); 4077 update_load_avg(cfs_rq, se, UPDATE_TG);
3795 dequeue_entity_load_avg(cfs_rq, se); 4078 dequeue_runnable_load_avg(cfs_rq, se);
3796 4079
3797 update_stats_dequeue(cfs_rq, se, flags); 4080 update_stats_dequeue(cfs_rq, se, flags);
3798 4081
@@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3815 /* return excess runtime on last dequeue */ 4098 /* return excess runtime on last dequeue */
3816 return_cfs_rq_runtime(cfs_rq); 4099 return_cfs_rq_runtime(cfs_rq);
3817 4100
3818 update_cfs_shares(se); 4101 update_cfs_group(se);
3819 4102
3820 /* 4103 /*
3821 * Now advance min_vruntime if @se was the entity holding it back, 4104 * Now advance min_vruntime if @se was the entity holding it back,
@@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3879 */ 4162 */
3880 update_stats_wait_end(cfs_rq, se); 4163 update_stats_wait_end(cfs_rq, se);
3881 __dequeue_entity(cfs_rq, se); 4164 __dequeue_entity(cfs_rq, se);
3882 update_load_avg(se, UPDATE_TG); 4165 update_load_avg(cfs_rq, se, UPDATE_TG);
3883 } 4166 }
3884 4167
3885 update_stats_curr_start(cfs_rq, se); 4168 update_stats_curr_start(cfs_rq, se);
@@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3981 /* Put 'current' back into the tree. */ 4264 /* Put 'current' back into the tree. */
3982 __enqueue_entity(cfs_rq, prev); 4265 __enqueue_entity(cfs_rq, prev);
3983 /* in !on_rq case, update occurred at dequeue */ 4266 /* in !on_rq case, update occurred at dequeue */
3984 update_load_avg(prev, 0); 4267 update_load_avg(cfs_rq, prev, 0);
3985 } 4268 }
3986 cfs_rq->curr = NULL; 4269 cfs_rq->curr = NULL;
3987} 4270}
@@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3997 /* 4280 /*
3998 * Ensure that runnable average is periodically updated. 4281 * Ensure that runnable average is periodically updated.
3999 */ 4282 */
4000 update_load_avg(curr, UPDATE_TG); 4283 update_load_avg(cfs_rq, curr, UPDATE_TG);
4001 update_cfs_shares(curr); 4284 update_cfs_group(curr);
4002 4285
4003#ifdef CONFIG_SCHED_HRTICK 4286#ifdef CONFIG_SCHED_HRTICK
4004 /* 4287 /*
@@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4915 if (cfs_rq_throttled(cfs_rq)) 5198 if (cfs_rq_throttled(cfs_rq))
4916 break; 5199 break;
4917 5200
4918 update_load_avg(se, UPDATE_TG); 5201 update_load_avg(cfs_rq, se, UPDATE_TG);
4919 update_cfs_shares(se); 5202 update_cfs_group(se);
4920 } 5203 }
4921 5204
4922 if (!se) 5205 if (!se)
@@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4974 if (cfs_rq_throttled(cfs_rq)) 5257 if (cfs_rq_throttled(cfs_rq))
4975 break; 5258 break;
4976 5259
4977 update_load_avg(se, UPDATE_TG); 5260 update_load_avg(cfs_rq, se, UPDATE_TG);
4978 update_cfs_shares(se); 5261 update_cfs_group(se);
4979 } 5262 }
4980 5263
4981 if (!se) 5264 if (!se)
@@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5449/* 5732/*
5450 * find_idlest_group finds and returns the least busy CPU group within the 5733 * find_idlest_group finds and returns the least busy CPU group within the
5451 * domain. 5734 * domain.
5735 *
5736 * Assumes p is allowed on at least one CPU in sd.
5452 */ 5737 */
5453static struct sched_group * 5738static struct sched_group *
5454find_idlest_group(struct sched_domain *sd, struct task_struct *p, 5739find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5456{ 5741{
5457 struct sched_group *idlest = NULL, *group = sd->groups; 5742 struct sched_group *idlest = NULL, *group = sd->groups;
5458 struct sched_group *most_spare_sg = NULL; 5743 struct sched_group *most_spare_sg = NULL;
5459 unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; 5744 unsigned long min_runnable_load = ULONG_MAX;
5460 unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; 5745 unsigned long this_runnable_load = ULONG_MAX;
5746 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5461 unsigned long most_spare = 0, this_spare = 0; 5747 unsigned long most_spare = 0, this_spare = 0;
5462 int load_idx = sd->forkexec_idx; 5748 int load_idx = sd->forkexec_idx;
5463 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5749 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
@@ -5578,10 +5864,10 @@ skip_spare:
5578} 5864}
5579 5865
5580/* 5866/*
5581 * find_idlest_cpu - find the idlest cpu among the cpus in group. 5867 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
5582 */ 5868 */
5583static int 5869static int
5584find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5870find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5585{ 5871{
5586 unsigned long load, min_load = ULONG_MAX; 5872 unsigned long load, min_load = ULONG_MAX;
5587 unsigned int min_exit_latency = UINT_MAX; 5873 unsigned int min_exit_latency = UINT_MAX;
@@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5630 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; 5916 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
5631} 5917}
5632 5918
5919static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5920 int cpu, int prev_cpu, int sd_flag)
5921{
5922 int new_cpu = cpu;
5923
5924 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
5925 return prev_cpu;
5926
5927 while (sd) {
5928 struct sched_group *group;
5929 struct sched_domain *tmp;
5930 int weight;
5931
5932 if (!(sd->flags & sd_flag)) {
5933 sd = sd->child;
5934 continue;
5935 }
5936
5937 group = find_idlest_group(sd, p, cpu, sd_flag);
5938 if (!group) {
5939 sd = sd->child;
5940 continue;
5941 }
5942
5943 new_cpu = find_idlest_group_cpu(group, p, cpu);
5944 if (new_cpu == cpu) {
5945 /* Now try balancing at a lower domain level of cpu */
5946 sd = sd->child;
5947 continue;
5948 }
5949
5950 /* Now try balancing at a lower domain level of new_cpu */
5951 cpu = new_cpu;
5952 weight = sd->span_weight;
5953 sd = NULL;
5954 for_each_domain(cpu, tmp) {
5955 if (weight <= tmp->span_weight)
5956 break;
5957 if (tmp->flags & sd_flag)
5958 sd = tmp;
5959 }
5960 /* while loop will break here if sd == NULL */
5961 }
5962
5963 return new_cpu;
5964}
5965
5633#ifdef CONFIG_SCHED_SMT 5966#ifdef CONFIG_SCHED_SMT
5634 5967
5635static inline void set_idle_cores(int cpu, int val) 5968static inline void set_idle_cores(int cpu, int val)
@@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5982 new_cpu = cpu; 6315 new_cpu = cpu;
5983 } 6316 }
5984 6317
6318 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
6319 /*
6320 * We're going to need the task's util for capacity_spare_wake
6321 * in find_idlest_group. Sync it up to prev_cpu's
6322 * last_update_time.
6323 */
6324 sync_entity_load_avg(&p->se);
6325 }
6326
5985 if (!sd) { 6327 if (!sd) {
5986 pick_cpu: 6328pick_cpu:
5987 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 6329 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5988 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); 6330 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
5989 6331
5990 } else while (sd) { 6332 } else {
5991 struct sched_group *group; 6333 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
5992 int weight;
5993
5994 if (!(sd->flags & sd_flag)) {
5995 sd = sd->child;
5996 continue;
5997 }
5998
5999 group = find_idlest_group(sd, p, cpu, sd_flag);
6000 if (!group) {
6001 sd = sd->child;
6002 continue;
6003 }
6004
6005 new_cpu = find_idlest_cpu(group, p, cpu);
6006 if (new_cpu == -1 || new_cpu == cpu) {
6007 /* Now try balancing at a lower domain level of cpu */
6008 sd = sd->child;
6009 continue;
6010 }
6011
6012 /* Now try balancing at a lower domain level of new_cpu */
6013 cpu = new_cpu;
6014 weight = sd->span_weight;
6015 sd = NULL;
6016 for_each_domain(cpu, tmp) {
6017 if (weight <= tmp->span_weight)
6018 break;
6019 if (tmp->flags & sd_flag)
6020 sd = tmp;
6021 }
6022 /* while loop will break here if sd == NULL */
6023 } 6334 }
6024 rcu_read_unlock(); 6335 rcu_read_unlock();
6025 6336
6026 return new_cpu; 6337 return new_cpu;
6027} 6338}
6028 6339
6340static void detach_entity_cfs_rq(struct sched_entity *se);
6341
6029/* 6342/*
6030 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6343 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
6031 * cfs_rq_of(p) references at time of call are still valid and identify the 6344 * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
6059 se->vruntime -= min_vruntime; 6372 se->vruntime -= min_vruntime;
6060 } 6373 }
6061 6374
6062 /* 6375 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6063 * We are supposed to update the task to "current" time, then its up to date 6376 /*
6064 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting 6377 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6065 * what current time is, so simply throw away the out-of-date time. This 6378 * rq->lock and can modify state directly.
6066 * will result in the wakee task is less decayed, but giving the wakee more 6379 */
6067 * load sounds not bad. 6380 lockdep_assert_held(&task_rq(p)->lock);
6068 */ 6381 detach_entity_cfs_rq(&p->se);
6069 remove_entity_load_avg(&p->se); 6382
6383 } else {
6384 /*
6385 * We are supposed to update the task to "current" time, then
6386 * it's up to date and ready to go to new CPU/cfs_rq. But we
6387 * have difficulty in getting what current time is, so simply
6388 * throw away the out-of-date time. This will result in the
6389 * wakee task being less decayed, but giving the wakee more load
6390 * does not sound bad.
6391 */
6392 remove_entity_load_avg(&p->se);
6393 }
6070 6394
6071 /* Tell new CPU we are migrated */ 6395 /* Tell new CPU we are migrated */
6072 p->se.avg.last_update_time = 0; 6396 p->se.avg.last_update_time = 0;
@@ -6334,10 +6658,7 @@ again:
6334 set_next_entity(cfs_rq, se); 6658 set_next_entity(cfs_rq, se);
6335 } 6659 }
6336 6660
6337 if (hrtick_enabled(rq)) 6661 goto done;
6338 hrtick_start_fair(rq, p);
6339
6340 return p;
6341simple: 6662simple:
6342#endif 6663#endif
6343 6664
@@ -6351,6 +6672,16 @@ simple:
6351 6672
6352 p = task_of(se); 6673 p = task_of(se);
6353 6674
6675done: __maybe_unused
6676#ifdef CONFIG_SMP
6677 /*
6678 * Move the next running task to the front of
6679 * the list, so our cfs_tasks list becomes MRU
6680 * one.
6681 */
6682 list_move(&p->se.group_node, &rq->cfs_tasks);
6683#endif
6684
6354 if (hrtick_enabled(rq)) 6685 if (hrtick_enabled(rq))
6355 hrtick_start_fair(rq, p); 6686 hrtick_start_fair(rq, p);
6356 6687
@@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
6786 */ 7117 */
6787static struct task_struct *detach_one_task(struct lb_env *env) 7118static struct task_struct *detach_one_task(struct lb_env *env)
6788{ 7119{
6789 struct task_struct *p, *n; 7120 struct task_struct *p;
6790 7121
6791 lockdep_assert_held(&env->src_rq->lock); 7122 lockdep_assert_held(&env->src_rq->lock);
6792 7123
6793 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 7124 list_for_each_entry_reverse(p,
7125 &env->src_rq->cfs_tasks, se.group_node) {
6794 if (!can_migrate_task(p, env)) 7126 if (!can_migrate_task(p, env))
6795 continue; 7127 continue;
6796 7128
@@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
6836 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) 7168 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
6837 break; 7169 break;
6838 7170
6839 p = list_first_entry(tasks, struct task_struct, se.group_node); 7171 p = list_last_entry(tasks, struct task_struct, se.group_node);
6840 7172
6841 env->loop++; 7173 env->loop++;
6842 /* We've more or less seen every task there is, call it quits */ 7174 /* We've more or less seen every task there is, call it quits */
@@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
6886 7218
6887 continue; 7219 continue;
6888next: 7220next:
6889 list_move_tail(&p->se.group_node, tasks); 7221 list_move(&p->se.group_node, tasks);
6890 } 7222 }
6891 7223
6892 /* 7224 /*
@@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
6962 if (cfs_rq->avg.util_sum) 7294 if (cfs_rq->avg.util_sum)
6963 return false; 7295 return false;
6964 7296
6965 if (cfs_rq->runnable_load_sum) 7297 if (cfs_rq->avg.runnable_load_sum)
6966 return false; 7298 return false;
6967 7299
6968 return true; 7300 return true;
@@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu)
6994 /* Propagate pending load changes to the parent, if any: */ 7326 /* Propagate pending load changes to the parent, if any: */
6995 se = cfs_rq->tg->se[cpu]; 7327 se = cfs_rq->tg->se[cpu];
6996 if (se && !skip_blocked_update(se)) 7328 if (se && !skip_blocked_update(se))
6997 update_load_avg(se, 0); 7329 update_load_avg(cfs_rq_of(se), se, 0);
6998 7330
6999 /* 7331 /*
7000 * There can be a lot of idle CPU cgroups. Don't let fully 7332 * There can be a lot of idle CPU cgroups. Don't let fully
@@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
7875 if (busiest->group_type == group_imbalanced) 8207 if (busiest->group_type == group_imbalanced)
7876 goto force_balance; 8208 goto force_balance;
7877 8209
7878 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 8210 /*
7879 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && 8211 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8212 * capacities from resulting in underutilization due to avg_load.
8213 */
8214 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
7880 busiest->group_no_capacity) 8215 busiest->group_no_capacity)
7881 goto force_balance; 8216 goto force_balance;
7882 8217
@@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
8693 return; 9028 return;
8694 9029
8695 /* Spare idle load balancing on CPUs that don't want to be disturbed: */ 9030 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
8696 if (!is_housekeeping_cpu(cpu)) 9031 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
8697 return; 9032 return;
8698 9033
8699 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 9034 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
@@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
9158 if (cfs_rq_throttled(cfs_rq)) 9493 if (cfs_rq_throttled(cfs_rq))
9159 break; 9494 break;
9160 9495
9161 update_load_avg(se, UPDATE_TG); 9496 update_load_avg(cfs_rq, se, UPDATE_TG);
9162 } 9497 }
9163} 9498}
9164#else 9499#else
@@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
9170 struct cfs_rq *cfs_rq = cfs_rq_of(se); 9505 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9171 9506
9172 /* Catch up with the cfs_rq and remove our load when we leave */ 9507 /* Catch up with the cfs_rq and remove our load when we leave */
9173 update_load_avg(se, 0); 9508 update_load_avg(cfs_rq, se, 0);
9174 detach_entity_load_avg(cfs_rq, se); 9509 detach_entity_load_avg(cfs_rq, se);
9175 update_tg_load_avg(cfs_rq, false); 9510 update_tg_load_avg(cfs_rq, false);
9176 propagate_entity_cfs_rq(se); 9511 propagate_entity_cfs_rq(se);
@@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
9189#endif 9524#endif
9190 9525
9191 /* Synchronize entity with its cfs_rq */ 9526 /* Synchronize entity with its cfs_rq */
9192 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); 9527 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
9193 attach_entity_load_avg(cfs_rq, se); 9528 attach_entity_load_avg(cfs_rq, se);
9194 update_tg_load_avg(cfs_rq, false); 9529 update_tg_load_avg(cfs_rq, false);
9195 propagate_entity_cfs_rq(se); 9530 propagate_entity_cfs_rq(se);
@@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
9271 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 9606 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9272#endif 9607#endif
9273#ifdef CONFIG_SMP 9608#ifdef CONFIG_SMP
9274#ifdef CONFIG_FAIR_GROUP_SCHED 9609 raw_spin_lock_init(&cfs_rq->removed.lock);
9275 cfs_rq->propagate_avg = 0;
9276#endif
9277 atomic_long_set(&cfs_rq->removed_load_avg, 0);
9278 atomic_long_set(&cfs_rq->removed_util_avg, 0);
9279#endif 9610#endif
9280} 9611}
9281 9612
@@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9473 rq_lock_irqsave(rq, &rf); 9804 rq_lock_irqsave(rq, &rf);
9474 update_rq_clock(rq); 9805 update_rq_clock(rq);
9475 for_each_sched_entity(se) { 9806 for_each_sched_entity(se) {
9476 update_load_avg(se, UPDATE_TG); 9807 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9477 update_cfs_shares(se); 9808 update_cfs_group(se);
9478 } 9809 }
9479 rq_unlock_irqrestore(rq, &rf); 9810 rq_unlock_irqrestore(rq, &rf);
9480 } 9811 }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 257f4f0b4532..7dae9eb8c042 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -209,6 +209,7 @@ exit_idle:
209 */ 209 */
210static void do_idle(void) 210static void do_idle(void)
211{ 211{
212 int cpu = smp_processor_id();
212 /* 213 /*
213 * If the arch has a polling bit, we maintain an invariant: 214 * If the arch has a polling bit, we maintain an invariant:
214 * 215 *
@@ -219,14 +220,13 @@ static void do_idle(void)
219 */ 220 */
220 221
221 __current_set_polling(); 222 __current_set_polling();
222 quiet_vmstat();
223 tick_nohz_idle_enter(); 223 tick_nohz_idle_enter();
224 224
225 while (!need_resched()) { 225 while (!need_resched()) {
226 check_pgt_cache(); 226 check_pgt_cache();
227 rmb(); 227 rmb();
228 228
229 if (cpu_is_offline(smp_processor_id())) { 229 if (cpu_is_offline(cpu)) {
230 cpuhp_report_idle_dead(); 230 cpuhp_report_idle_dead();
231 arch_cpu_idle_dead(); 231 arch_cpu_idle_dead();
232 } 232 }
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000000..b71b436f59f2
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,155 @@
1/*
2 * Housekeeping management. Manage the targets for routine code that can run on
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
4 *
5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
6 *
7 */
8
9#include <linux/sched/isolation.h>
10#include <linux/tick.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/static_key.h>
14#include <linux/ctype.h>
15
16DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
17EXPORT_SYMBOL_GPL(housekeeping_overriden);
18static cpumask_var_t housekeeping_mask;
19static unsigned int housekeeping_flags;
20
21int housekeeping_any_cpu(enum hk_flags flags)
22{
23 if (static_branch_unlikely(&housekeeping_overriden))
24 if (housekeeping_flags & flags)
25 return cpumask_any_and(housekeeping_mask, cpu_online_mask);
26 return smp_processor_id();
27}
28EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
29
30const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
31{
32 if (static_branch_unlikely(&housekeeping_overriden))
33 if (housekeeping_flags & flags)
34 return housekeeping_mask;
35 return cpu_possible_mask;
36}
37EXPORT_SYMBOL_GPL(housekeeping_cpumask);
38
39void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
40{
41 if (static_branch_unlikely(&housekeeping_overriden))
42 if (housekeeping_flags & flags)
43 set_cpus_allowed_ptr(t, housekeeping_mask);
44}
45EXPORT_SYMBOL_GPL(housekeeping_affine);
46
47bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
48{
49 if (static_branch_unlikely(&housekeeping_overriden))
50 if (housekeeping_flags & flags)
51 return cpumask_test_cpu(cpu, housekeeping_mask);
52 return true;
53}
54EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
55
56void __init housekeeping_init(void)
57{
58 if (!housekeeping_flags)
59 return;
60
61 static_branch_enable(&housekeeping_overriden);
62
63 /* We need at least one CPU to handle housekeeping work */
64 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
65}
66
67static int __init housekeeping_setup(char *str, enum hk_flags flags)
68{
69 cpumask_var_t non_housekeeping_mask;
70 int err;
71
72 alloc_bootmem_cpumask_var(&non_housekeeping_mask);
73 err = cpulist_parse(str, non_housekeeping_mask);
74 if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
75 pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
76 free_bootmem_cpumask_var(non_housekeeping_mask);
77 return 0;
78 }
79
80 if (!housekeeping_flags) {
81 alloc_bootmem_cpumask_var(&housekeeping_mask);
82 cpumask_andnot(housekeeping_mask,
83 cpu_possible_mask, non_housekeeping_mask);
84 if (cpumask_empty(housekeeping_mask))
85 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
86 } else {
87 cpumask_var_t tmp;
88
89 alloc_bootmem_cpumask_var(&tmp);
90 cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
91 if (!cpumask_equal(tmp, housekeeping_mask)) {
92 pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
93 free_bootmem_cpumask_var(tmp);
94 free_bootmem_cpumask_var(non_housekeeping_mask);
95 return 0;
96 }
97 free_bootmem_cpumask_var(tmp);
98 }
99
100 if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
101 if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
102 tick_nohz_full_setup(non_housekeeping_mask);
103 } else {
104 pr_warn("Housekeeping: nohz unsupported."
105 " Build with CONFIG_NO_HZ_FULL\n");
106 free_bootmem_cpumask_var(non_housekeeping_mask);
107 return 0;
108 }
109 }
110
111 housekeeping_flags |= flags;
112
113 free_bootmem_cpumask_var(non_housekeeping_mask);
114
115 return 1;
116}
117
118static int __init housekeeping_nohz_full_setup(char *str)
119{
120 unsigned int flags;
121
122 flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
123
124 return housekeeping_setup(str, flags);
125}
126__setup("nohz_full=", housekeeping_nohz_full_setup);
127
128static int __init housekeeping_isolcpus_setup(char *str)
129{
130 unsigned int flags = 0;
131
132 while (isalpha(*str)) {
133 if (!strncmp(str, "nohz,", 5)) {
134 str += 5;
135 flags |= HK_FLAG_TICK;
136 continue;
137 }
138
139 if (!strncmp(str, "domain,", 7)) {
140 str += 7;
141 flags |= HK_FLAG_DOMAIN;
142 continue;
143 }
144
145 pr_warn("isolcpus: Error, unknown flag\n");
146 return 0;
147 }
148
149 /* Default behaviour for isolcpus without flags */
150 if (!flags)
151 flags |= HK_FLAG_DOMAIN;
152
153 return housekeeping_setup(str, flags);
154}
155__setup("isolcpus=", housekeeping_isolcpus_setup);
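A standalone userspace sketch of the flag-prefix parsing done by housekeeping_isolcpus_setup() above; the HK_FLAG_* values here are placeholders for the illustration and the cpulist itself is left unparsed.

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#define HK_FLAG_TICK	(1 << 0)	/* placeholder bits for this sketch */
#define HK_FLAG_DOMAIN	(1 << 1)

int main(void)
{
	const char *str = "nohz,domain,2-7";	/* as in isolcpus=nohz,domain,2-7 */
	unsigned int flags = 0;

	while (isalpha((unsigned char)*str)) {
		if (!strncmp(str, "nohz,", 5)) {
			str += 5;
			flags |= HK_FLAG_TICK;
			continue;
		}
		if (!strncmp(str, "domain,", 7)) {
			str += 7;
			flags |= HK_FLAG_DOMAIN;
			continue;
		}
		printf("isolcpus: unknown flag\n");
		return 1;
	}

	if (!flags)	/* a bare "isolcpus=<list>" keeps the old domain-isolation meaning */
		flags |= HK_FLAG_DOMAIN;

	printf("flags=0x%x cpulist=\"%s\"\n", flags, str);	/* flags=0x3 cpulist="2-7" */
	return 0;
}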
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3c96c80e0992..d8c43d73e078 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
74 raw_spin_unlock(&rt_b->rt_runtime_lock); 74 raw_spin_unlock(&rt_b->rt_runtime_lock);
75} 75}
76 76
77#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
78static void push_irq_work_func(struct irq_work *work);
79#endif
80
81void init_rt_rq(struct rt_rq *rt_rq) 77void init_rt_rq(struct rt_rq *rt_rq)
82{ 78{
83 struct rt_prio_array *array; 79 struct rt_prio_array *array;
@@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
97 rt_rq->rt_nr_migratory = 0; 93 rt_rq->rt_nr_migratory = 0;
98 rt_rq->overloaded = 0; 94 rt_rq->overloaded = 0;
99 plist_head_init(&rt_rq->pushable_tasks); 95 plist_head_init(&rt_rq->pushable_tasks);
100
101#ifdef HAVE_RT_PUSH_IPI
102 rt_rq->push_flags = 0;
103 rt_rq->push_cpu = nr_cpu_ids;
104 raw_spin_lock_init(&rt_rq->push_lock);
105 init_irq_work(&rt_rq->push_work, push_irq_work_func);
106#endif
107#endif /* CONFIG_SMP */ 96#endif /* CONFIG_SMP */
108 /* We start in dequeued state, because no RT tasks are queued */ 97 /* We start in dequeued state, because no RT tasks are queued */
109 rt_rq->rt_queued = 0; 98 rt_rq->rt_queued = 0;
@@ -1876,241 +1865,166 @@ static void push_rt_tasks(struct rq *rq)
1876} 1865}
1877 1866
1878#ifdef HAVE_RT_PUSH_IPI 1867#ifdef HAVE_RT_PUSH_IPI
1868
1879/* 1869/*
1880 * The search for the next cpu always starts at rq->cpu and ends 1870 * When a high priority task schedules out from a CPU and a lower priority
1881 * when we reach rq->cpu again. It will never return rq->cpu. 1871 * task is scheduled in, a check is made to see if there's any RT tasks
1882 * This returns the next cpu to check, or nr_cpu_ids if the loop 1872 * on other CPUs that are waiting to run because a higher priority RT task
1883 * is complete. 1873 * is currently running on its CPU. In this case, the CPU with multiple RT
1874 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1875 * up that may be able to run one of its non-running queued RT tasks.
1876 *
1877 * All CPUs with overloaded RT tasks need to be notified as there is currently
1878 * no way to know which of these CPUs have the highest priority task waiting
1879 * to run. Instead of trying to take a spinlock on each of these CPUs,
1880 * which has been shown to cause large latency when done on machines with many
1881 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
1882 * RT tasks waiting to run.
1883 *
1884 * Just sending an IPI to each of the CPUs is also an issue, as on large
1885 * CPU-count machines, this can cause an IPI storm on a CPU, especially
1886 * if it's the only CPU with multiple RT tasks queued, and a large number
1887 * of CPUs scheduling a lower priority task at the same time.
1888 *
1889 * Each root domain has its own irq work function that can iterate over
1890 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1891 * tasks must be checked if there's one or many CPUs that are lowering
1892 * their priority, there's a single irq work iterator that will try to
1893 * push off RT tasks that are waiting to run.
1894 *
1895 * When a CPU schedules a lower priority task, it will kick off the
1896 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1897 * As it only takes the first CPU that schedules a lower priority task
1898 * to start the process, the rto_loop_start variable is atomically flipped
1899 * from zero to one, and only the CPU that wins that race takes the rto_lock.
1900 * This prevents high contention on the lock as the process handles all
1901 * CPUs scheduling lower priority tasks.
1902 *
1903 * All CPUs that are scheduling a lower priority task will increment the
1904 * rto_loop_next variable. This will make sure that the irq work iterator
1905 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1906 * priority task, even if the iterator is in the middle of a scan. Incrementing
1907 * the rto_loop_next will cause the iterator to perform another scan.
1884 * 1908 *
1885 * rq->rt.push_cpu holds the last cpu returned by this function,
1886 * or if this is the first instance, it must hold rq->cpu.
1887 */ 1909 */
1888static int rto_next_cpu(struct rq *rq) 1910static int rto_next_cpu(struct rq *rq)
1889{ 1911{
1890 int prev_cpu = rq->rt.push_cpu; 1912 struct root_domain *rd = rq->rd;
1913 int next;
1891 int cpu; 1914 int cpu;
1892 1915
1893 cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
1894
1895 /* 1916 /*
1896 * If the previous cpu is less than the rq's CPU, then it already 1917 * When starting the IPI RT pushing, the rto_cpu is set to -1,
1897 * passed the end of the mask, and has started from the beginning. 1918 * rto_next_cpu() will simply return the first CPU found in
1898 * We end if the next CPU is greater or equal to rq's CPU. 1919 * the rto_mask.
1920 *
1921 * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
1922 * will return the next CPU found in the rto_mask.
1923 *
1924 * If there are no more CPUs left in the rto_mask, then a check is made
1925 * against rto_loop and rto_loop_next. rto_loop is only updated with
1926 * the rto_lock held, but any CPU may increment the rto_loop_next
1927 * without any locking.
1899 */ 1928 */
1900 if (prev_cpu < rq->cpu) { 1929 for (;;) {
1901 if (cpu >= rq->cpu)
1902 return nr_cpu_ids;
1903 1930
1904 } else if (cpu >= nr_cpu_ids) { 1931 /* When rto_cpu is -1 this acts like cpumask_first() */
1905 /* 1932 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1906 * We passed the end of the mask, start at the beginning.
1907 * If the result is greater or equal to the rq's CPU, then
1908 * the loop is finished.
1909 */
1910 cpu = cpumask_first(rq->rd->rto_mask);
1911 if (cpu >= rq->cpu)
1912 return nr_cpu_ids;
1913 }
1914 rq->rt.push_cpu = cpu;
1915 1933
1916 /* Return cpu to let the caller know if the loop is finished or not */ 1934 rd->rto_cpu = cpu;
1917 return cpu;
1918}
1919 1935
1920static int find_next_push_cpu(struct rq *rq) 1936 if (cpu < nr_cpu_ids)
1921{ 1937 return cpu;
1922 struct rq *next_rq;
1923 int cpu;
1924 1938
1925 while (1) { 1939 rd->rto_cpu = -1;
1926 cpu = rto_next_cpu(rq); 1940
1927 if (cpu >= nr_cpu_ids) 1941 /*
1928 break; 1942 * ACQUIRE ensures we see the @rto_mask changes
1929 next_rq = cpu_rq(cpu); 1943 * made prior to the @next value observed.
1944 *
1945 * Matches WMB in rt_set_overload().
1946 */
1947 next = atomic_read_acquire(&rd->rto_loop_next);
1930 1948
1931 /* Make sure the next rq can push to this rq */ 1949 if (rd->rto_loop == next)
1932 if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
1933 break; 1950 break;
1951
1952 rd->rto_loop = next;
1934 } 1953 }
1935 1954
1936 return cpu; 1955 return -1;
1937} 1956}
1938 1957
1939#define RT_PUSH_IPI_EXECUTING 1 1958static inline bool rto_start_trylock(atomic_t *v)
1940#define RT_PUSH_IPI_RESTART 2 1959{
1960 return !atomic_cmpxchg_acquire(v, 0, 1);
1961}
1941 1962
1942/* 1963static inline void rto_start_unlock(atomic_t *v)
1943 * When a high priority task schedules out from a CPU and a lower priority
1944 * task is scheduled in, a check is made to see if there's any RT tasks
1945 * on other CPUs that are waiting to run because a higher priority RT task
1946 * is currently running on its CPU. In this case, the CPU with multiple RT
1947 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1948 * up that may be able to run one of its non-running queued RT tasks.
1949 *
1950 * On large CPU boxes, there's the case that several CPUs could schedule
1951 * a lower priority task at the same time, in which case it will look for
1952 * any overloaded CPUs that it could pull a task from. To do this, the runqueue
1953 * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
1954 * for a single overloaded CPU's runqueue lock can produce a large latency.
1955 * (This has actually been observed on large boxes running cyclictest).
1956 * Instead of taking the runqueue lock of the overloaded CPU, each of the
1957 * CPUs that scheduled a lower priority task simply sends an IPI to the
1958 * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
1959 * lots of contention. The overloaded CPU will look to push its non-running
1960 * RT task off, and if it does, it can then ignore the other IPIs coming
1961 * in, and just pass those IPIs off to any other overloaded CPU.
1962 *
1963 * When a CPU schedules a lower priority task, it only sends an IPI to
1964 * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
1965 * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
1966 * RT overloaded tasks, would cause 100 IPIs to go out at once.
1967 *
1968 * The overloaded RT CPU, when receiving an IPI, will try to push off its
1969 * overloaded RT tasks and then send an IPI to the next CPU that has
1970 * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
1971 * have completed. Just because a CPU may have pushed off its own overloaded
1972 * RT task does not mean it should stop sending the IPI around to other
1973 * overloaded CPUs. There may be another RT task waiting to run on one of
1974 * those CPUs that are of higher priority than the one that was just
1975 * pushed.
1976 *
1977 * An optimization that could possibly be made is to make a CPU array similar
1978 * to the cpupri array mask of all running RT tasks, but for the overloaded
1979 * case, then the IPI could be sent to only the CPU with the highest priority
1980 * RT task waiting, and that CPU could send off further IPIs to the CPU with
1981 * the next highest waiting task. Since the overloaded case is much less likely
1982 * to happen, the complexity of this implementation may not be worth it.
1983 * Instead, just send an IPI around to all overloaded CPUs.
1984 *
1985 * The rq->rt.push_flags holds the status of the IPI that is going around.
1986 * A run queue can only send out a single IPI at a time. The possible flags
1987 * for rq->rt.push_flags are:
1988 *
1989 * (None or zero): No IPI is going around for the current rq
1990 * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
1991 * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
1992 * has changed, and the IPI should restart
1993 * circulating the overloaded CPUs again.
1994 *
1995 * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
1996 * before sending to the next CPU.
1997 *
1998 * Instead of having all CPUs that schedule a lower priority task send
1999 * an IPI to the same "first" CPU in the RT overload mask, they send it
2000 * to the next overloaded CPU after their own CPU. This helps distribute
2001 * the work when there's more than one overloaded CPU and multiple CPUs
2002 * scheduling in lower priority tasks.
2003 *
2004 * When a rq schedules a lower priority task than what was currently
2005 * running, the next CPU with overloaded RT tasks is examined first.
2006 * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
2007 * priority task, it will send an IPI first to CPU 5, then CPU 5 will
2008 * send to CPU 1 if it is still overloaded. CPU 1 will clear the
2009 * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
2010 *
2011 * The first CPU to notice IPI_RESTART is set, will clear that flag and then
2012 * send an IPI to the next overloaded CPU after the rq->cpu and not the next
2013 * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
2014 * schedules a lower priority task, and the IPI_RESTART gets set while the
2015 * handling is being done on CPU 5, it will clear the flag and send it back to
2016 * CPU 4 instead of CPU 1.
2017 *
2018 * Note, the above logic can be disabled by turning off the sched_feature
2019 * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
2020 * taken by the CPU requesting a pull and the waiting RT task will be pulled
2021 * by that CPU. This may be fine for machines with few CPUs.
2022 */
2023static void tell_cpu_to_push(struct rq *rq)
2024{ 1964{
2025 int cpu; 1965 atomic_set_release(v, 0);
1966}
2026 1967
2027 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { 1968static void tell_cpu_to_push(struct rq *rq)
2028 raw_spin_lock(&rq->rt.push_lock); 1969{
2029 /* Make sure it's still executing */ 1970 int cpu = -1;
2030 if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
2031 /*
2032 * Tell the IPI to restart the loop as things have
2033 * changed since it started.
2034 */
2035 rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
2036 raw_spin_unlock(&rq->rt.push_lock);
2037 return;
2038 }
2039 raw_spin_unlock(&rq->rt.push_lock);
2040 }
2041 1971
2042 /* When here, there's no IPI going around */ 1972 /* Keep the loop going if the IPI is currently active */
1973 atomic_inc(&rq->rd->rto_loop_next);
2043 1974
2044 rq->rt.push_cpu = rq->cpu; 1975 /* Only one CPU can initiate a loop at a time */
2045 cpu = find_next_push_cpu(rq); 1976 if (!rto_start_trylock(&rq->rd->rto_loop_start))
2046 if (cpu >= nr_cpu_ids)
2047 return; 1977 return;
2048 1978
2049 rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; 1979 raw_spin_lock(&rq->rd->rto_lock);
1980
1981 /*
1982 * The rto_cpu is updated under the lock; if it holds a valid CPU
1983 * then the IPI is still running and will continue due to the
1984 * update to loop_next, and nothing needs to be done here.
1985 * Otherwise it is finishing up and an IPI needs to be sent.
1986 */
1987 if (rq->rd->rto_cpu < 0)
1988 cpu = rto_next_cpu(rq);
2050 1989
2051 irq_work_queue_on(&rq->rt.push_work, cpu); 1990 raw_spin_unlock(&rq->rd->rto_lock);
1991
1992 rto_start_unlock(&rq->rd->rto_loop_start);
1993
1994 if (cpu >= 0)
1995 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2052} 1996}
2053 1997
2054/* Called from hardirq context */ 1998/* Called from hardirq context */
2055static void try_to_push_tasks(void *arg) 1999void rto_push_irq_work_func(struct irq_work *work)
2056{ 2000{
2057 struct rt_rq *rt_rq = arg; 2001 struct rq *rq;
2058 struct rq *rq, *src_rq;
2059 int this_cpu;
2060 int cpu; 2002 int cpu;
2061 2003
2062 this_cpu = rt_rq->push_cpu; 2004 rq = this_rq();
2063 2005
2064 /* Paranoid check */ 2006 /*
2065 BUG_ON(this_cpu != smp_processor_id()); 2007 * We do not need to grab the lock to check for has_pushable_tasks.
2066 2008 * When it gets updated, a check is made if a push is possible.
2067 rq = cpu_rq(this_cpu); 2009 */
2068 src_rq = rq_of_rt_rq(rt_rq);
2069
2070again:
2071 if (has_pushable_tasks(rq)) { 2010 if (has_pushable_tasks(rq)) {
2072 raw_spin_lock(&rq->lock); 2011 raw_spin_lock(&rq->lock);
2073 push_rt_task(rq); 2012 push_rt_tasks(rq);
2074 raw_spin_unlock(&rq->lock); 2013 raw_spin_unlock(&rq->lock);
2075 } 2014 }
2076 2015
2077 /* Pass the IPI to the next rt overloaded queue */ 2016 raw_spin_lock(&rq->rd->rto_lock);
2078 raw_spin_lock(&rt_rq->push_lock);
2079 /*
2080 * If the source queue changed since the IPI went out,
2081 * we need to restart the search from that CPU again.
2082 */
2083 if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
2084 rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
2085 rt_rq->push_cpu = src_rq->cpu;
2086 }
2087 2017
2088 cpu = find_next_push_cpu(src_rq); 2018 /* Pass the IPI to the next rt overloaded queue */
2019 cpu = rto_next_cpu(rq);
2089 2020
2090 if (cpu >= nr_cpu_ids) 2021 raw_spin_unlock(&rq->rd->rto_lock);
2091 rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
2092 raw_spin_unlock(&rt_rq->push_lock);
2093 2022
2094 if (cpu >= nr_cpu_ids) 2023 if (cpu < 0)
2095 return; 2024 return;
2096 2025
2097 /*
2098 * It is possible that a restart caused this CPU to be
2099 * chosen again. Don't bother with an IPI, just see if we
2100 * have more to push.
2101 */
2102 if (unlikely(cpu == rq->cpu))
2103 goto again;
2104
2105 /* Try the next RT overloaded CPU */ 2026 /* Try the next RT overloaded CPU */
2106 irq_work_queue_on(&rt_rq->push_work, cpu); 2027 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2107}
2108
2109static void push_irq_work_func(struct irq_work *work)
2110{
2111 struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
2112
2113 try_to_push_tasks(rt_rq);
2114} 2028}
2115#endif /* HAVE_RT_PUSH_IPI */ 2029#endif /* HAVE_RT_PUSH_IPI */
2116 2030
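The new root-domain fields replace the old per-rq push state: rto_loop_start is a lock-free "only one loop at a time" latch, rto_loop_next is bumped by every CPU that schedules a lower priority task so an in-flight loop knows to rescan, and rto_cpu/rto_loop track the iterator under rto_lock. A stripped-down model of that handshake, using the same kernel atomic helpers but made-up demo_ names (illustrative only, not code from the patch):

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t demo_loop_start = ATOMIC_INIT(0);	/* plays the role of rd->rto_loop_start */
static atomic_t demo_loop_next = ATOMIC_INIT(0);	/* plays the role of rd->rto_loop_next */
static int demo_loop;					/* plays the role of rd->rto_loop (rto_lock protected) */

/* Called by any CPU that just scheduled a lower priority task. */
static void demo_kick(void)
{
	/* Always record that something changed, even if a loop is running. */
	atomic_inc(&demo_loop_next);

	/* Only the first caller wins the latch and starts the IPI chain. */
	if (atomic_cmpxchg_acquire(&demo_loop_start, 0, 1))
		return;

	/* ...start the chain here: rto_next_cpu() + irq_work_queue_on()... */

	atomic_set_release(&demo_loop_start, 0);
}

/* Mirrors the rto_loop/rto_loop_next check at the end of rto_next_cpu(). */
static bool demo_loop_should_continue(void)
{
	int next = atomic_read_acquire(&demo_loop_next);

	if (demo_loop == next)
		return false;	/* nothing changed since the last full scan */

	demo_loop = next;	/* a priority dropped meanwhile: rescan rto_mask */
	return true;
}

The acquire on the loop_next read pairs with the WMB in rt_set_overload(), as the comment in rto_next_cpu() notes, so a rescanning iterator also observes the rto_mask update that preceded the bump.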
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3b448ba82225..45ab0bf564e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,7 +227,7 @@ struct dl_bw {
227static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 227static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
228 228
229static inline 229static inline
230void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) 230void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
231{ 231{
232 dl_b->total_bw -= tsk_bw; 232 dl_b->total_bw -= tsk_bw;
233 __dl_update(dl_b, (s32)tsk_bw / cpus); 233 __dl_update(dl_b, (s32)tsk_bw / cpus);
@@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
256extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 256extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
257extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 257extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
258extern bool __checkparam_dl(const struct sched_attr *attr); 258extern bool __checkparam_dl(const struct sched_attr *attr);
259extern void __dl_clear_params(struct task_struct *p);
260extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 259extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
261extern int dl_task_can_attach(struct task_struct *p, 260extern int dl_task_can_attach(struct task_struct *p,
262 const struct cpumask *cs_cpus_allowed); 261 const struct cpumask *cs_cpus_allowed);
@@ -419,6 +418,7 @@ struct cfs_bandwidth { };
419/* CFS-related fields in a runqueue */ 418/* CFS-related fields in a runqueue */
420struct cfs_rq { 419struct cfs_rq {
421 struct load_weight load; 420 struct load_weight load;
421 unsigned long runnable_weight;
422 unsigned int nr_running, h_nr_running; 422 unsigned int nr_running, h_nr_running;
423 423
424 u64 exec_clock; 424 u64 exec_clock;
@@ -444,18 +444,22 @@ struct cfs_rq {
444 * CFS load tracking 444 * CFS load tracking
445 */ 445 */
446 struct sched_avg avg; 446 struct sched_avg avg;
447 u64 runnable_load_sum;
448 unsigned long runnable_load_avg;
449#ifdef CONFIG_FAIR_GROUP_SCHED
450 unsigned long tg_load_avg_contrib;
451 unsigned long propagate_avg;
452#endif
453 atomic_long_t removed_load_avg, removed_util_avg;
454#ifndef CONFIG_64BIT 447#ifndef CONFIG_64BIT
455 u64 load_last_update_time_copy; 448 u64 load_last_update_time_copy;
456#endif 449#endif
450 struct {
451 raw_spinlock_t lock ____cacheline_aligned;
452 int nr;
453 unsigned long load_avg;
454 unsigned long util_avg;
455 unsigned long runnable_sum;
456 } removed;
457 457
458#ifdef CONFIG_FAIR_GROUP_SCHED 458#ifdef CONFIG_FAIR_GROUP_SCHED
459 unsigned long tg_load_avg_contrib;
460 long propagate;
461 long prop_runnable_sum;
462
459 /* 463 /*
460 * h_load = weight * f(tg) 464 * h_load = weight * f(tg)
461 * 465 *
@@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
502} 506}
503 507
504/* RT IPI pull logic requires IRQ_WORK */ 508/* RT IPI pull logic requires IRQ_WORK */
505#ifdef CONFIG_IRQ_WORK 509#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
506# define HAVE_RT_PUSH_IPI 510# define HAVE_RT_PUSH_IPI
507#endif 511#endif
508 512
@@ -524,12 +528,6 @@ struct rt_rq {
524 unsigned long rt_nr_total; 528 unsigned long rt_nr_total;
525 int overloaded; 529 int overloaded;
526 struct plist_head pushable_tasks; 530 struct plist_head pushable_tasks;
527#ifdef HAVE_RT_PUSH_IPI
528 int push_flags;
529 int push_cpu;
530 struct irq_work push_work;
531 raw_spinlock_t push_lock;
532#endif
533#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
534 int rt_queued; 532 int rt_queued;
535 533
@@ -638,6 +636,19 @@ struct root_domain {
638 struct dl_bw dl_bw; 636 struct dl_bw dl_bw;
639 struct cpudl cpudl; 637 struct cpudl cpudl;
640 638
639#ifdef HAVE_RT_PUSH_IPI
640 /*
641 * For IPI pull requests, loop across the rto_mask.
642 */
643 struct irq_work rto_push_work;
644 raw_spinlock_t rto_lock;
645 /* These are only updated and read within rto_lock */
646 int rto_loop;
647 int rto_cpu;
648 /* These atomics are updated outside of a lock */
649 atomic_t rto_loop_next;
650 atomic_t rto_loop_start;
651#endif
641 /* 652 /*
642 * The "RT overload" flag: it gets set if a CPU has more than 653 * The "RT overload" flag: it gets set if a CPU has more than
643 * one runnable RT task. 654 * one runnable RT task.
@@ -655,6 +666,9 @@ extern void init_defrootdomain(void);
655extern int sched_init_domains(const struct cpumask *cpu_map); 666extern int sched_init_domains(const struct cpumask *cpu_map);
656extern void rq_attach_root(struct rq *rq, struct root_domain *rd); 667extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
657 668
669#ifdef HAVE_RT_PUSH_IPI
670extern void rto_push_irq_work_func(struct irq_work *work);
671#endif
658#endif /* CONFIG_SMP */ 672#endif /* CONFIG_SMP */
659 673
660/* 674/*
@@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1219# define const_debug const 1233# define const_debug const
1220#endif 1234#endif
1221 1235
1222extern const_debug unsigned int sysctl_sched_features;
1223
1224#define SCHED_FEAT(name, enabled) \ 1236#define SCHED_FEAT(name, enabled) \
1225 __SCHED_FEAT_##name , 1237 __SCHED_FEAT_##name ,
1226 1238
@@ -1232,6 +1244,13 @@ enum {
1232#undef SCHED_FEAT 1244#undef SCHED_FEAT
1233 1245
1234#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 1246#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
1247
1248/*
1249 * To support run-time toggling of sched features, all the translation units
1250 * (but core.c) reference the sysctl_sched_features defined in core.c.
1251 */
1252extern const_debug unsigned int sysctl_sched_features;
1253
1235#define SCHED_FEAT(name, enabled) \ 1254#define SCHED_FEAT(name, enabled) \
1236static __always_inline bool static_branch_##name(struct static_key *key) \ 1255static __always_inline bool static_branch_##name(struct static_key *key) \
1237{ \ 1256{ \
@@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
1239} 1258}
1240 1259
1241#include "features.h" 1260#include "features.h"
1242
1243#undef SCHED_FEAT 1261#undef SCHED_FEAT
1244 1262
1245extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; 1263extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1246#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) 1264#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
1265
1247#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ 1266#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
1267
1268/*
1269 * Each translation unit has its own copy of sysctl_sched_features to allow
1270 * constants propagation at compile time and compiler optimization based on
1271 * features default.
1272 */
1273#define SCHED_FEAT(name, enabled) \
1274 (1UL << __SCHED_FEAT_##name) * enabled |
1275static const_debug __maybe_unused unsigned int sysctl_sched_features =
1276#include "features.h"
1277 0;
1278#undef SCHED_FEAT
1279
1248#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 1280#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1281
1249#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1282#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
1250 1283
1251extern struct static_key_false sched_numa_balancing; 1284extern struct static_key_false sched_numa_balancing;
@@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void);
1530extern void init_sched_rt_class(void); 1563extern void init_sched_rt_class(void);
1531extern void init_sched_fair_class(void); 1564extern void init_sched_fair_class(void);
1532 1565
1566extern void reweight_task(struct task_struct *p, int prio);
1567
1533extern void resched_curr(struct rq *rq); 1568extern void resched_curr(struct rq *rq);
1534extern void resched_cpu(int cpu); 1569extern void resched_cpu(int cpu);
1535 1570
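The sched_feat() rework above means that in !CONFIG_SCHED_DEBUG (or no jump label) builds, every translation unit gets its own constant bitmask built from the feature defaults, so sched_feat(x) folds to a compile-time constant and branches on disabled features can be removed by the compiler. A standalone sketch of the same macro trick (plain userspace C; the DEMO_FEATURES list and demo_ names stand in for kernel/sched/features.h and are not from the patch):

/* Feature indices, like the enum built from SCHED_FEAT(name, enabled). */
enum { __SCHED_FEAT_FOO, __SCHED_FEAT_BAR, __SCHED_FEAT_NR };

/* Stand-in for kernel/sched/features.h: FOO default-on, BAR default-off. */
#define DEMO_FEATURES(F)	\
	F(FOO, 1)		\
	F(BAR, 0)

/* Same expansion as the !SCHED_DEBUG branch above builds via #include. */
#define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |
static const unsigned int demo_sched_features =
	DEMO_FEATURES(SCHED_FEAT)
	0;
#undef SCHED_FEAT

#define demo_sched_feat(x) (demo_sched_features & (1UL << __SCHED_FEAT_##x))

int demo_pick(void)
{
	/* demo_sched_feat(BAR) folds to 0 at compile time: dead branch. */
	if (demo_sched_feat(BAR))
		return 1;

	/* demo_sched_feat(FOO) folds to a non-zero constant. */
	return demo_sched_feat(FOO) ? 2 : 3;
}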
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6798276d29af..034cbed7f88b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
4 */ 4 */
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/mutex.h> 6#include <linux/mutex.h>
7#include <linux/sched/isolation.h>
7 8
8#include "sched.h" 9#include "sched.h"
9 10
@@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
269 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 270 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
270 goto free_dlo_mask; 271 goto free_dlo_mask;
271 272
273#ifdef HAVE_RT_PUSH_IPI
274 rd->rto_cpu = -1;
275 raw_spin_lock_init(&rd->rto_lock);
276 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
277#endif
278
272 init_dl_bw(&rd->dl_bw); 279 init_dl_bw(&rd->dl_bw);
273 if (cpudl_init(&rd->cpudl) != 0) 280 if (cpudl_init(&rd->cpudl) != 0)
274 goto free_rto_mask; 281 goto free_rto_mask;
@@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
464 update_top_cache_domain(cpu); 471 update_top_cache_domain(cpu);
465} 472}
466 473
467/* Setup the mask of CPUs configured for isolated domains */
468static int __init isolated_cpu_setup(char *str)
469{
470 int ret;
471
472 alloc_bootmem_cpumask_var(&cpu_isolated_map);
473 ret = cpulist_parse(str, cpu_isolated_map);
474 if (ret) {
475 pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
476 return 0;
477 }
478 return 1;
479}
480__setup("isolcpus=", isolated_cpu_setup);
481
482struct s_data { 474struct s_data {
483 struct sched_domain ** __percpu sd; 475 struct sched_domain ** __percpu sd;
484 struct root_domain *rd; 476 struct root_domain *rd;
@@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
1158 sd->smt_gain = 1178; /* ~15% */ 1150 sd->smt_gain = 1178; /* ~15% */
1159 1151
1160 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1152 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1153 sd->flags |= SD_PREFER_SIBLING;
1161 sd->imbalance_pct = 117; 1154 sd->imbalance_pct = 117;
1162 sd->cache_nice_tries = 1; 1155 sd->cache_nice_tries = 1;
1163 sd->busy_idx = 2; 1156 sd->busy_idx = 2;
@@ -1332,6 +1325,10 @@ void sched_init_numa(void)
1332 if (!sched_domains_numa_distance) 1325 if (!sched_domains_numa_distance)
1333 return; 1326 return;
1334 1327
1328 /* Includes NUMA identity node at level 0. */
1329 sched_domains_numa_distance[level++] = curr_distance;
1330 sched_domains_numa_levels = level;
1331
1335 /* 1332 /*
1336 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 1333 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1337 * unique distances in the node_distance() table. 1334 * unique distances in the node_distance() table.
@@ -1379,8 +1376,7 @@ void sched_init_numa(void)
1379 return; 1376 return;
1380 1377
1381 /* 1378 /*
1382 * 'level' contains the number of unique distances, excluding the 1379 * 'level' contains the number of unique distances
1383 * identity distance node_distance(i,i).
1384 * 1380 *
1385 * The sched_domains_numa_distance[] array includes the actual distance 1381 * The sched_domains_numa_distance[] array includes the actual distance
1386 * numbers. 1382 * numbers.
@@ -1442,9 +1438,18 @@ void sched_init_numa(void)
1442 tl[i] = sched_domain_topology[i]; 1438 tl[i] = sched_domain_topology[i];
1443 1439
1444 /* 1440 /*
1441 * Add the NUMA identity distance, aka single NODE.
1442 */
1443 tl[i++] = (struct sched_domain_topology_level){
1444 .mask = sd_numa_mask,
1445 .numa_level = 0,
1446 SD_INIT_NAME(NODE)
1447 };
1448
1449 /*
1445 * .. and append 'j' levels of NUMA goodness. 1450 * .. and append 'j' levels of NUMA goodness.
1446 */ 1451 */
1447 for (j = 0; j < level; i++, j++) { 1452 for (j = 1; j < level; i++, j++) {
1448 tl[i] = (struct sched_domain_topology_level){ 1453 tl[i] = (struct sched_domain_topology_level){
1449 .mask = sd_numa_mask, 1454 .mask = sd_numa_mask,
1450 .sd_flags = cpu_numa_flags, 1455 .sd_flags = cpu_numa_flags,
@@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
1774 doms_cur = alloc_sched_domains(ndoms_cur); 1779 doms_cur = alloc_sched_domains(ndoms_cur);
1775 if (!doms_cur) 1780 if (!doms_cur)
1776 doms_cur = &fallback_doms; 1781 doms_cur = &fallback_doms;
1777 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 1782 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
1778 err = build_sched_domains(doms_cur[0], NULL); 1783 err = build_sched_domains(doms_cur[0], NULL);
1779 register_sched_domain_sysctl(); 1784 register_sched_domain_sysctl();
1780 1785
@@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1857 doms_new = alloc_sched_domains(1); 1862 doms_new = alloc_sched_domains(1);
1858 if (doms_new) { 1863 if (doms_new) {
1859 n = 1; 1864 n = 1;
1860 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 1865 cpumask_and(doms_new[0], cpu_active_mask,
1866 housekeeping_cpumask(HK_FLAG_DOMAIN));
1861 } 1867 }
1862 } else { 1868 } else {
1863 n = ndoms_new; 1869 n = ndoms_new;
@@ -1880,7 +1886,8 @@ match1:
1880 if (!doms_new) { 1886 if (!doms_new) {
1881 n = 0; 1887 n = 0;
1882 doms_new = &fallback_doms; 1888 doms_new = &fallback_doms;
1883 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 1889 cpumask_and(doms_new[0], cpu_active_mask,
1890 housekeeping_cpumask(HK_FLAG_DOMAIN));
1884 } 1891 }
1885 1892
1886 /* Build new domains: */ 1893 /* Build new domains: */
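sched_init_numa() now records the identity distance node_distance(i,i) as level 0 and inserts a NODE topology level for it, which is why the final loop that appends the remaining NUMA levels starts at j = 1. A standalone illustration of the level bookkeeping with a made-up two-node distance table (demo values only, not the kernel's data structures):

#include <stdio.h>

/* Hypothetical 2-node table: 10 is the local (identity) distance. */
static const int demo_node_distance[2][2] = {
	{ 10, 21 },
	{ 21, 10 },
};

int main(void)
{
	int distances[4];
	int level = 0;
	int i, j, k;

	/* Level 0 is now the identity distance, i.e. the new NODE level. */
	distances[level++] = demo_node_distance[0][0];

	/* Deduplicate the remaining distances, as the kernel's selection sort does. */
	for (i = 0; i < 2; i++) {
		for (j = 0; j < 2; j++) {
			int d = demo_node_distance[i][j];
			int seen = 0;

			for (k = 0; k < level; k++)
				seen |= (distances[k] == d);
			if (!seen)
				distances[level++] = d;
		}
	}

	/* Prints "levels=2: 10 21": the NODE identity level plus one NUMA level. */
	printf("levels=%d:", level);
	for (k = 0; k < level; k++)
		printf(" %d", distances[k]);
	printf("\n");

	return 0;
}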
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index dd4b7b492c9b..99578f06c8d4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -27,6 +27,7 @@
27#include <linux/irq_work.h> 27#include <linux/irq_work.h>
28#include <linux/posix-timers.h> 28#include <linux/posix-timers.h>
29#include <linux/context_tracking.h> 29#include <linux/context_tracking.h>
30#include <linux/mm.h>
30 31
31#include <asm/irq_regs.h> 32#include <asm/irq_regs.h>
32 33
@@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
165 166
166#ifdef CONFIG_NO_HZ_FULL 167#ifdef CONFIG_NO_HZ_FULL
167cpumask_var_t tick_nohz_full_mask; 168cpumask_var_t tick_nohz_full_mask;
168cpumask_var_t housekeeping_mask;
169bool tick_nohz_full_running; 169bool tick_nohz_full_running;
170static atomic_t tick_dep_mask; 170static atomic_t tick_dep_mask;
171 171
@@ -385,20 +385,13 @@ out:
385 local_irq_restore(flags); 385 local_irq_restore(flags);
386} 386}
387 387
388/* Parse the boot-time nohz CPU list from the kernel parameters. */ 388/* Get the boot-time nohz CPU list from the kernel parameters. */
389static int __init tick_nohz_full_setup(char *str) 389void __init tick_nohz_full_setup(cpumask_var_t cpumask)
390{ 390{
391 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 391 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
392 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 392 cpumask_copy(tick_nohz_full_mask, cpumask);
393 pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
394 free_bootmem_cpumask_var(tick_nohz_full_mask);
395 return 1;
396 }
397 tick_nohz_full_running = true; 393 tick_nohz_full_running = true;
398
399 return 1;
400} 394}
401__setup("nohz_full=", tick_nohz_full_setup);
402 395
403static int tick_nohz_cpu_down(unsigned int cpu) 396static int tick_nohz_cpu_down(unsigned int cpu)
404{ 397{
@@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
437 return; 430 return;
438 } 431 }
439 432
440 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
441 WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
442 cpumask_clear(tick_nohz_full_mask);
443 tick_nohz_full_running = false;
444 return;
445 }
446
447 /* 433 /*
448 * Full dynticks uses irq work to drive the tick rescheduling on safe 434 * Full dynticks uses irq work to drive the tick rescheduling on safe
449 * locking contexts. But then we need irq work to raise its own 435 * locking contexts. But then we need irq work to raise its own
@@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
452 if (!arch_irq_work_has_interrupt()) { 438 if (!arch_irq_work_has_interrupt()) {
453 pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); 439 pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
454 cpumask_clear(tick_nohz_full_mask); 440 cpumask_clear(tick_nohz_full_mask);
455 cpumask_copy(housekeeping_mask, cpu_possible_mask);
456 tick_nohz_full_running = false; 441 tick_nohz_full_running = false;
457 return; 442 return;
458 } 443 }
@@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
465 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 450 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
466 } 451 }
467 452
468 cpumask_andnot(housekeeping_mask,
469 cpu_possible_mask, tick_nohz_full_mask);
470
471 for_each_cpu(cpu, tick_nohz_full_mask) 453 for_each_cpu(cpu, tick_nohz_full_mask)
472 context_tracking_cpu_set(cpu); 454 context_tracking_cpu_set(cpu);
473 455
@@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
477 WARN_ON(ret < 0); 459 WARN_ON(ret < 0);
478 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", 460 pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
479 cpumask_pr_args(tick_nohz_full_mask)); 461 cpumask_pr_args(tick_nohz_full_mask));
480
481 /*
482 * We need at least one CPU to handle housekeeping work such
483 * as timekeeping, unbound timers, workqueues, ...
484 */
485 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
486} 462}
487#endif 463#endif
488 464
@@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
787 if (!ts->tick_stopped) { 763 if (!ts->tick_stopped) {
788 calc_load_nohz_start(); 764 calc_load_nohz_start();
789 cpu_load_update_nohz_start(); 765 cpu_load_update_nohz_start();
766 quiet_vmstat();
790 767
791 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 768 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
792 ts->tick_stopped = 1; 769 ts->tick_stopped = 1;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c738e764e2a5..90db994ac900 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
921 921
922 trace_assign_type(field, iter->ent); 922 trace_assign_type(field, iter->ent);
923 923
924 T = __task_state_to_char(field->next_state); 924 T = task_index_to_char(field->next_state);
925 S = __task_state_to_char(field->prev_state); 925 S = task_index_to_char(field->prev_state);
926 trace_find_cmdline(field->next_pid, comm); 926 trace_find_cmdline(field->next_pid, comm);
927 trace_seq_printf(&iter->seq, 927 trace_seq_printf(&iter->seq,
928 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 928 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
957 trace_assign_type(field, iter->ent); 957 trace_assign_type(field, iter->ent);
958 958
959 if (!S) 959 if (!S)
960 S = __task_state_to_char(field->prev_state); 960 S = task_index_to_char(field->prev_state);
961 T = __task_state_to_char(field->next_state); 961 T = task_index_to_char(field->next_state);
962 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 962 trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
963 field->prev_pid, 963 field->prev_pid,
964 field->prev_prio, 964 field->prev_prio,
@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
993 trace_assign_type(field, iter->ent); 993 trace_assign_type(field, iter->ent);
994 994
995 if (!S) 995 if (!S)
996 S = __task_state_to_char(field->prev_state); 996 S = task_index_to_char(field->prev_state);
997 T = __task_state_to_char(field->next_state); 997 T = task_index_to_char(field->next_state);
998 998
999 SEQ_PUT_HEX_FIELD(s, field->prev_pid); 999 SEQ_PUT_HEX_FIELD(s, field->prev_pid);
1000 SEQ_PUT_HEX_FIELD(s, field->prev_prio); 1000 SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7d461dcd4831..a86b303e6c67 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
398 entry = ring_buffer_event_data(event); 398 entry = ring_buffer_event_data(event);
399 entry->prev_pid = prev->pid; 399 entry->prev_pid = prev->pid;
400 entry->prev_prio = prev->prio; 400 entry->prev_prio = prev->prio;
401 entry->prev_state = __get_task_state(prev); 401 entry->prev_state = task_state_index(prev);
402 entry->next_pid = next->pid; 402 entry->next_pid = next->pid;
403 entry->next_prio = next->prio; 403 entry->next_prio = next->prio;
404 entry->next_state = __get_task_state(next); 404 entry->next_state = task_state_index(next);
405 entry->next_cpu = task_cpu(next); 405 entry->next_cpu = task_cpu(next);
406 406
407 if (!call_filter_check_discard(call, entry, buffer, event)) 407 if (!call_filter_check_discard(call, entry, buffer, event))
@@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
426 entry = ring_buffer_event_data(event); 426 entry = ring_buffer_event_data(event);
427 entry->prev_pid = curr->pid; 427 entry->prev_pid = curr->pid;
428 entry->prev_prio = curr->prio; 428 entry->prev_prio = curr->prio;
429 entry->prev_state = __get_task_state(curr); 429 entry->prev_state = task_state_index(curr);
430 entry->next_pid = wakee->pid; 430 entry->next_pid = wakee->pid;
431 entry->next_prio = wakee->prio; 431 entry->next_prio = wakee->prio;
432 entry->next_state = __get_task_state(wakee); 432 entry->next_state = task_state_index(wakee);
433 entry->next_cpu = task_cpu(wakee); 433 entry->next_cpu = task_cpu(wakee);
434 434
435 if (!call_filter_check_discard(call, entry, buffer, event)) 435 if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c8e06703e44c..576d18045811 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
25#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/sched/clock.h> 26#include <linux/sched/clock.h>
27#include <linux/sched/debug.h> 27#include <linux/sched/debug.h>
28#include <linux/sched/isolation.h>
28 29
29#include <asm/irq_regs.h> 30#include <asm/irq_regs.h>
30#include <linux/kvm_para.h> 31#include <linux/kvm_para.h>
@@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
774 775
775void __init lockup_detector_init(void) 776void __init lockup_detector_init(void)
776{ 777{
777#ifdef CONFIG_NO_HZ_FULL 778 if (tick_nohz_full_enabled())
778 if (tick_nohz_full_enabled()) {
779 pr_info("Disabling watchdog on nohz_full cores by default\n"); 779 pr_info("Disabling watchdog on nohz_full cores by default\n");
780 cpumask_copy(&watchdog_cpumask, housekeeping_mask); 780
781 } else 781 cpumask_copy(&watchdog_cpumask,
782 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 782 housekeeping_cpumask(HK_FLAG_TIMER));
783#else
784 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
785#endif
786 783
787 if (!watchdog_nmi_probe()) 784 if (!watchdog_nmi_probe())
788 nmi_watchdog_available = true; 785 nmi_watchdog_available = true;