author    Linus Torvalds <torvalds@linux-foundation.org>  2017-11-13 16:37:52 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-11-13 16:37:52 -0500
commit    3e2014637c50e5d6a77cd63d5db6c209fe29d1b1
tree      a672ed603262aeddda4490056b27b09791d0cbbb
parent    f2be8bd52e7410c70145f73511a2e80f4797e1a5
parent    765cc3a4b224e22bf524fabe40284a524f37cdd0
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main updates in this cycle were:
- Group balancing enhancements and cleanups (Brendan Jackman)
- Move CPU isolation related functionality into its separate
kernel/sched/isolation.c file, with related 'housekeeping_*()'
namespace and nomenclature et al. (Frederic Weisbecker)
- Improve the interactive/cpu-intense fairness calculation (Josef
Bacik)
- Improve the PELT code and related cleanups (Peter Zijlstra)
- Improve the logic of pick_next_task_fair() (Uladzislau Rezki)
- Improve the RT IPI based balancing logic (Steven Rostedt)
- Various micro-optimizations:
- better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi)
- better idle loop (Cheng Jian)
- ... plus misc fixes, cleanups and updates"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds
sched/sysctl: Fix attributes of some extern declarations
sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated
sched/isolation: Add basic isolcpus flags
sched/isolation: Move isolcpus= handling to the housekeeping code
sched/isolation: Handle the nohz_full= parameter
sched/isolation: Introduce housekeeping flags
sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL
sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()
sched/isolation: Use its own static key
sched/isolation: Make the housekeeping cpumask private
sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()
sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version
sched/isolation: Move housekeeping related code to its own file
sched/idle: Micro-optimize the idle loop
sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter
sched/rt: Simplify the IPI based RT balancing logic
block/ioprio: Use a helper to check for RT prio
sched/rt: Add a helper to test for a RT task
...
31 files changed, 1270 insertions, 774 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 116e798b61e6..38ed8787261b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1730,20 +1730,33 @@ | |||
1730 | isapnp= [ISAPNP] | 1730 | isapnp= [ISAPNP] |
1731 | Format: <RDP>,<reset>,<pci_scan>,<verbosity> | 1731 | Format: <RDP>,<reset>,<pci_scan>,<verbosity> |
1732 | 1732 | ||
1733 | isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler. | 1733 | isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance. |
1734 | The argument is a cpu list, as described above. | 1734 | [Deprecated - use cpusets instead] |
1735 | Format: [flag-list,]<cpu-list> | ||
1736 | |||
1737 | Specify one or more CPUs to isolate from disturbances | ||
1738 | specified in the flag list (default: domain): | ||
1739 | |||
1740 | nohz | ||
1741 | Disable the tick when a single task runs. | ||
1742 | domain | ||
1743 | Isolate from the general SMP balancing and scheduling | ||
1744 | algorithms. Note that performing domain isolation this way | ||
1745 | is irreversible: it's not possible to bring back a CPU to | ||
1746 | the domains once isolated through isolcpus. It's strongly | ||
1747 | advised to use cpusets instead to disable scheduler load | ||
1748 | balancing through the "cpuset.sched_load_balance" file. | ||
1749 | It offers a much more flexible interface where CPUs can | ||
1750 | move in and out of an isolated set anytime. | ||
1751 | |||
1752 | You can move a process onto or off an "isolated" CPU via | ||
1753 | the CPU affinity syscalls or cpuset. | ||
1754 | <cpu number> begins at 0 and the maximum value is | ||
1755 | "number of CPUs in system - 1". | ||
1756 | |||
1757 | The format of <cpu-list> is described above. | ||
1735 | 1758 | ||
1736 | This option can be used to specify one or more CPUs | ||
1737 | to isolate from the general SMP balancing and scheduling | ||
1738 | algorithms. You can move a process onto or off an | ||
1739 | "isolated" CPU via the CPU affinity syscalls or cpuset. | ||
1740 | <cpu number> begins at 0 and the maximum value is | ||
1741 | "number of CPUs in system - 1". | ||
1742 | 1759 | ||
1743 | This option is the preferred way to isolate CPUs. The | ||
1744 | alternative -- manually setting the CPU mask of all | ||
1745 | tasks in the system -- can cause problems and | ||
1746 | suboptimal load balancer performance. | ||
1747 | 1760 | ||
1748 | iucv= [HW,NET] | 1761 | iucv= [HW,NET] |
1749 | 1762 | ||
@@ -4209,6 +4222,9 @@ | |||
4209 | Used to run time disable IRQ_TIME_ACCOUNTING on any | 4222 | Used to run time disable IRQ_TIME_ACCOUNTING on any |
4210 | platforms where RDTSC is slow and this accounting | 4223 | platforms where RDTSC is slow and this accounting |
4211 | can add overhead. | 4224 | can add overhead. |
4225 | [x86] unstable: mark the TSC clocksource as unstable, this | ||
4226 | marks the TSC unconditionally unstable at bootup and | ||
4227 | avoids any further wobbles once the TSC watchdog notices. | ||
4212 | 4228 | ||
4213 | turbografx.map[2|3]= [HW,JOY] | 4229 | turbografx.map[2|3]= [HW,JOY] |
4214 | TurboGraFX parallel port interface | 4230 | TurboGraFX parallel port interface |
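As a concrete illustration of the isolcpus= format documented above (not part of the patch): booting with

    isolcpus=nohz,domain,2-7

requests both tick ("nohz") and scheduler-domain ("domain") isolation for CPUs 2-7, while a plain "isolcpus=2-7" keeps the default behaviour of domain isolation only.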
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b4d817..a73ab95558f5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/cpufeature.h> | 18 | #include <linux/cpufeature.h> |
19 | #include <linux/tick.h> | 19 | #include <linux/tick.h> |
20 | #include <linux/pm_qos.h> | 20 | #include <linux/pm_qos.h> |
21 | #include <linux/sched/isolation.h> | ||
21 | 22 | ||
22 | #include "base.h" | 23 | #include "base.h" |
23 | 24 | ||
@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev, | |||
271 | struct device_attribute *attr, char *buf) | 272 | struct device_attribute *attr, char *buf) |
272 | { | 273 | { |
273 | int n = 0, len = PAGE_SIZE-2; | 274 | int n = 0, len = PAGE_SIZE-2; |
275 | cpumask_var_t isolated; | ||
274 | 276 | ||
275 | n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map)); | 277 | if (!alloc_cpumask_var(&isolated, GFP_KERNEL)) |
278 | return -ENOMEM; | ||
279 | |||
280 | cpumask_andnot(isolated, cpu_possible_mask, | ||
281 | housekeeping_cpumask(HK_FLAG_DOMAIN)); | ||
282 | n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated)); | ||
283 | |||
284 | free_cpumask_var(isolated); | ||
276 | 285 | ||
277 | return n; | 286 | return n; |
278 | } | 287 | } |
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index c00102b8145a..b3e5816a4678 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -40,7 +40,7 @@ | |||
40 | #include <linux/tcp.h> | 40 | #include <linux/tcp.h> |
41 | #include <linux/net_tstamp.h> | 41 | #include <linux/net_tstamp.h> |
42 | #include <linux/ptp_clock_kernel.h> | 42 | #include <linux/ptp_clock_kernel.h> |
43 | #include <linux/tick.h> | 43 | #include <linux/sched/isolation.h> |
44 | 44 | ||
45 | #include <asm/checksum.h> | 45 | #include <asm/checksum.h> |
46 | #include <asm/homecache.h> | 46 | #include <asm/homecache.h> |
@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void) | |||
2270 | tile_net_dev_init(name, mac); | 2270 | tile_net_dev_init(name, mac); |
2271 | 2271 | ||
2272 | if (!network_cpus_init()) | 2272 | if (!network_cpus_init()) |
2273 | cpumask_and(&network_cpus_map, housekeeping_cpumask(), | 2273 | cpumask_and(&network_cpus_map, |
2274 | cpu_online_mask); | 2274 | housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask); |
2275 | 2275 | ||
2276 | return 0; | 2276 | return 0; |
2277 | } | 2277 | } |
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d82549e80402..6f6fc1672ad1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,7 +138,7 @@ static const char * const task_state_array[] = { | |||
138 | static inline const char *get_task_state(struct task_struct *tsk) | 138 | static inline const char *get_task_state(struct task_struct *tsk) |
139 | { | 139 | { |
140 | BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); | 140 | BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array)); |
141 | return task_state_array[__get_task_state(tsk)]; | 141 | return task_state_array[task_state_index(tsk)]; |
142 | } | 142 | } |
143 | 143 | ||
144 | static inline int get_task_umask(struct task_struct *tsk) | 144 | static inline int get_task_umask(struct task_struct *tsk) |
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 8d3125c493b2..75b565194437 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) | |||
131 | return 0; | 131 | return 0; |
132 | } | 132 | } |
133 | 133 | ||
134 | static inline unsigned int cpumask_last(const struct cpumask *srcp) | ||
135 | { | ||
136 | return 0; | ||
137 | } | ||
138 | |||
134 | /* Valid inputs for n are -1 and 0. */ | 139 | /* Valid inputs for n are -1 and 0. */ |
135 | static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) | 140 | static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) |
136 | { | 141 | { |
@@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) | |||
179 | return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); | 184 | return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); |
180 | } | 185 | } |
181 | 186 | ||
187 | /** | ||
188 | * cpumask_last - get the last CPU in a cpumask | ||
189 | * @srcp: - the cpumask pointer | ||
190 | * | ||
191 | * Returns >= nr_cpumask_bits if no CPUs set. | ||
192 | */ | ||
193 | static inline unsigned int cpumask_last(const struct cpumask *srcp) | ||
194 | { | ||
195 | return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); | ||
196 | } | ||
197 | |||
182 | unsigned int cpumask_next(int n, const struct cpumask *srcp); | 198 | unsigned int cpumask_next(int n, const struct cpumask *srcp); |
183 | 199 | ||
184 | /** | 200 | /** |
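A minimal sketch of how the new cpumask_last() helper might be used; the wrapper function below is invented for illustration and is not part of the patch:

    #include <linux/cpumask.h>

    /*
     * Illustrative only: pick the highest-numbered CPU set in a mask,
     * falling back to CPU 0 when the mask is empty. As documented above,
     * cpumask_last() returns >= nr_cpumask_bits in that case.
     */
    static unsigned int pick_last_cpu(const struct cpumask *mask)
    {
            unsigned int cpu = cpumask_last(mask);

            return cpu < nr_cpumask_bits ? cpu : 0;
    }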
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 2cdd74809899..627efac73e6d 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -3,6 +3,7 @@ | |||
3 | #define IOPRIO_H | 3 | #define IOPRIO_H |
4 | 4 | ||
5 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | #include <linux/sched/rt.h> | ||
6 | #include <linux/iocontext.h> | 7 | #include <linux/iocontext.h> |
7 | 8 | ||
8 | /* | 9 | /* |
@@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task) | |||
63 | { | 64 | { |
64 | if (task->policy == SCHED_IDLE) | 65 | if (task->policy == SCHED_IDLE) |
65 | return IOPRIO_CLASS_IDLE; | 66 | return IOPRIO_CLASS_IDLE; |
66 | else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR) | 67 | else if (task_is_realtime(task)) |
67 | return IOPRIO_CLASS_RT; | 68 | return IOPRIO_CLASS_RT; |
68 | else | 69 | else |
69 | return IOPRIO_CLASS_BE; | 70 | return IOPRIO_CLASS_BE; |
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..a5dc7c98b0a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -166,8 +166,6 @@ struct task_group; | |||
166 | /* Task command name length: */ | 166 | /* Task command name length: */ |
167 | #define TASK_COMM_LEN 16 | 167 | #define TASK_COMM_LEN 16 |
168 | 168 | ||
169 | extern cpumask_var_t cpu_isolated_map; | ||
170 | |||
171 | extern void scheduler_tick(void); | 169 | extern void scheduler_tick(void); |
172 | 170 | ||
173 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX | 171 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX |
@@ -332,9 +330,11 @@ struct load_weight { | |||
332 | struct sched_avg { | 330 | struct sched_avg { |
333 | u64 last_update_time; | 331 | u64 last_update_time; |
334 | u64 load_sum; | 332 | u64 load_sum; |
333 | u64 runnable_load_sum; | ||
335 | u32 util_sum; | 334 | u32 util_sum; |
336 | u32 period_contrib; | 335 | u32 period_contrib; |
337 | unsigned long load_avg; | 336 | unsigned long load_avg; |
337 | unsigned long runnable_load_avg; | ||
338 | unsigned long util_avg; | 338 | unsigned long util_avg; |
339 | }; | 339 | }; |
340 | 340 | ||
@@ -377,6 +377,7 @@ struct sched_statistics { | |||
377 | struct sched_entity { | 377 | struct sched_entity { |
378 | /* For load-balancing: */ | 378 | /* For load-balancing: */ |
379 | struct load_weight load; | 379 | struct load_weight load; |
380 | unsigned long runnable_weight; | ||
380 | struct rb_node run_node; | 381 | struct rb_node run_node; |
381 | struct list_head group_node; | 382 | struct list_head group_node; |
382 | unsigned int on_rq; | 383 | unsigned int on_rq; |
@@ -472,10 +473,10 @@ struct sched_dl_entity { | |||
472 | * conditions between the inactive timer handler and the wakeup | 473 | * conditions between the inactive timer handler and the wakeup |
473 | * code. | 474 | * code. |
474 | */ | 475 | */ |
475 | int dl_throttled; | 476 | int dl_throttled : 1; |
476 | int dl_boosted; | 477 | int dl_boosted : 1; |
477 | int dl_yielded; | 478 | int dl_yielded : 1; |
478 | int dl_non_contending; | 479 | int dl_non_contending : 1; |
479 | 480 | ||
480 | /* | 481 | /* |
481 | * Bandwidth enforcement timer. Each -deadline task has its | 482 | * Bandwidth enforcement timer. Each -deadline task has its |
@@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) | |||
1246 | #define TASK_REPORT_IDLE (TASK_REPORT + 1) | 1247 | #define TASK_REPORT_IDLE (TASK_REPORT + 1) |
1247 | #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) | 1248 | #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) |
1248 | 1249 | ||
1249 | static inline unsigned int __get_task_state(struct task_struct *tsk) | 1250 | static inline unsigned int task_state_index(struct task_struct *tsk) |
1250 | { | 1251 | { |
1251 | unsigned int tsk_state = READ_ONCE(tsk->state); | 1252 | unsigned int tsk_state = READ_ONCE(tsk->state); |
1252 | unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; | 1253 | unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; |
@@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk) | |||
1259 | return fls(state); | 1260 | return fls(state); |
1260 | } | 1261 | } |
1261 | 1262 | ||
1262 | static inline char __task_state_to_char(unsigned int state) | 1263 | static inline char task_index_to_char(unsigned int state) |
1263 | { | 1264 | { |
1264 | static const char state_char[] = "RSDTtXZPI"; | 1265 | static const char state_char[] = "RSDTtXZPI"; |
1265 | 1266 | ||
@@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state) | |||
1270 | 1271 | ||
1271 | static inline char task_state_to_char(struct task_struct *tsk) | 1272 | static inline char task_state_to_char(struct task_struct *tsk) |
1272 | { | 1273 | { |
1273 | return __task_state_to_char(__get_task_state(tsk)); | 1274 | return task_index_to_char(task_state_index(tsk)); |
1274 | } | 1275 | } |
1275 | 1276 | ||
1276 | /** | 1277 | /** |
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
new file mode 100644
index 000000000000..d849431c8060
--- /dev/null
+++ b/include/linux/sched/isolation.h
@@ -0,0 +1,51 @@ | |||
1 | #ifndef _LINUX_SCHED_ISOLATION_H | ||
2 | #define _LINUX_SCHED_ISOLATION_H | ||
3 | |||
4 | #include <linux/cpumask.h> | ||
5 | #include <linux/init.h> | ||
6 | #include <linux/tick.h> | ||
7 | |||
8 | enum hk_flags { | ||
9 | HK_FLAG_TIMER = 1, | ||
10 | HK_FLAG_RCU = (1 << 1), | ||
11 | HK_FLAG_MISC = (1 << 2), | ||
12 | HK_FLAG_SCHED = (1 << 3), | ||
13 | HK_FLAG_TICK = (1 << 4), | ||
14 | HK_FLAG_DOMAIN = (1 << 5), | ||
15 | }; | ||
16 | |||
17 | #ifdef CONFIG_CPU_ISOLATION | ||
18 | DECLARE_STATIC_KEY_FALSE(housekeeping_overriden); | ||
19 | extern int housekeeping_any_cpu(enum hk_flags flags); | ||
20 | extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags); | ||
21 | extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags); | ||
22 | extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags); | ||
23 | extern void __init housekeeping_init(void); | ||
24 | |||
25 | #else | ||
26 | |||
27 | static inline int housekeeping_any_cpu(enum hk_flags flags) | ||
28 | { | ||
29 | return smp_processor_id(); | ||
30 | } | ||
31 | |||
32 | static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags) | ||
33 | { | ||
34 | return cpu_possible_mask; | ||
35 | } | ||
36 | |||
37 | static inline void housekeeping_affine(struct task_struct *t, | ||
38 | enum hk_flags flags) { } | ||
39 | static inline void housekeeping_init(void) { } | ||
40 | #endif /* CONFIG_CPU_ISOLATION */ | ||
41 | |||
42 | static inline bool housekeeping_cpu(int cpu, enum hk_flags flags) | ||
43 | { | ||
44 | #ifdef CONFIG_CPU_ISOLATION | ||
45 | if (static_branch_unlikely(&housekeeping_overriden)) | ||
46 | return housekeeping_test_cpu(cpu, flags); | ||
47 | #endif | ||
48 | return true; | ||
49 | } | ||
50 | |||
51 | #endif /* _LINUX_SCHED_ISOLATION_H */ | ||
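The conversions elsewhere in this diff show how the flagged housekeeping API above is meant to be used; the following condensed sketch is illustrative only (example_work and example_flags_usage() are made up, the housekeeping calls mirror real uses in the hunks of this series):

    #include <linux/sched.h>
    #include <linux/sched/isolation.h>
    #include <linux/smp.h>
    #include <linux/workqueue.h>

    static struct work_struct example_work;        /* hypothetical */

    static void example_flags_usage(void)
    {
            /* Affine the current kthread to the RCU housekeeping CPUs,
             * as the kernel/rcu changes in this diff do. */
            housekeeping_affine(current, HK_FLAG_RCU);

            /* Only run periodic "misc" work from a housekeeping CPU. */
            if (!housekeeping_cpu(smp_processor_id(), HK_FLAG_MISC))
                    return;

            /* Queue deferred work on a CPU allowed to handle unbound timers. */
            queue_work_on(housekeeping_any_cpu(HK_FLAG_TIMER), system_wq,
                          &example_work);
    }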
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index db865ed25ef3..e5af028c08b4 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p) | |||
18 | return rt_prio(p->prio); | 18 | return rt_prio(p->prio); |
19 | } | 19 | } |
20 | 20 | ||
21 | static inline bool task_is_realtime(struct task_struct *tsk) | ||
22 | { | ||
23 | int policy = tsk->policy; | ||
24 | |||
25 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
26 | return true; | ||
27 | if (policy == SCHED_DEADLINE) | ||
28 | return true; | ||
29 | return false; | ||
30 | } | ||
31 | |||
21 | #ifdef CONFIG_RT_MUTEXES | 32 | #ifdef CONFIG_RT_MUTEXES |
22 | /* | 33 | /* |
23 | * Must hold either p->pi_lock or task_rq(p)->lock. | 34 | * Must hold either p->pi_lock or task_rq(p)->lock. |
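A hedged sketch of the difference between the existing rt_task() helper and the new task_is_realtime(); the wrapper function is hypothetical:

    #include <linux/sched.h>
    #include <linux/sched/rt.h>

    /*
     * Illustrative only: rt_task() tests the effective priority, so it is
     * also true for a task temporarily boosted through priority
     * inheritance, whereas task_is_realtime() tests only the assigned
     * policy (SCHED_FIFO, SCHED_RR or SCHED_DEADLINE), which is what the
     * ioprio change above relies on.
     */
    static bool example_wants_rt_ioprio(struct task_struct *p)
    {
            return task_is_realtime(p);
    }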
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d6a18a3839cc..1c1a1512ec55 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; | |||
38 | extern unsigned int sysctl_numa_balancing_scan_size; | 38 | extern unsigned int sysctl_numa_balancing_scan_size; |
39 | 39 | ||
40 | #ifdef CONFIG_SCHED_DEBUG | 40 | #ifdef CONFIG_SCHED_DEBUG |
41 | extern unsigned int sysctl_sched_migration_cost; | 41 | extern __read_mostly unsigned int sysctl_sched_migration_cost; |
42 | extern unsigned int sysctl_sched_nr_migrate; | 42 | extern __read_mostly unsigned int sysctl_sched_nr_migrate; |
43 | extern unsigned int sysctl_sched_time_avg; | 43 | extern __read_mostly unsigned int sysctl_sched_time_avg; |
44 | 44 | ||
45 | int sched_proc_update_handler(struct ctl_table *table, int write, | 45 | int sched_proc_update_handler(struct ctl_table *table, int write, |
46 | void __user *buffer, size_t *length, | 46 | void __user *buffer, size_t *length, |
diff --git a/include/linux/tick.h b/include/linux/tick.h
index cf413b344ddb..f442d1a42025 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } | |||
138 | #ifdef CONFIG_NO_HZ_FULL | 138 | #ifdef CONFIG_NO_HZ_FULL |
139 | extern bool tick_nohz_full_running; | 139 | extern bool tick_nohz_full_running; |
140 | extern cpumask_var_t tick_nohz_full_mask; | 140 | extern cpumask_var_t tick_nohz_full_mask; |
141 | extern cpumask_var_t housekeeping_mask; | ||
142 | 141 | ||
143 | static inline bool tick_nohz_full_enabled(void) | 142 | static inline bool tick_nohz_full_enabled(void) |
144 | { | 143 | { |
@@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) | |||
162 | cpumask_or(mask, mask, tick_nohz_full_mask); | 161 | cpumask_or(mask, mask, tick_nohz_full_mask); |
163 | } | 162 | } |
164 | 163 | ||
165 | static inline int housekeeping_any_cpu(void) | ||
166 | { | ||
167 | return cpumask_any_and(housekeeping_mask, cpu_online_mask); | ||
168 | } | ||
169 | |||
170 | extern void tick_nohz_dep_set(enum tick_dep_bits bit); | 164 | extern void tick_nohz_dep_set(enum tick_dep_bits bit); |
171 | extern void tick_nohz_dep_clear(enum tick_dep_bits bit); | 165 | extern void tick_nohz_dep_clear(enum tick_dep_bits bit); |
172 | extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); | 166 | extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); |
@@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, | |||
235 | 229 | ||
236 | extern void tick_nohz_full_kick_cpu(int cpu); | 230 | extern void tick_nohz_full_kick_cpu(int cpu); |
237 | extern void __tick_nohz_task_switch(void); | 231 | extern void __tick_nohz_task_switch(void); |
232 | extern void __init tick_nohz_full_setup(cpumask_var_t cpumask); | ||
238 | #else | 233 | #else |
239 | static inline int housekeeping_any_cpu(void) | ||
240 | { | ||
241 | return smp_processor_id(); | ||
242 | } | ||
243 | static inline bool tick_nohz_full_enabled(void) { return false; } | 234 | static inline bool tick_nohz_full_enabled(void) { return false; } |
244 | static inline bool tick_nohz_full_cpu(int cpu) { return false; } | 235 | static inline bool tick_nohz_full_cpu(int cpu) { return false; } |
245 | static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } | 236 | static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } |
@@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, | |||
259 | 250 | ||
260 | static inline void tick_nohz_full_kick_cpu(int cpu) { } | 251 | static inline void tick_nohz_full_kick_cpu(int cpu) { } |
261 | static inline void __tick_nohz_task_switch(void) { } | 252 | static inline void __tick_nohz_task_switch(void) { } |
253 | static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { } | ||
262 | #endif | 254 | #endif |
263 | 255 | ||
264 | static inline const struct cpumask *housekeeping_cpumask(void) | ||
265 | { | ||
266 | #ifdef CONFIG_NO_HZ_FULL | ||
267 | if (tick_nohz_full_enabled()) | ||
268 | return housekeeping_mask; | ||
269 | #endif | ||
270 | return cpu_possible_mask; | ||
271 | } | ||
272 | |||
273 | static inline bool is_housekeeping_cpu(int cpu) | ||
274 | { | ||
275 | #ifdef CONFIG_NO_HZ_FULL | ||
276 | if (tick_nohz_full_enabled()) | ||
277 | return cpumask_test_cpu(cpu, housekeeping_mask); | ||
278 | #endif | ||
279 | return true; | ||
280 | } | ||
281 | |||
282 | static inline void housekeeping_affine(struct task_struct *t) | ||
283 | { | ||
284 | #ifdef CONFIG_NO_HZ_FULL | ||
285 | if (tick_nohz_full_enabled()) | ||
286 | set_cpus_allowed_ptr(t, housekeeping_mask); | ||
287 | |||
288 | #endif | ||
289 | } | ||
290 | |||
291 | static inline void tick_nohz_task_switch(void) | 256 | static inline void tick_nohz_task_switch(void) |
292 | { | 257 | { |
293 | if (tick_nohz_full_enabled()) | 258 | if (tick_nohz_full_enabled()) |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index da10aa21bebc..306b31de5194 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * | |||
118 | if (preempt) | 118 | if (preempt) |
119 | return TASK_STATE_MAX; | 119 | return TASK_STATE_MAX; |
120 | 120 | ||
121 | return __get_task_state(p); | 121 | return task_state_index(p); |
122 | } | 122 | } |
123 | #endif /* CREATE_TRACE_POINTS */ | 123 | #endif /* CREATE_TRACE_POINTS */ |
124 | 124 | ||
diff --git a/init/Kconfig b/init/Kconfig
index 3c1faaa2af4a..c1fd2863d4ba 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING | |||
472 | 472 | ||
473 | endmenu # "CPU/Task time and stats accounting" | 473 | endmenu # "CPU/Task time and stats accounting" |
474 | 474 | ||
475 | config CPU_ISOLATION | ||
476 | bool "CPU isolation" | ||
477 | help | ||
478 | Make sure that CPUs running critical tasks are not disturbed by | ||
479 | any source of "noise" such as unbound workqueues, timers, kthreads... | ||
480 | Unbound jobs get offloaded to housekeeping CPUs. | ||
481 | |||
475 | source "kernel/rcu/Kconfig" | 482 | source "kernel/rcu/Kconfig" |
476 | 483 | ||
477 | config BUILD_BIN2C | 484 | config BUILD_BIN2C |
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..4610c99ae306 100644
--- a/init/main.c
+++ b/init/main.c
@@ -46,6 +46,7 @@ | |||
46 | #include <linux/cgroup.h> | 46 | #include <linux/cgroup.h> |
47 | #include <linux/efi.h> | 47 | #include <linux/efi.h> |
48 | #include <linux/tick.h> | 48 | #include <linux/tick.h> |
49 | #include <linux/sched/isolation.h> | ||
49 | #include <linux/interrupt.h> | 50 | #include <linux/interrupt.h> |
50 | #include <linux/taskstats_kern.h> | 51 | #include <linux/taskstats_kern.h> |
51 | #include <linux/delayacct.h> | 52 | #include <linux/delayacct.h> |
@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void) | |||
606 | early_irq_init(); | 607 | early_irq_init(); |
607 | init_IRQ(); | 608 | init_IRQ(); |
608 | tick_init(); | 609 | tick_init(); |
610 | housekeeping_init(); | ||
609 | rcu_init_nohz(); | 611 | rcu_init_nohz(); |
610 | init_timers(); | 612 | init_timers(); |
611 | hrtimers_init(); | 613 | hrtimers_init(); |
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@ | |||
57 | #include <linux/backing-dev.h> | 57 | #include <linux/backing-dev.h> |
58 | #include <linux/sort.h> | 58 | #include <linux/sort.h> |
59 | #include <linux/oom.h> | 59 | #include <linux/oom.h> |
60 | 60 | #include <linux/sched/isolation.h> | |
61 | #include <linux/uaccess.h> | 61 | #include <linux/uaccess.h> |
62 | #include <linux/atomic.h> | 62 | #include <linux/atomic.h> |
63 | #include <linux/mutex.h> | 63 | #include <linux/mutex.h> |
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
656 | int csn; /* how many cpuset ptrs in csa so far */ | 656 | int csn; /* how many cpuset ptrs in csa so far */ |
657 | int i, j, k; /* indices for partition finding loops */ | 657 | int i, j, k; /* indices for partition finding loops */ |
658 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ | 658 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ |
659 | cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ | ||
660 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 659 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
661 | int ndoms = 0; /* number of sched domains in result */ | 660 | int ndoms = 0; /* number of sched domains in result */ |
662 | int nslot; /* next empty doms[] struct cpumask slot */ | 661 | int nslot; /* next empty doms[] struct cpumask slot */ |
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
666 | dattr = NULL; | 665 | dattr = NULL; |
667 | csa = NULL; | 666 | csa = NULL; |
668 | 667 | ||
669 | if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) | ||
670 | goto done; | ||
671 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | ||
672 | |||
673 | /* Special case for the 99% of systems with one, full, sched domain */ | 668 | /* Special case for the 99% of systems with one, full, sched domain */ |
674 | if (is_sched_load_balance(&top_cpuset)) { | 669 | if (is_sched_load_balance(&top_cpuset)) { |
675 | ndoms = 1; | 670 | ndoms = 1; |
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
683 | update_domain_attr_tree(dattr, &top_cpuset); | 678 | update_domain_attr_tree(dattr, &top_cpuset); |
684 | } | 679 | } |
685 | cpumask_and(doms[0], top_cpuset.effective_cpus, | 680 | cpumask_and(doms[0], top_cpuset.effective_cpus, |
686 | non_isolated_cpus); | 681 | housekeeping_cpumask(HK_FLAG_DOMAIN)); |
687 | 682 | ||
688 | goto done; | 683 | goto done; |
689 | } | 684 | } |
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
707 | */ | 702 | */ |
708 | if (!cpumask_empty(cp->cpus_allowed) && | 703 | if (!cpumask_empty(cp->cpus_allowed) && |
709 | !(is_sched_load_balance(cp) && | 704 | !(is_sched_load_balance(cp) && |
710 | cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) | 705 | cpumask_intersects(cp->cpus_allowed, |
706 | housekeeping_cpumask(HK_FLAG_DOMAIN)))) | ||
711 | continue; | 707 | continue; |
712 | 708 | ||
713 | if (is_sched_load_balance(cp)) | 709 | if (is_sched_load_balance(cp)) |
@@ -789,7 +785,7 @@ restart: | |||
789 | 785 | ||
790 | if (apn == b->pn) { | 786 | if (apn == b->pn) { |
791 | cpumask_or(dp, dp, b->effective_cpus); | 787 | cpumask_or(dp, dp, b->effective_cpus); |
792 | cpumask_and(dp, dp, non_isolated_cpus); | 788 | cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); |
793 | if (dattr) | 789 | if (dattr) |
794 | update_domain_attr_tree(dattr + nslot, b); | 790 | update_domain_attr_tree(dattr + nslot, b); |
795 | 791 | ||
@@ -802,7 +798,6 @@ restart: | |||
802 | BUG_ON(nslot != ndoms); | 798 | BUG_ON(nslot != ndoms); |
803 | 799 | ||
804 | done: | 800 | done: |
805 | free_cpumask_var(non_isolated_cpus); | ||
806 | kfree(csa); | 801 | kfree(csa); |
807 | 802 | ||
808 | /* | 803 | /* |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dd4d0d390e5b..910405dc6e5c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/sched/debug.h> | 30 | #include <linux/sched/debug.h> |
31 | #include <linux/smpboot.h> | 31 | #include <linux/smpboot.h> |
32 | #include <linux/sched/isolation.h> | ||
32 | #include <uapi/linux/sched/types.h> | 33 | #include <uapi/linux/sched/types.h> |
33 | #include "../time/tick-internal.h" | 34 | #include "../time/tick-internal.h" |
34 | 35 | ||
@@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void) | |||
2587 | 2588 | ||
2588 | if (!tick_nohz_full_enabled()) | 2589 | if (!tick_nohz_full_enabled()) |
2589 | return; | 2590 | return; |
2590 | housekeeping_affine(current); | 2591 | housekeeping_affine(current, HK_FLAG_RCU); |
2591 | } | 2592 | } |
2592 | 2593 | ||
2593 | /* Record the current task on dyntick-idle entry. */ | 2594 | /* Record the current task on dyntick-idle entry. */ |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 27694561f769..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/kthread.h> | 51 | #include <linux/kthread.h> |
52 | #include <linux/tick.h> | 52 | #include <linux/tick.h> |
53 | #include <linux/rcupdate_wait.h> | 53 | #include <linux/rcupdate_wait.h> |
54 | #include <linux/sched/isolation.h> | ||
54 | 55 | ||
55 | #define CREATE_TRACE_POINTS | 56 | #define CREATE_TRACE_POINTS |
56 | 57 | ||
@@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) | |||
714 | LIST_HEAD(rcu_tasks_holdouts); | 715 | LIST_HEAD(rcu_tasks_holdouts); |
715 | 716 | ||
716 | /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ | 717 | /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ |
717 | housekeeping_affine(current); | 718 | housekeeping_affine(current, HK_FLAG_RCU); |
718 | 719 | ||
719 | /* | 720 | /* |
720 | * Each pass through the following loop makes one check for | 721 | * Each pass through the following loop makes one check for |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index a9ee16bbc693..e2f9d4feff40 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | |||
27 | obj-$(CONFIG_CPU_FREQ) += cpufreq.o | 27 | obj-$(CONFIG_CPU_FREQ) += cpufreq.o |
28 | obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o | 28 | obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o |
29 | obj-$(CONFIG_MEMBARRIER) += membarrier.o | 29 | obj-$(CONFIG_MEMBARRIER) += membarrier.o |
30 | obj-$(CONFIG_CPU_ISOLATION) += isolation.o | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9446b2e5eac5..5b82a0073532 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/profile.h> | 26 | #include <linux/profile.h> |
27 | #include <linux/security.h> | 27 | #include <linux/security.h> |
28 | #include <linux/syscalls.h> | 28 | #include <linux/syscalls.h> |
29 | #include <linux/sched/isolation.h> | ||
29 | 30 | ||
30 | #include <asm/switch_to.h> | 31 | #include <asm/switch_to.h> |
31 | #include <asm/tlb.h> | 32 | #include <asm/tlb.h> |
@@ -42,18 +43,21 @@ | |||
42 | 43 | ||
43 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 44 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
44 | 45 | ||
46 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | ||
45 | /* | 47 | /* |
46 | * Debugging: various feature bits | 48 | * Debugging: various feature bits |
49 | * | ||
50 | * If SCHED_DEBUG is disabled, each compilation unit has its own copy of | ||
51 | * sysctl_sched_features, defined in sched.h, to allow constants propagation | ||
52 | * at compile time and compiler optimization based on features default. | ||
47 | */ | 53 | */ |
48 | |||
49 | #define SCHED_FEAT(name, enabled) \ | 54 | #define SCHED_FEAT(name, enabled) \ |
50 | (1UL << __SCHED_FEAT_##name) * enabled | | 55 | (1UL << __SCHED_FEAT_##name) * enabled | |
51 | |||
52 | const_debug unsigned int sysctl_sched_features = | 56 | const_debug unsigned int sysctl_sched_features = |
53 | #include "features.h" | 57 | #include "features.h" |
54 | 0; | 58 | 0; |
55 | |||
56 | #undef SCHED_FEAT | 59 | #undef SCHED_FEAT |
60 | #endif | ||
57 | 61 | ||
58 | /* | 62 | /* |
59 | * Number of tasks to iterate in a single balance run. | 63 | * Number of tasks to iterate in a single balance run. |
@@ -83,9 +87,6 @@ __read_mostly int scheduler_running; | |||
83 | */ | 87 | */ |
84 | int sysctl_sched_rt_runtime = 950000; | 88 | int sysctl_sched_rt_runtime = 950000; |
85 | 89 | ||
86 | /* CPUs with isolated domains */ | ||
87 | cpumask_var_t cpu_isolated_map; | ||
88 | |||
89 | /* | 90 | /* |
90 | * __task_rq_lock - lock the rq @p resides on. | 91 | * __task_rq_lock - lock the rq @p resides on. |
91 | */ | 92 | */ |
@@ -525,7 +526,7 @@ int get_nohz_timer_target(void) | |||
525 | int i, cpu = smp_processor_id(); | 526 | int i, cpu = smp_processor_id(); |
526 | struct sched_domain *sd; | 527 | struct sched_domain *sd; |
527 | 528 | ||
528 | if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) | 529 | if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) |
529 | return cpu; | 530 | return cpu; |
530 | 531 | ||
531 | rcu_read_lock(); | 532 | rcu_read_lock(); |
@@ -534,15 +535,15 @@ int get_nohz_timer_target(void) | |||
534 | if (cpu == i) | 535 | if (cpu == i) |
535 | continue; | 536 | continue; |
536 | 537 | ||
537 | if (!idle_cpu(i) && is_housekeeping_cpu(i)) { | 538 | if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { |
538 | cpu = i; | 539 | cpu = i; |
539 | goto unlock; | 540 | goto unlock; |
540 | } | 541 | } |
541 | } | 542 | } |
542 | } | 543 | } |
543 | 544 | ||
544 | if (!is_housekeeping_cpu(cpu)) | 545 | if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) |
545 | cpu = housekeeping_any_cpu(); | 546 | cpu = housekeeping_any_cpu(HK_FLAG_TIMER); |
546 | unlock: | 547 | unlock: |
547 | rcu_read_unlock(); | 548 | rcu_read_unlock(); |
548 | return cpu; | 549 | return cpu; |
@@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data) | |||
732 | } | 733 | } |
733 | #endif | 734 | #endif |
734 | 735 | ||
735 | static void set_load_weight(struct task_struct *p) | 736 | static void set_load_weight(struct task_struct *p, bool update_load) |
736 | { | 737 | { |
737 | int prio = p->static_prio - MAX_RT_PRIO; | 738 | int prio = p->static_prio - MAX_RT_PRIO; |
738 | struct load_weight *load = &p->se.load; | 739 | struct load_weight *load = &p->se.load; |
@@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p) | |||
746 | return; | 747 | return; |
747 | } | 748 | } |
748 | 749 | ||
749 | load->weight = scale_load(sched_prio_to_weight[prio]); | 750 | /* |
750 | load->inv_weight = sched_prio_to_wmult[prio]; | 751 | * SCHED_OTHER tasks have to update their load when changing their |
752 | * weight | ||
753 | */ | ||
754 | if (update_load && p->sched_class == &fair_sched_class) { | ||
755 | reweight_task(p, prio); | ||
756 | } else { | ||
757 | load->weight = scale_load(sched_prio_to_weight[prio]); | ||
758 | load->inv_weight = sched_prio_to_wmult[prio]; | ||
759 | } | ||
751 | } | 760 | } |
752 | 761 | ||
753 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 762 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2357 | p->static_prio = NICE_TO_PRIO(0); | 2366 | p->static_prio = NICE_TO_PRIO(0); |
2358 | 2367 | ||
2359 | p->prio = p->normal_prio = __normal_prio(p); | 2368 | p->prio = p->normal_prio = __normal_prio(p); |
2360 | set_load_weight(p); | 2369 | set_load_weight(p, false); |
2361 | 2370 | ||
2362 | /* | 2371 | /* |
2363 | * We don't need the reset flag anymore after the fork. It has | 2372 | * We don't need the reset flag anymore after the fork. It has |
@@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3804 | put_prev_task(rq, p); | 3813 | put_prev_task(rq, p); |
3805 | 3814 | ||
3806 | p->static_prio = NICE_TO_PRIO(nice); | 3815 | p->static_prio = NICE_TO_PRIO(nice); |
3807 | set_load_weight(p); | 3816 | set_load_weight(p, true); |
3808 | old_prio = p->prio; | 3817 | old_prio = p->prio; |
3809 | p->prio = effective_prio(p); | 3818 | p->prio = effective_prio(p); |
3810 | delta = p->prio - old_prio; | 3819 | delta = p->prio - old_prio; |
@@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p, | |||
3961 | */ | 3970 | */ |
3962 | p->rt_priority = attr->sched_priority; | 3971 | p->rt_priority = attr->sched_priority; |
3963 | p->normal_prio = normal_prio(p); | 3972 | p->normal_prio = normal_prio(p); |
3964 | set_load_weight(p); | 3973 | set_load_weight(p, true); |
3965 | } | 3974 | } |
3966 | 3975 | ||
3967 | /* Actually do priority change: must hold pi & rq lock. */ | 3976 | /* Actually do priority change: must hold pi & rq lock. */ |
@@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { } | |||
5727 | 5736 | ||
5728 | void __init sched_init_smp(void) | 5737 | void __init sched_init_smp(void) |
5729 | { | 5738 | { |
5730 | cpumask_var_t non_isolated_cpus; | ||
5731 | |||
5732 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | ||
5733 | |||
5734 | sched_init_numa(); | 5739 | sched_init_numa(); |
5735 | 5740 | ||
5736 | /* | 5741 | /* |
@@ -5740,16 +5745,12 @@ void __init sched_init_smp(void) | |||
5740 | */ | 5745 | */ |
5741 | mutex_lock(&sched_domains_mutex); | 5746 | mutex_lock(&sched_domains_mutex); |
5742 | sched_init_domains(cpu_active_mask); | 5747 | sched_init_domains(cpu_active_mask); |
5743 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | ||
5744 | if (cpumask_empty(non_isolated_cpus)) | ||
5745 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | ||
5746 | mutex_unlock(&sched_domains_mutex); | 5748 | mutex_unlock(&sched_domains_mutex); |
5747 | 5749 | ||
5748 | /* Move init over to a non-isolated CPU */ | 5750 | /* Move init over to a non-isolated CPU */ |
5749 | if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) | 5751 | if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) |
5750 | BUG(); | 5752 | BUG(); |
5751 | sched_init_granularity(); | 5753 | sched_init_granularity(); |
5752 | free_cpumask_var(non_isolated_cpus); | ||
5753 | 5754 | ||
5754 | init_sched_rt_class(); | 5755 | init_sched_rt_class(); |
5755 | init_sched_dl_class(); | 5756 | init_sched_dl_class(); |
@@ -5934,7 +5935,7 @@ void __init sched_init(void) | |||
5934 | atomic_set(&rq->nr_iowait, 0); | 5935 | atomic_set(&rq->nr_iowait, 0); |
5935 | } | 5936 | } |
5936 | 5937 | ||
5937 | set_load_weight(&init_task); | 5938 | set_load_weight(&init_task, false); |
5938 | 5939 | ||
5939 | /* | 5940 | /* |
5940 | * The boot idle thread does lazy MMU switching as well: | 5941 | * The boot idle thread does lazy MMU switching as well: |
@@ -5953,9 +5954,6 @@ void __init sched_init(void) | |||
5953 | calc_load_update = jiffies + LOAD_FREQ; | 5954 | calc_load_update = jiffies + LOAD_FREQ; |
5954 | 5955 | ||
5955 | #ifdef CONFIG_SMP | 5956 | #ifdef CONFIG_SMP |
5956 | /* May be allocated at isolcpus cmdline parse time */ | ||
5957 | if (cpu_isolated_map == NULL) | ||
5958 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | ||
5959 | idle_thread_set_boot_cpu(); | 5957 | idle_thread_set_boot_cpu(); |
5960 | set_cpu_rq_start_time(smp_processor_id()); | 5958 | set_cpu_rq_start_time(smp_processor_id()); |
5961 | #endif | 5959 | #endif |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4ae5c1ea90e2..f349f7e98dec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p) | |||
243 | if (p->state == TASK_DEAD) | 243 | if (p->state == TASK_DEAD) |
244 | sub_rq_bw(p->dl.dl_bw, &rq->dl); | 244 | sub_rq_bw(p->dl.dl_bw, &rq->dl); |
245 | raw_spin_lock(&dl_b->lock); | 245 | raw_spin_lock(&dl_b->lock); |
246 | __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); | 246 | __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); |
247 | __dl_clear_params(p); | 247 | __dl_clear_params(p); |
248 | raw_spin_unlock(&dl_b->lock); | 248 | raw_spin_unlock(&dl_b->lock); |
249 | } | 249 | } |
@@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) | |||
1210 | } | 1210 | } |
1211 | 1211 | ||
1212 | raw_spin_lock(&dl_b->lock); | 1212 | raw_spin_lock(&dl_b->lock); |
1213 | __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); | 1213 | __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); |
1214 | raw_spin_unlock(&dl_b->lock); | 1214 | raw_spin_unlock(&dl_b->lock); |
1215 | __dl_clear_params(p); | 1215 | __dl_clear_params(p); |
1216 | 1216 | ||
@@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
1365 | update_dl_entity(dl_se, pi_se); | 1365 | update_dl_entity(dl_se, pi_se); |
1366 | } else if (flags & ENQUEUE_REPLENISH) { | 1366 | } else if (flags & ENQUEUE_REPLENISH) { |
1367 | replenish_dl_entity(dl_se, pi_se); | 1367 | replenish_dl_entity(dl_se, pi_se); |
1368 | } else if ((flags & ENQUEUE_RESTORE) && | ||
1369 | dl_time_before(dl_se->deadline, | ||
1370 | rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { | ||
1371 | setup_new_dl_entity(dl_se); | ||
1368 | } | 1372 | } |
1369 | 1373 | ||
1370 | __enqueue_dl_entity(dl_se); | 1374 | __enqueue_dl_entity(dl_se); |
@@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
2167 | * until we complete the update. | 2171 | * until we complete the update. |
2168 | */ | 2172 | */ |
2169 | raw_spin_lock(&src_dl_b->lock); | 2173 | raw_spin_lock(&src_dl_b->lock); |
2170 | __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); | 2174 | __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); |
2171 | raw_spin_unlock(&src_dl_b->lock); | 2175 | raw_spin_unlock(&src_dl_b->lock); |
2172 | } | 2176 | } |
2173 | 2177 | ||
@@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
2256 | 2260 | ||
2257 | return; | 2261 | return; |
2258 | } | 2262 | } |
2259 | /* | ||
2260 | * If p is boosted we already updated its params in | ||
2261 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), | ||
2262 | * p's deadline being now already after rq_clock(rq). | ||
2263 | */ | ||
2264 | if (dl_time_before(p->dl.deadline, rq_clock(rq))) | ||
2265 | setup_new_dl_entity(&p->dl); | ||
2266 | 2263 | ||
2267 | if (rq->curr != p) { | 2264 | if (rq->curr != p) { |
2268 | #ifdef CONFIG_SMP | 2265 | #ifdef CONFIG_SMP |
@@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, | |||
2452 | if (dl_policy(policy) && !task_has_dl_policy(p) && | 2449 | if (dl_policy(policy) && !task_has_dl_policy(p) && |
2453 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | 2450 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { |
2454 | if (hrtimer_active(&p->dl.inactive_timer)) | 2451 | if (hrtimer_active(&p->dl.inactive_timer)) |
2455 | __dl_clear(dl_b, p->dl.dl_bw, cpus); | 2452 | __dl_sub(dl_b, p->dl.dl_bw, cpus); |
2456 | __dl_add(dl_b, new_bw, cpus); | 2453 | __dl_add(dl_b, new_bw, cpus); |
2457 | err = 0; | 2454 | err = 0; |
2458 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | 2455 | } else if (dl_policy(policy) && task_has_dl_policy(p) && |
@@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, | |||
2464 | * But this would require to set the task's "inactive | 2461 | * But this would require to set the task's "inactive |
2465 | * timer" when the task is not inactive. | 2462 | * timer" when the task is not inactive. |
2466 | */ | 2463 | */ |
2467 | __dl_clear(dl_b, p->dl.dl_bw, cpus); | 2464 | __dl_sub(dl_b, p->dl.dl_bw, cpus); |
2468 | __dl_add(dl_b, new_bw, cpus); | 2465 | __dl_add(dl_b, new_bw, cpus); |
2469 | dl_change_utilization(p, new_bw); | 2466 | dl_change_utilization(p, new_bw); |
2470 | err = 0; | 2467 | err = 0; |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
441 | P_SCHEDSTAT(se->statistics.wait_count); | 441 | P_SCHEDSTAT(se->statistics.wait_count); |
442 | } | 442 | } |
443 | P(se->load.weight); | 443 | P(se->load.weight); |
444 | P(se->runnable_weight); | ||
444 | #ifdef CONFIG_SMP | 445 | #ifdef CONFIG_SMP |
445 | P(se->avg.load_avg); | 446 | P(se->avg.load_avg); |
446 | P(se->avg.util_avg); | 447 | P(se->avg.util_avg); |
448 | P(se->avg.runnable_load_avg); | ||
447 | #endif | 449 | #endif |
448 | 450 | ||
449 | #undef PN_SCHEDSTAT | 451 | #undef PN_SCHEDSTAT |
@@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
558 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); | 560 | SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); |
559 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | 561 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); |
560 | #ifdef CONFIG_SMP | 562 | #ifdef CONFIG_SMP |
563 | SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); | ||
561 | SEQ_printf(m, " .%-30s: %lu\n", "load_avg", | 564 | SEQ_printf(m, " .%-30s: %lu\n", "load_avg", |
562 | cfs_rq->avg.load_avg); | 565 | cfs_rq->avg.load_avg); |
563 | SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", | 566 | SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", |
564 | cfs_rq->runnable_load_avg); | 567 | cfs_rq->avg.runnable_load_avg); |
565 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", | 568 | SEQ_printf(m, " .%-30s: %lu\n", "util_avg", |
566 | cfs_rq->avg.util_avg); | 569 | cfs_rq->avg.util_avg); |
567 | SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", | 570 | SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", |
568 | atomic_long_read(&cfs_rq->removed_load_avg)); | 571 | cfs_rq->removed.load_avg); |
569 | SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", | 572 | SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", |
570 | atomic_long_read(&cfs_rq->removed_util_avg)); | 573 | cfs_rq->removed.util_avg); |
574 | SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", | ||
575 | cfs_rq->removed.runnable_sum); | ||
571 | #ifdef CONFIG_FAIR_GROUP_SCHED | 576 | #ifdef CONFIG_FAIR_GROUP_SCHED |
572 | SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", | 577 | SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", |
573 | cfs_rq->tg_load_avg_contrib); | 578 | cfs_rq->tg_load_avg_contrib); |
@@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, | |||
1004 | "nr_involuntary_switches", (long long)p->nivcsw); | 1009 | "nr_involuntary_switches", (long long)p->nivcsw); |
1005 | 1010 | ||
1006 | P(se.load.weight); | 1011 | P(se.load.weight); |
1012 | P(se.runnable_weight); | ||
1007 | #ifdef CONFIG_SMP | 1013 | #ifdef CONFIG_SMP |
1008 | P(se.avg.load_sum); | 1014 | P(se.avg.load_sum); |
1015 | P(se.avg.runnable_load_sum); | ||
1009 | P(se.avg.util_sum); | 1016 | P(se.avg.util_sum); |
1010 | P(se.avg.load_avg); | 1017 | P(se.avg.load_avg); |
1018 | P(se.avg.runnable_load_avg); | ||
1011 | P(se.avg.util_avg); | 1019 | P(se.avg.util_avg); |
1012 | P(se.avg.last_update_time); | 1020 | P(se.avg.last_update_time); |
1013 | #endif | 1021 | #endif |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c09ddf8c832..0989676c50e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/mempolicy.h> | 33 | #include <linux/mempolicy.h> |
34 | #include <linux/migrate.h> | 34 | #include <linux/migrate.h> |
35 | #include <linux/task_work.h> | 35 | #include <linux/task_work.h> |
36 | #include <linux/sched/isolation.h> | ||
36 | 37 | ||
37 | #include <trace/events/sched.h> | 38 | #include <trace/events/sched.h> |
38 | 39 | ||
@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
717 | { | 718 | { |
718 | struct sched_avg *sa = &se->avg; | 719 | struct sched_avg *sa = &se->avg; |
719 | 720 | ||
720 | sa->last_update_time = 0; | 721 | memset(sa, 0, sizeof(*sa)); |
721 | /* | 722 | |
722 | * sched_avg's period_contrib should be strictly less then 1024, so | ||
723 | * we give it 1023 to make sure it is almost a period (1024us), and | ||
724 | * will definitely be update (after enqueue). | ||
725 | */ | ||
726 | sa->period_contrib = 1023; | ||
727 | /* | 723 | /* |
728 | * Tasks are intialized with full load to be seen as heavy tasks until | 724 | * Tasks are intialized with full load to be seen as heavy tasks until |
729 | * they get a chance to stabilize to their real load level. | 725 | * they get a chance to stabilize to their real load level. |
@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
731 | * nothing has been attached to the task group yet. | 727 | * nothing has been attached to the task group yet. |
732 | */ | 728 | */ |
733 | if (entity_is_task(se)) | 729 | if (entity_is_task(se)) |
734 | sa->load_avg = scale_load_down(se->load.weight); | 730 | sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); |
735 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 731 | |
736 | /* | 732 | se->runnable_weight = se->load.weight; |
737 | * At this point, util_avg won't be used in select_task_rq_fair anyway | 733 | |
738 | */ | ||
739 | sa->util_avg = 0; | ||
740 | sa->util_sum = 0; | ||
741 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 734 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
742 | } | 735 | } |
743 | 736 | ||
@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
785 | } else { | 778 | } else { |
786 | sa->util_avg = cap; | 779 | sa->util_avg = cap; |
787 | } | 780 | } |
788 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; | ||
789 | } | 781 | } |
790 | 782 | ||
791 | if (entity_is_task(se)) { | 783 | if (entity_is_task(se)) { |
@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
2026 | delta = runtime - p->last_sum_exec_runtime; | 2018 | delta = runtime - p->last_sum_exec_runtime; |
2027 | *period = now - p->last_task_numa_placement; | 2019 | *period = now - p->last_task_numa_placement; |
2028 | } else { | 2020 | } else { |
2029 | delta = p->se.avg.load_sum / p->se.load.weight; | 2021 | delta = p->se.avg.load_sum; |
2030 | *period = LOAD_AVG_MAX; | 2022 | *period = LOAD_AVG_MAX; |
2031 | } | 2023 | } |
2032 | 2024 | ||
@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2693 | cfs_rq->nr_running--; | 2685 | cfs_rq->nr_running--; |
2694 | } | 2686 | } |
2695 | 2687 | ||
2688 | /* | ||
2689 | * Signed add and clamp on underflow. | ||
2690 | * | ||
2691 | * Explicitly do a load-store to ensure the intermediate value never hits | ||
2692 | * memory. This allows lockless observations without ever seeing the negative | ||
2693 | * values. | ||
2694 | */ | ||
2695 | #define add_positive(_ptr, _val) do { \ | ||
2696 | typeof(_ptr) ptr = (_ptr); \ | ||
2697 | typeof(_val) val = (_val); \ | ||
2698 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
2699 | \ | ||
2700 | res = var + val; \ | ||
2701 | \ | ||
2702 | if (val < 0 && res > var) \ | ||
2703 | res = 0; \ | ||
2704 | \ | ||
2705 | WRITE_ONCE(*ptr, res); \ | ||
2706 | } while (0) | ||
2707 | |||
2708 | /* | ||
2709 | * Unsigned subtract and clamp on underflow. | ||
2710 | * | ||
2711 | * Explicitly do a load-store to ensure the intermediate value never hits | ||
2712 | * memory. This allows lockless observations without ever seeing the negative | ||
2713 | * values. | ||
2714 | */ | ||
2715 | #define sub_positive(_ptr, _val) do { \ | ||
2716 | typeof(_ptr) ptr = (_ptr); \ | ||
2717 | typeof(*ptr) val = (_val); \ | ||
2718 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
2719 | res = var - val; \ | ||
2720 | if (res > var) \ | ||
2721 | res = 0; \ | ||
2722 | WRITE_ONCE(*ptr, res); \ | ||
2723 | } while (0) | ||
2724 | |||
2725 | #ifdef CONFIG_SMP | ||
2726 | /* | ||
2727 | * XXX we want to get rid of these helpers and use the full load resolution. | ||
2728 | */ | ||
2729 | static inline long se_weight(struct sched_entity *se) | ||
2730 | { | ||
2731 | return scale_load_down(se->load.weight); | ||
2732 | } | ||
2733 | |||
2734 | static inline long se_runnable(struct sched_entity *se) | ||
2735 | { | ||
2736 | return scale_load_down(se->runnable_weight); | ||
2737 | } | ||
2738 | |||
2739 | static inline void | ||
2740 | enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2741 | { | ||
2742 | cfs_rq->runnable_weight += se->runnable_weight; | ||
2743 | |||
2744 | cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; | ||
2745 | cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; | ||
2746 | } | ||
2747 | |||
2748 | static inline void | ||
2749 | dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2750 | { | ||
2751 | cfs_rq->runnable_weight -= se->runnable_weight; | ||
2752 | |||
2753 | sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); | ||
2754 | sub_positive(&cfs_rq->avg.runnable_load_sum, | ||
2755 | se_runnable(se) * se->avg.runnable_load_sum); | ||
2756 | } | ||
2757 | |||
2758 | static inline void | ||
2759 | enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2760 | { | ||
2761 | cfs_rq->avg.load_avg += se->avg.load_avg; | ||
2762 | cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; | ||
2763 | } | ||
2764 | |||
2765 | static inline void | ||
2766 | dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2767 | { | ||
2768 | sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); | ||
2769 | sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); | ||
2770 | } | ||
2771 | #else | ||
2772 | static inline void | ||
2773 | enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
2774 | static inline void | ||
2775 | dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
2776 | static inline void | ||
2777 | enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
2778 | static inline void | ||
2779 | dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | ||
2780 | #endif | ||
2781 | |||
2782 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
2783 | unsigned long weight, unsigned long runnable) | ||
2784 | { | ||
2785 | if (se->on_rq) { | ||
2786 | /* commit outstanding execution time */ | ||
2787 | if (cfs_rq->curr == se) | ||
2788 | update_curr(cfs_rq); | ||
2789 | account_entity_dequeue(cfs_rq, se); | ||
2790 | dequeue_runnable_load_avg(cfs_rq, se); | ||
2791 | } | ||
2792 | dequeue_load_avg(cfs_rq, se); | ||
2793 | |||
2794 | se->runnable_weight = runnable; | ||
2795 | update_load_set(&se->load, weight); | ||
2796 | |||
2797 | #ifdef CONFIG_SMP | ||
2798 | do { | ||
2799 | u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; | ||
2800 | |||
2801 | se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); | ||
2802 | se->avg.runnable_load_avg = | ||
2803 | div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider); | ||
2804 | } while (0); | ||
2805 | #endif | ||
2806 | |||
2807 | enqueue_load_avg(cfs_rq, se); | ||
2808 | if (se->on_rq) { | ||
2809 | account_entity_enqueue(cfs_rq, se); | ||
2810 | enqueue_runnable_load_avg(cfs_rq, se); | ||
2811 | } | ||
2812 | } | ||
2813 | |||
2814 | void reweight_task(struct task_struct *p, int prio) | ||
2815 | { | ||
2816 | struct sched_entity *se = &p->se; | ||
2817 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
2818 | struct load_weight *load = &se->load; | ||
2819 | unsigned long weight = scale_load(sched_prio_to_weight[prio]); | ||
2820 | |||
2821 | reweight_entity(cfs_rq, se, weight, weight); | ||
2822 | load->inv_weight = sched_prio_to_wmult[prio]; | ||
2823 | } | ||
2824 | |||
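reweight_task() simply looks the new weight up in the priority table and pushes it through reweight_entity(). As a reminder of what that table encodes, a tiny sketch reproducing a handful of entries around nice 0 of the kernel's sched_prio_to_weight[] (each nice step changes the weight by roughly 1.25x, with nice 0 pinned at 1024):

#include <stdio.h>

/* Excerpt of sched_prio_to_weight[] for nice -2..+2 (nice 0 == 1024). */
static const int nice_to_weight[] = { 1586, 1277, 1024, 820, 655 };

int main(void)
{
        for (int nice = -2; nice <= 2; nice++)
                printf("nice %+d -> load.weight %d\n",
                       nice, nice_to_weight[nice + 2]);
        return 0;
}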
2696 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2825 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2697 | # ifdef CONFIG_SMP | 2826 | # ifdef CONFIG_SMP |
2698 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 2827 | /* |
2828 | * All this does is approximate the hierarchical proportion which includes that | ||
2829 | * global sum we all love to hate. | ||
2830 | * | ||
2831 | * That is, the weight of a group entity is the proportional share of the | ||
2832 | * group weight based on the group runqueue weights. That is: | ||
2833 | * | ||
2834 | * tg->weight * grq->load.weight | ||
2835 | * ge->load.weight = ----------------------------- (1) | ||
2836 | * \Sum grq->load.weight | ||
2837 | * | ||
2838 | * Now, because that sum is prohibitively expensive to compute (been | ||
2839 | * there, done that) we approximate it with this average stuff. The average | ||
2840 | * moves slower and therefore the approximation is cheaper and more stable. | ||
2841 | * | ||
2842 | * So instead of the above, we substitute: | ||
2843 | * | ||
2844 | * grq->load.weight -> grq->avg.load_avg (2) | ||
2845 | * | ||
2846 | * which yields the following: | ||
2847 | * | ||
2848 | * tg->weight * grq->avg.load_avg | ||
2849 | * ge->load.weight = ------------------------------ (3) | ||
2850 | * tg->load_avg | ||
2851 | * | ||
2852 | * Where: tg->load_avg ~= \Sum grq->avg.load_avg | ||
2853 | * | ||
2854 | * That is shares_avg, and it is right (given the approximation (2)). | ||
2855 | * | ||
2856 | * The problem with it is that because the average is slow -- it was designed | ||
2857 | * to be exactly that of course -- this leads to transients in boundary | ||
2858 | * conditions. Specifically, the case where the group was idle and we start the | ||
2859 | * one task. It takes time for our CPU's grq->avg.load_avg to build up, | ||
2860 | * yielding bad latency etc.. | ||
2861 | * | ||
2862 | * Now, in that special case (1) reduces to: | ||
2863 | * | ||
2864 | * tg->weight * grq->load.weight | ||
2865 | * ge->load.weight = ----------------------------- = tg->weight (4) | ||
2866 | * grq->load.weight | ||
2867 | * | ||
2868 | * That is, the sum collapses because all other CPUs are idle; the UP scenario. | ||
2869 | * | ||
2870 | * So what we do is modify our approximation (3) to approach (4) in the (near) | ||
2871 | * UP case, like: | ||
2872 | * | ||
2873 | * ge->load.weight = | ||
2874 | * | ||
2875 | * tg->weight * grq->load.weight | ||
2876 | * --------------------------------------------------- (5) | ||
2877 | * tg->load_avg - grq->avg.load_avg + grq->load.weight | ||
2878 | * | ||
2879 | * But because grq->load.weight can drop to 0, resulting in a divide by zero, | ||
2880 | * we need to use grq->avg.load_avg as its lower bound, which then gives: | ||
2881 | * | ||
2882 | * | ||
2883 | * tg->weight * grq->load.weight | ||
2884 | * ge->load.weight = ----------------------------- (6) | ||
2885 | * tg_load_avg' | ||
2886 | * | ||
2887 | * Where: | ||
2888 | * | ||
2889 | * tg_load_avg' = tg->load_avg - grq->avg.load_avg + | ||
2890 | * max(grq->load.weight, grq->avg.load_avg) | ||
2891 | * | ||
2892 | * And that is shares_weight and is icky. In the (near) UP case it approaches | ||
2893 | * (4) while in the normal case it approaches (3). It consistently | ||
2894 | * overestimates the ge->load.weight and therefore: | ||
2895 | * | ||
2896 | * \Sum ge->load.weight >= tg->weight | ||
2897 | * | ||
2898 | * hence icky! | ||
2899 | */ | ||
2900 | static long calc_group_shares(struct cfs_rq *cfs_rq) | ||
2699 | { | 2901 | { |
2700 | long tg_weight, load, shares; | 2902 | long tg_weight, tg_shares, load, shares; |
2903 | struct task_group *tg = cfs_rq->tg; | ||
2701 | 2904 | ||
2702 | /* | 2905 | tg_shares = READ_ONCE(tg->shares); |
2703 | * This really should be: cfs_rq->avg.load_avg, but instead we use | 2906 | |
2704 | * cfs_rq->load.weight, which is its upper bound. This helps ramp up | 2907 | load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg); |
2705 | * the shares for small weight interactive tasks. | ||
2706 | */ | ||
2707 | load = scale_load_down(cfs_rq->load.weight); | ||
2708 | 2908 | ||
2709 | tg_weight = atomic_long_read(&tg->load_avg); | 2909 | tg_weight = atomic_long_read(&tg->load_avg); |
2710 | 2910 | ||
@@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2712 | tg_weight -= cfs_rq->tg_load_avg_contrib; | 2912 | tg_weight -= cfs_rq->tg_load_avg_contrib; |
2713 | tg_weight += load; | 2913 | tg_weight += load; |
2714 | 2914 | ||
2715 | shares = (tg->shares * load); | 2915 | shares = (tg_shares * load); |
2716 | if (tg_weight) | 2916 | if (tg_weight) |
2717 | shares /= tg_weight; | 2917 | shares /= tg_weight; |
2718 | 2918 | ||
@@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2728 | * case no task is runnable on a CPU MIN_SHARES=2 should be returned | 2928 | * case no task is runnable on a CPU MIN_SHARES=2 should be returned |
2729 | * instead of 0. | 2929 | * instead of 0. |
2730 | */ | 2930 | */ |
2731 | if (shares < MIN_SHARES) | 2931 | return clamp_t(long, shares, MIN_SHARES, tg_shares); |
2732 | shares = MIN_SHARES; | ||
2733 | if (shares > tg->shares) | ||
2734 | shares = tg->shares; | ||
2735 | |||
2736 | return shares; | ||
2737 | } | ||
2738 | # else /* CONFIG_SMP */ | ||
2739 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | ||
2740 | { | ||
2741 | return tg->shares; | ||
2742 | } | 2932 | } |
2743 | # endif /* CONFIG_SMP */ | ||
2744 | 2933 | ||
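A standalone toy model of approximation (6) above, with made-up numbers and plain long arithmetic instead of the kernel's scale_load()/atomic types (clampl(), group_shares() and all values are illustrative stand-ins):

#include <stdio.h>

#define MIN_SHARES 2L

static long clampl(long v, long lo, long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/*
 * tg_shares:    configured group weight (tg->shares)
 * tg_load_avg:  \Sum grq->avg.load_avg over all CPUs
 * grq_contrib:  this CPU's last contribution to that sum
 * grq_weight:   this CPU's instantaneous grq->load.weight
 * grq_load_avg: this CPU's grq->avg.load_avg
 */
static long group_shares(long tg_shares, long tg_load_avg, long grq_contrib,
                         long grq_weight, long grq_load_avg)
{
        long load = grq_weight > grq_load_avg ? grq_weight : grq_load_avg;
        long tg_weight = tg_load_avg - grq_contrib + load;
        long shares = tg_shares * load;

        if (tg_weight)
                shares /= tg_weight;

        return clampl(shares, MIN_SHARES, tg_shares);
}

int main(void)
{
        /* Group idle everywhere, one task just woke on this CPU. */
        printf("from idle: %ld\n", group_shares(1024, 0, 0, 1024, 0));
        /* Steady state: two busy CPUs with equal average load. */
        printf("steady:    %ld\n", group_shares(1024, 2048, 1024, 1024, 1024));
        return 0;
}

In the from-idle case the instantaneous weight dominates the denominator and the result collapses toward tg->shares, exactly as (4) predicts; in the steady two-CPU case each group entity ends up with roughly half the shares, as (3) would give.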
2745 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 2934 | /* |
2746 | unsigned long weight) | 2935 | * This calculates the effective runnable weight for a group entity based on |
2936 | * the group entity weight calculated above. | ||
2937 | * | ||
2938 | * Because of the above approximation (2), our group entity weight is | ||
2939 | * a load_avg based ratio (3). This means that it includes blocked load and | ||
2940 | * does not represent the runnable weight. | ||
2941 | * | ||
2942 | * Approximate the group entity's runnable weight per ratio from the group | ||
2943 | * runqueue: | ||
2944 | * | ||
2945 | * grq->avg.runnable_load_avg | ||
2946 | * ge->runnable_weight = ge->load.weight * -------------------------- (7) | ||
2947 | * grq->avg.load_avg | ||
2948 | * | ||
2949 | * However, analogous to above, since the avg numbers are slow, this leads to | ||
2950 | * transients in the from-idle case. Instead we use: | ||
2951 | * | ||
2952 | * ge->runnable_weight = ge->load.weight * | ||
2953 | * | ||
2954 | * max(grq->avg.runnable_load_avg, grq->runnable_weight) | ||
2955 | * ----------------------------------------------------- (8) | ||
2956 | * max(grq->avg.load_avg, grq->load.weight) | ||
2957 | * | ||
2958 | * Where these max() serve both to use the 'instant' values to fix the slow | ||
2959 | * from-idle and avoid the /0 on to-idle, similar to (6). | ||
2960 | */ | ||
2961 | static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) | ||
2747 | { | 2962 | { |
2748 | if (se->on_rq) { | 2963 | long runnable, load_avg; |
2749 | /* commit outstanding execution time */ | ||
2750 | if (cfs_rq->curr == se) | ||
2751 | update_curr(cfs_rq); | ||
2752 | account_entity_dequeue(cfs_rq, se); | ||
2753 | } | ||
2754 | 2964 | ||
2755 | update_load_set(&se->load, weight); | 2965 | load_avg = max(cfs_rq->avg.load_avg, |
2966 | scale_load_down(cfs_rq->load.weight)); | ||
2756 | 2967 | ||
2757 | if (se->on_rq) | 2968 | runnable = max(cfs_rq->avg.runnable_load_avg, |
2758 | account_entity_enqueue(cfs_rq, se); | 2969 | scale_load_down(cfs_rq->runnable_weight)); |
2970 | |||
2971 | runnable *= shares; | ||
2972 | if (load_avg) | ||
2973 | runnable /= load_avg; | ||
2974 | |||
2975 | return clamp_t(long, runnable, MIN_SHARES, shares); | ||
2759 | } | 2976 | } |
2977 | # endif /* CONFIG_SMP */ | ||
2760 | 2978 | ||
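And a matching toy version of approximation (8): the group entity's runnable weight is its shares scaled by the runnable fraction of the group runqueue, again using max(average, instantaneous) on both sides. Values are illustrative only, and the [MIN_SHARES, shares] clamp is left to a comment:

#include <stdio.h>

static long max2(long a, long b) { return a > b ? a : b; }

static long group_runnable(long shares,
                           long grq_runnable_load_avg, long grq_runnable_weight,
                           long grq_load_avg, long grq_weight)
{
        long runnable = max2(grq_runnable_load_avg, grq_runnable_weight);
        long load     = max2(grq_load_avg, grq_weight);

        runnable *= shares;
        if (load)
                runnable /= load;
        return runnable;        /* the kernel also clamps to [MIN_SHARES, shares] */
}

int main(void)
{
        /* Half of the group's tracked load is currently runnable. */
        printf("%ld\n", group_runnable(1024, 512, 512, 1024, 1024));    /* 512 */
        return 0;
}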
2761 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2979 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
2762 | 2980 | ||
2763 | static void update_cfs_shares(struct sched_entity *se) | 2981 | /* |
2982 | * Recomputes the group entity based on the current state of its group | ||
2983 | * runqueue. | ||
2984 | */ | ||
2985 | static void update_cfs_group(struct sched_entity *se) | ||
2764 | { | 2986 | { |
2765 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | 2987 | struct cfs_rq *gcfs_rq = group_cfs_rq(se); |
2766 | struct task_group *tg; | 2988 | long shares, runnable; |
2767 | long shares; | ||
2768 | 2989 | ||
2769 | if (!cfs_rq) | 2990 | if (!gcfs_rq) |
2770 | return; | 2991 | return; |
2771 | 2992 | ||
2772 | if (throttled_hierarchy(cfs_rq)) | 2993 | if (throttled_hierarchy(gcfs_rq)) |
2773 | return; | 2994 | return; |
2774 | 2995 | ||
2775 | tg = cfs_rq->tg; | ||
2776 | |||
2777 | #ifndef CONFIG_SMP | 2996 | #ifndef CONFIG_SMP |
2778 | if (likely(se->load.weight == tg->shares)) | 2997 | runnable = shares = READ_ONCE(gcfs_rq->tg->shares); |
2998 | |||
2999 | if (likely(se->load.weight == shares)) | ||
2779 | return; | 3000 | return; |
3001 | #else | ||
3002 | shares = calc_group_shares(gcfs_rq); | ||
3003 | runnable = calc_group_runnable(gcfs_rq, shares); | ||
2780 | #endif | 3004 | #endif |
2781 | shares = calc_cfs_shares(cfs_rq, tg); | ||
2782 | 3005 | ||
2783 | reweight_entity(cfs_rq_of(se), se, shares); | 3006 | reweight_entity(cfs_rq_of(se), se, shares, runnable); |
2784 | } | 3007 | } |
2785 | 3008 | ||
2786 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 3009 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
2787 | static inline void update_cfs_shares(struct sched_entity *se) | 3010 | static inline void update_cfs_group(struct sched_entity *se) |
2788 | { | 3011 | { |
2789 | } | 3012 | } |
2790 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 3013 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) | |||
2893 | */ | 3116 | */ |
2894 | static __always_inline u32 | 3117 | static __always_inline u32 |
2895 | accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | 3118 | accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, |
2896 | unsigned long weight, int running, struct cfs_rq *cfs_rq) | 3119 | unsigned long load, unsigned long runnable, int running) |
2897 | { | 3120 | { |
2898 | unsigned long scale_freq, scale_cpu; | 3121 | unsigned long scale_freq, scale_cpu; |
2899 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ | 3122 | u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ |
@@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | |||
2910 | */ | 3133 | */ |
2911 | if (periods) { | 3134 | if (periods) { |
2912 | sa->load_sum = decay_load(sa->load_sum, periods); | 3135 | sa->load_sum = decay_load(sa->load_sum, periods); |
2913 | if (cfs_rq) { | 3136 | sa->runnable_load_sum = |
2914 | cfs_rq->runnable_load_sum = | 3137 | decay_load(sa->runnable_load_sum, periods); |
2915 | decay_load(cfs_rq->runnable_load_sum, periods); | ||
2916 | } | ||
2917 | sa->util_sum = decay_load((u64)(sa->util_sum), periods); | 3138 | sa->util_sum = decay_load((u64)(sa->util_sum), periods); |
2918 | 3139 | ||
2919 | /* | 3140 | /* |
@@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | |||
2926 | sa->period_contrib = delta; | 3147 | sa->period_contrib = delta; |
2927 | 3148 | ||
2928 | contrib = cap_scale(contrib, scale_freq); | 3149 | contrib = cap_scale(contrib, scale_freq); |
2929 | if (weight) { | 3150 | if (load) |
2930 | sa->load_sum += weight * contrib; | 3151 | sa->load_sum += load * contrib; |
2931 | if (cfs_rq) | 3152 | if (runnable) |
2932 | cfs_rq->runnable_load_sum += weight * contrib; | 3153 | sa->runnable_load_sum += runnable * contrib; |
2933 | } | ||
2934 | if (running) | 3154 | if (running) |
2935 | sa->util_sum += contrib * scale_cpu; | 3155 | sa->util_sum += contrib * scale_cpu; |
2936 | 3156 | ||
@@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, | |||
2966 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 3186 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] |
2967 | */ | 3187 | */ |
2968 | static __always_inline int | 3188 | static __always_inline int |
2969 | ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, | 3189 | ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, |
2970 | unsigned long weight, int running, struct cfs_rq *cfs_rq) | 3190 | unsigned long load, unsigned long runnable, int running) |
2971 | { | 3191 | { |
2972 | u64 delta; | 3192 | u64 delta; |
2973 | 3193 | ||
@@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
3000 | * this happens during idle_balance() which calls | 3220 | * this happens during idle_balance() which calls |
3001 | * update_blocked_averages() | 3221 | * update_blocked_averages() |
3002 | */ | 3222 | */ |
3003 | if (!weight) | 3223 | if (!load) |
3004 | running = 0; | 3224 | runnable = running = 0; |
3005 | 3225 | ||
3006 | /* | 3226 | /* |
3007 | * Now we know we crossed measurement unit boundaries. The *_avg | 3227 | * Now we know we crossed measurement unit boundaries. The *_avg |
@@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
3010 | * Step 1: accumulate *_sum since last_update_time. If we haven't | 3230 | * Step 1: accumulate *_sum since last_update_time. If we haven't |
3011 | * crossed period boundaries, finish. | 3231 | * crossed period boundaries, finish. |
3012 | */ | 3232 | */ |
3013 | if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) | 3233 | if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) |
3014 | return 0; | 3234 | return 0; |
3015 | 3235 | ||
3236 | return 1; | ||
3237 | } | ||
3238 | |||
3239 | static __always_inline void | ||
3240 | ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) | ||
3241 | { | ||
3242 | u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; | ||
3243 | |||
3016 | /* | 3244 | /* |
3017 | * Step 2: update *_avg. | 3245 | * Step 2: update *_avg. |
3018 | */ | 3246 | */ |
3019 | sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); | 3247 | sa->load_avg = div_u64(load * sa->load_sum, divider); |
3020 | if (cfs_rq) { | 3248 | sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider); |
3021 | cfs_rq->runnable_load_avg = | 3249 | sa->util_avg = sa->util_sum / divider; |
3022 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); | ||
3023 | } | ||
3024 | sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); | ||
3025 | |||
3026 | return 1; | ||
3027 | } | 3250 | } |
3028 | 3251 | ||
3252 | /* | ||
3253 | * sched_entity: | ||
3254 | * | ||
3255 | * task: | ||
3256 | * se_runnable() == se_weight() | ||
3257 | * | ||
3258 | * group: [ see update_cfs_group() ] | ||
3259 | * se_weight() = tg->weight * grq->load_avg / tg->load_avg | ||
3260 | * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg | ||
3261 | * | ||
3262 | * load_sum := runnable_sum | ||
3263 | * load_avg = se_weight(se) * runnable_avg | ||
3264 | * | ||
3265 | * runnable_load_sum := runnable_sum | ||
3266 | * runnable_load_avg = se_runnable(se) * runnable_avg | ||
3267 | * | ||
3268 | * XXX collapse load_sum and runnable_load_sum | ||
3269 | * | ||
3270 | * cfs_rq: | ||
3271 | * | ||
3272 | * load_sum = \Sum se_weight(se) * se->avg.load_sum | ||
3273 | * load_avg = \Sum se->avg.load_avg | ||
3274 | * | ||
3275 | * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum | ||
3276 | * runnable_load_avg = \Sum se->avg.runnable_load_avg | ||
3277 | */ | ||
3278 | |||
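A rough, self-contained model of the sum/avg split that ___update_load_sum()/___update_load_avg() implement: the *_sum values are geometrically decayed accumulations of 1024us segments, and the avg step only scales them by the weight and divides by the series maximum. The y^32 = 0.5 decay is approximated in floating point here, unlike the kernel's fixed-point tables:

#include <stdio.h>
#include <math.h>

#define LOAD_AVG_MAX 47742      /* max possible *_sum: \Sum 1024*y^n */

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* per-1024us-period decay, y^32 = 0.5 */
        double running_sum = 0.0;
        unsigned long weight = 1024;            /* se_weight() of a nice-0 task */

        /* ___update_load_sum() part: accumulate 100 fully-runnable periods. */
        for (int i = 0; i < 100; i++)
                running_sum = running_sum * y + 1024.0;

        /* ___update_load_avg() part: scale by weight, divide by the maximum. */
        printf("load_avg ~= %.0f (out of %lu)\n",
               weight * running_sum / LOAD_AVG_MAX, weight);
        return 0;
}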
3029 | static int | 3279 | static int |
3030 | __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) | 3280 | __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) |
3031 | { | 3281 | { |
3032 | return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); | 3282 | if (entity_is_task(se)) |
3283 | se->runnable_weight = se->load.weight; | ||
3284 | |||
3285 | if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { | ||
3286 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | ||
3287 | return 1; | ||
3288 | } | ||
3289 | |||
3290 | return 0; | ||
3033 | } | 3291 | } |
3034 | 3292 | ||
3035 | static int | 3293 | static int |
3036 | __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) | 3294 | __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) |
3037 | { | 3295 | { |
3038 | return ___update_load_avg(now, cpu, &se->avg, | 3296 | if (entity_is_task(se)) |
3039 | se->on_rq * scale_load_down(se->load.weight), | 3297 | se->runnable_weight = se->load.weight; |
3040 | cfs_rq->curr == se, NULL); | 3298 | |
3299 | if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, | ||
3300 | cfs_rq->curr == se)) { | ||
3301 | |||
3302 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | ||
3303 | return 1; | ||
3304 | } | ||
3305 | |||
3306 | return 0; | ||
3041 | } | 3307 | } |
3042 | 3308 | ||
3043 | static int | 3309 | static int |
3044 | __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | 3310 | __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) |
3045 | { | 3311 | { |
3046 | return ___update_load_avg(now, cpu, &cfs_rq->avg, | 3312 | if (___update_load_sum(now, cpu, &cfs_rq->avg, |
3047 | scale_load_down(cfs_rq->load.weight), | 3313 | scale_load_down(cfs_rq->load.weight), |
3048 | cfs_rq->curr != NULL, cfs_rq); | 3314 | scale_load_down(cfs_rq->runnable_weight), |
3049 | } | 3315 | cfs_rq->curr != NULL)) { |
3050 | 3316 | ||
3051 | /* | 3317 | ___update_load_avg(&cfs_rq->avg, 1, 1); |
3052 | * Signed add and clamp on underflow. | 3318 | return 1; |
3053 | * | 3319 | } |
3054 | * Explicitly do a load-store to ensure the intermediate value never hits | 3320 | |
3055 | * memory. This allows lockless observations without ever seeing the negative | 3321 | return 0; |
3056 | * values. | 3322 | } |
3057 | */ | ||
3058 | #define add_positive(_ptr, _val) do { \ | ||
3059 | typeof(_ptr) ptr = (_ptr); \ | ||
3060 | typeof(_val) val = (_val); \ | ||
3061 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
3062 | \ | ||
3063 | res = var + val; \ | ||
3064 | \ | ||
3065 | if (val < 0 && res > var) \ | ||
3066 | res = 0; \ | ||
3067 | \ | ||
3068 | WRITE_ONCE(*ptr, res); \ | ||
3069 | } while (0) | ||
3070 | 3323 | ||
3071 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3324 | #ifdef CONFIG_FAIR_GROUP_SCHED |
3072 | /** | 3325 | /** |
@@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se, | |||
3149 | se->avg.last_update_time = n_last_update_time; | 3402 | se->avg.last_update_time = n_last_update_time; |
3150 | } | 3403 | } |
3151 | 3404 | ||
3152 | /* Take into account change of utilization of a child task group */ | 3405 | |
3406 | /* | ||
3407 | * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to | ||
3408 | * propagate its contribution. The key to this propagation is the invariant | ||
3409 | * that for each group: | ||
3410 | * | ||
3411 | * ge->avg == grq->avg (1) | ||
3412 | * | ||
3413 | * _IFF_ we look at the pure running and runnable sums. Because they | ||
3414 | * represent the very same entity, just at different points in the hierarchy. | ||
3415 | * | ||
3416 | * | ||
3417 | * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and | ||
3418 | * simply copies the running sum over. | ||
3419 | * | ||
3420 | * However, update_tg_cfs_runnable() is more complex. So we have: | ||
3421 | * | ||
3422 | * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) | ||
3423 | * | ||
3424 | * And since, like util, the runnable part should be directly transferable, | ||
3425 | * the following would _appear_ to be the straight forward approach: | ||
3426 | * | ||
3427 | * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) | ||
3428 | * | ||
3429 | * And per (1) we have: | ||
3430 | * | ||
3431 | * ge->avg.runnable_avg == grq->avg.runnable_avg | ||
3432 | * | ||
3433 | * Which gives: | ||
3434 | * | ||
3435 | * ge->load.weight * grq->avg.load_avg | ||
3436 | * ge->avg.load_avg = ----------------------------------- (4) | ||
3437 | * grq->load.weight | ||
3438 | * | ||
3439 | * Except that is wrong! | ||
3440 | * | ||
3441 | * Because while for entities historical weight is not important and we | ||
3442 | * really only care about our future and therefore can consider a pure | ||
3443 | * runnable sum, runqueues can NOT do this. | ||
3444 | * | ||
3445 | * We specifically want runqueues to have a load_avg that includes | ||
3446 | * historical weights. Those represent the blocked load, the load we expect | ||
3447 | * to (shortly) return to us. This only works by keeping the weights as | ||
3448 | * integral part of the sum. We therefore cannot decompose as per (3). | ||
3449 | * | ||
3450 | * OK, so what then? | ||
3451 | * | ||
3452 | * | ||
3453 | * Another way to look at things is: | ||
3454 | * | ||
3455 | * grq->avg.load_avg = \Sum se->avg.load_avg | ||
3456 | * | ||
3457 | * Therefore, per (2): | ||
3458 | * | ||
3459 | * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg | ||
3460 | * | ||
3461 | * And the very thing we're propagating is a change in that sum (someone | ||
3462 | * joined/left). So we can easily know the runnable change, which would be, per | ||
3463 | * (2) the already tracked se->load_avg divided by the corresponding | ||
3464 | * se->weight. | ||
3465 | * | ||
3466 | * Basically (4) but in differential form: | ||
3467 | * | ||
3468 | * d(runnable_avg) += se->avg.load_avg / se->load.weight | ||
3469 | * (5) | ||
3470 | * ge->avg.load_avg += ge->load.weight * d(runnable_avg) | ||
3471 | */ | ||
3472 | |||
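A floating-point sketch of the differential form (5) with made-up values; the kernel works on fixed-point runnable sums and LOAD_AVG_MAX rather than a real-valued runnable_avg, but the shape of the update is the same:

#include <stdio.h>

int main(void)
{
        double se_weight   = 2048;      /* child se->load.weight            */
        double se_load_avg = 512;       /* child se->avg.load_avg           */
        double ge_weight   = 1024;      /* parent group entity load.weight  */
        double ge_load_avg = 300;       /* parent load_avg before the join  */

        /* (5): recover the child's runnable fraction from its tracked load... */
        double d_runnable_avg = se_load_avg / se_weight;        /* 0.25 */

        /* ...and grow the parent's load_avg by its own weight times that. */
        ge_load_avg += ge_weight * d_runnable_avg;              /* +256 */

        printf("ge->avg.load_avg ~= %.0f\n", ge_load_avg);      /* 556  */
        return 0;
}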
3153 | static inline void | 3473 | static inline void |
3154 | update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3474 | update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) |
3155 | { | 3475 | { |
3156 | struct cfs_rq *gcfs_rq = group_cfs_rq(se); | ||
3157 | long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; | 3476 | long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; |
3158 | 3477 | ||
3159 | /* Nothing to update */ | 3478 | /* Nothing to update */ |
@@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3169 | cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; | 3488 | cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; |
3170 | } | 3489 | } |
3171 | 3490 | ||
3172 | /* Take into account change of load of a child task group */ | ||
3173 | static inline void | 3491 | static inline void |
3174 | update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3492 | update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) |
3175 | { | 3493 | { |
3176 | struct cfs_rq *gcfs_rq = group_cfs_rq(se); | 3494 | long runnable_sum = gcfs_rq->prop_runnable_sum; |
3177 | long delta, load = gcfs_rq->avg.load_avg; | 3495 | long runnable_load_avg, load_avg; |
3496 | s64 runnable_load_sum, load_sum; | ||
3178 | 3497 | ||
3179 | /* | 3498 | if (!runnable_sum) |
3180 | * If the load of group cfs_rq is null, the load of the | 3499 | return; |
3181 | * sched_entity will also be null so we can skip the formula | ||
3182 | */ | ||
3183 | if (load) { | ||
3184 | long tg_load; | ||
3185 | 3500 | ||
3186 | /* Get tg's load and ensure tg_load > 0 */ | 3501 | gcfs_rq->prop_runnable_sum = 0; |
3187 | tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; | ||
3188 | 3502 | ||
3189 | /* Ensure tg_load >= load and updated with current load*/ | 3503 | load_sum = (s64)se_weight(se) * runnable_sum; |
3190 | tg_load -= gcfs_rq->tg_load_avg_contrib; | 3504 | load_avg = div_s64(load_sum, LOAD_AVG_MAX); |
3191 | tg_load += load; | ||
3192 | 3505 | ||
3193 | /* | 3506 | add_positive(&se->avg.load_sum, runnable_sum); |
3194 | * We need to compute a correction term in the case that the | 3507 | add_positive(&se->avg.load_avg, load_avg); |
3195 | * task group is consuming more CPU than a task of equal | ||
3196 | * weight. A task with a weight equals to tg->shares will have | ||
3197 | * a load less or equal to scale_load_down(tg->shares). | ||
3198 | * Similarly, the sched_entities that represent the task group | ||
3199 | * at parent level, can't have a load higher than | ||
3200 | * scale_load_down(tg->shares). And the Sum of sched_entities' | ||
3201 | * load must be <= scale_load_down(tg->shares). | ||
3202 | */ | ||
3203 | if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { | ||
3204 | /* scale gcfs_rq's load into tg's shares*/ | ||
3205 | load *= scale_load_down(gcfs_rq->tg->shares); | ||
3206 | load /= tg_load; | ||
3207 | } | ||
3208 | } | ||
3209 | 3508 | ||
3210 | delta = load - se->avg.load_avg; | 3509 | add_positive(&cfs_rq->avg.load_avg, load_avg); |
3510 | add_positive(&cfs_rq->avg.load_sum, load_sum); | ||
3211 | 3511 | ||
3212 | /* Nothing to update */ | 3512 | runnable_load_sum = (s64)se_runnable(se) * runnable_sum; |
3213 | if (!delta) | 3513 | runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); |
3214 | return; | ||
3215 | |||
3216 | /* Set new sched_entity's load */ | ||
3217 | se->avg.load_avg = load; | ||
3218 | se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; | ||
3219 | 3514 | ||
3220 | /* Update parent cfs_rq load */ | 3515 | add_positive(&se->avg.runnable_load_sum, runnable_sum); |
3221 | add_positive(&cfs_rq->avg.load_avg, delta); | 3516 | add_positive(&se->avg.runnable_load_avg, runnable_load_avg); |
3222 | cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; | ||
3223 | 3517 | ||
3224 | /* | ||
3225 | * If the sched_entity is already enqueued, we also have to update the | ||
3226 | * runnable load avg. | ||
3227 | */ | ||
3228 | if (se->on_rq) { | 3518 | if (se->on_rq) { |
3229 | /* Update parent cfs_rq runnable_load_avg */ | 3519 | add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); |
3230 | add_positive(&cfs_rq->runnable_load_avg, delta); | 3520 | add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); |
3231 | cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; | ||
3232 | } | 3521 | } |
3233 | } | 3522 | } |
3234 | 3523 | ||
3235 | static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) | 3524 | static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) |
3236 | { | ||
3237 | cfs_rq->propagate_avg = 1; | ||
3238 | } | ||
3239 | |||
3240 | static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) | ||
3241 | { | 3525 | { |
3242 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | 3526 | cfs_rq->propagate = 1; |
3243 | 3527 | cfs_rq->prop_runnable_sum += runnable_sum; | |
3244 | if (!cfs_rq->propagate_avg) | ||
3245 | return 0; | ||
3246 | |||
3247 | cfs_rq->propagate_avg = 0; | ||
3248 | return 1; | ||
3249 | } | 3528 | } |
3250 | 3529 | ||
3251 | /* Update task and its cfs_rq load average */ | 3530 | /* Update task and its cfs_rq load average */ |
3252 | static inline int propagate_entity_load_avg(struct sched_entity *se) | 3531 | static inline int propagate_entity_load_avg(struct sched_entity *se) |
3253 | { | 3532 | { |
3254 | struct cfs_rq *cfs_rq; | 3533 | struct cfs_rq *cfs_rq, *gcfs_rq; |
3255 | 3534 | ||
3256 | if (entity_is_task(se)) | 3535 | if (entity_is_task(se)) |
3257 | return 0; | 3536 | return 0; |
3258 | 3537 | ||
3259 | if (!test_and_clear_tg_cfs_propagate(se)) | 3538 | gcfs_rq = group_cfs_rq(se); |
3539 | if (!gcfs_rq->propagate) | ||
3260 | return 0; | 3540 | return 0; |
3261 | 3541 | ||
3542 | gcfs_rq->propagate = 0; | ||
3543 | |||
3262 | cfs_rq = cfs_rq_of(se); | 3544 | cfs_rq = cfs_rq_of(se); |
3263 | 3545 | ||
3264 | set_tg_cfs_propagate(cfs_rq); | 3546 | add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum); |
3265 | 3547 | ||
3266 | update_tg_cfs_util(cfs_rq, se); | 3548 | update_tg_cfs_util(cfs_rq, se, gcfs_rq); |
3267 | update_tg_cfs_load(cfs_rq, se); | 3549 | update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); |
3268 | 3550 | ||
3269 | return 1; | 3551 | return 1; |
3270 | } | 3552 | } |
@@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) | |||
3288 | * If there is a pending propagation, we have to update the load and | 3570 | * If there is a pending propagation, we have to update the load and |
3289 | * the utilization of the sched_entity: | 3571 | * the utilization of the sched_entity: |
3290 | */ | 3572 | */ |
3291 | if (gcfs_rq->propagate_avg) | 3573 | if (gcfs_rq->propagate) |
3292 | return false; | 3574 | return false; |
3293 | 3575 | ||
3294 | /* | 3576 | /* |
@@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) | |||
3308 | return 0; | 3590 | return 0; |
3309 | } | 3591 | } |
3310 | 3592 | ||
3311 | static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} | 3593 | static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} |
3312 | 3594 | ||
3313 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 3595 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
3314 | 3596 | ||
3315 | /* | ||
3316 | * Unsigned subtract and clamp on underflow. | ||
3317 | * | ||
3318 | * Explicitly do a load-store to ensure the intermediate value never hits | ||
3319 | * memory. This allows lockless observations without ever seeing the negative | ||
3320 | * values. | ||
3321 | */ | ||
3322 | #define sub_positive(_ptr, _val) do { \ | ||
3323 | typeof(_ptr) ptr = (_ptr); \ | ||
3324 | typeof(*ptr) val = (_val); \ | ||
3325 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
3326 | res = var - val; \ | ||
3327 | if (res > var) \ | ||
3328 | res = 0; \ | ||
3329 | WRITE_ONCE(*ptr, res); \ | ||
3330 | } while (0) | ||
3331 | |||
3332 | /** | 3597 | /** |
3333 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages | 3598 | * update_cfs_rq_load_avg - update the cfs_rq's load/util averages |
3334 | * @now: current time, as per cfs_rq_clock_task() | 3599 | * @now: current time, as per cfs_rq_clock_task() |
@@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} | |||
3348 | static inline int | 3613 | static inline int |
3349 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | 3614 | update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) |
3350 | { | 3615 | { |
3616 | unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0; | ||
3351 | struct sched_avg *sa = &cfs_rq->avg; | 3617 | struct sched_avg *sa = &cfs_rq->avg; |
3352 | int decayed, removed_load = 0, removed_util = 0; | 3618 | int decayed = 0; |
3353 | 3619 | ||
3354 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { | 3620 | if (cfs_rq->removed.nr) { |
3355 | s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); | 3621 | unsigned long r; |
3622 | u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; | ||
3623 | |||
3624 | raw_spin_lock(&cfs_rq->removed.lock); | ||
3625 | swap(cfs_rq->removed.util_avg, removed_util); | ||
3626 | swap(cfs_rq->removed.load_avg, removed_load); | ||
3627 | swap(cfs_rq->removed.runnable_sum, removed_runnable_sum); | ||
3628 | cfs_rq->removed.nr = 0; | ||
3629 | raw_spin_unlock(&cfs_rq->removed.lock); | ||
3630 | |||
3631 | r = removed_load; | ||
3356 | sub_positive(&sa->load_avg, r); | 3632 | sub_positive(&sa->load_avg, r); |
3357 | sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); | 3633 | sub_positive(&sa->load_sum, r * divider); |
3358 | removed_load = 1; | ||
3359 | set_tg_cfs_propagate(cfs_rq); | ||
3360 | } | ||
3361 | 3634 | ||
3362 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { | 3635 | r = removed_util; |
3363 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); | ||
3364 | sub_positive(&sa->util_avg, r); | 3636 | sub_positive(&sa->util_avg, r); |
3365 | sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); | 3637 | sub_positive(&sa->util_sum, r * divider); |
3366 | removed_util = 1; | 3638 | |
3367 | set_tg_cfs_propagate(cfs_rq); | 3639 | add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum); |
3640 | |||
3641 | decayed = 1; | ||
3368 | } | 3642 | } |
3369 | 3643 | ||
3370 | decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); | 3644 | decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); |
3371 | 3645 | ||
3372 | #ifndef CONFIG_64BIT | 3646 | #ifndef CONFIG_64BIT |
3373 | smp_wmb(); | 3647 | smp_wmb(); |
3374 | cfs_rq->load_last_update_time_copy = sa->last_update_time; | 3648 | cfs_rq->load_last_update_time_copy = sa->last_update_time; |
3375 | #endif | 3649 | #endif |
3376 | 3650 | ||
3377 | if (decayed || removed_util) | 3651 | if (decayed) |
3378 | cfs_rq_util_change(cfs_rq); | 3652 | cfs_rq_util_change(cfs_rq); |
3379 | 3653 | ||
3380 | return decayed || removed_load; | 3654 | return decayed; |
3381 | } | ||
3382 | |||
3383 | /* | ||
3384 | * Optional action to be done while updating the load average | ||
3385 | */ | ||
3386 | #define UPDATE_TG 0x1 | ||
3387 | #define SKIP_AGE_LOAD 0x2 | ||
3388 | |||
3389 | /* Update task and its cfs_rq load average */ | ||
3390 | static inline void update_load_avg(struct sched_entity *se, int flags) | ||
3391 | { | ||
3392 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
3393 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
3394 | struct rq *rq = rq_of(cfs_rq); | ||
3395 | int cpu = cpu_of(rq); | ||
3396 | int decayed; | ||
3397 | |||
3398 | /* | ||
3399 | * Track task load average for carrying it to new CPU after migrated, and | ||
3400 | * track group sched_entity load average for task_h_load calc in migration | ||
3401 | */ | ||
3402 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) | ||
3403 | __update_load_avg_se(now, cpu, cfs_rq, se); | ||
3404 | |||
3405 | decayed = update_cfs_rq_load_avg(now, cfs_rq); | ||
3406 | decayed |= propagate_entity_load_avg(se); | ||
3407 | |||
3408 | if (decayed && (flags & UPDATE_TG)) | ||
3409 | update_tg_load_avg(cfs_rq, 0); | ||
3410 | } | 3655 | } |
3411 | 3656 | ||
3412 | /** | 3657 | /** |
@@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags) | |||
3419 | */ | 3664 | */ |
3420 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3665 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3421 | { | 3666 | { |
3667 | u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; | ||
3668 | |||
3669 | /* | ||
3670 | * When we attach the @se to the @cfs_rq, we must align the decay | ||
3671 | * window because without that, really weird and wonderful things can | ||
3672 | * happen. | ||
3673 | * | ||
3674 | * XXX illustrate | ||
3675 | */ | ||
3422 | se->avg.last_update_time = cfs_rq->avg.last_update_time; | 3676 | se->avg.last_update_time = cfs_rq->avg.last_update_time; |
3423 | cfs_rq->avg.load_avg += se->avg.load_avg; | 3677 | se->avg.period_contrib = cfs_rq->avg.period_contrib; |
3424 | cfs_rq->avg.load_sum += se->avg.load_sum; | 3678 | |
3679 | /* | ||
3680 | * Hell(o) Nasty stuff.. we need to recompute _sum based on the new | ||
3681 | * period_contrib. This isn't strictly correct, but since we're | ||
3682 | * entirely outside of the PELT hierarchy, nobody cares if we truncate | ||
3683 | * _sum a little. | ||
3684 | */ | ||
3685 | se->avg.util_sum = se->avg.util_avg * divider; | ||
3686 | |||
3687 | se->avg.load_sum = divider; | ||
3688 | if (se_weight(se)) { | ||
3689 | se->avg.load_sum = | ||
3690 | div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); | ||
3691 | } | ||
3692 | |||
3693 | se->avg.runnable_load_sum = se->avg.load_sum; | ||
3694 | |||
3695 | enqueue_load_avg(cfs_rq, se); | ||
3425 | cfs_rq->avg.util_avg += se->avg.util_avg; | 3696 | cfs_rq->avg.util_avg += se->avg.util_avg; |
3426 | cfs_rq->avg.util_sum += se->avg.util_sum; | 3697 | cfs_rq->avg.util_sum += se->avg.util_sum; |
3427 | set_tg_cfs_propagate(cfs_rq); | 3698 | |
3699 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); | ||
3428 | 3700 | ||
3429 | cfs_rq_util_change(cfs_rq); | 3701 | cfs_rq_util_change(cfs_rq); |
3430 | } | 3702 | } |
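A numeric sketch of the window realignment above, with made-up values: after copying last_update_time and period_contrib from the destination cfs_rq, the *_sum values are rebuilt from the (preserved) *_avg values and the destination's divider, so the entity and the runqueue decay in lockstep from then on:

#include <stdio.h>

#define LOAD_AVG_MAX 47742

int main(void)
{
        /* Destination cfs_rq is 600us into its current 1024us period. */
        unsigned long long period_contrib = 600;
        unsigned long long divider = LOAD_AVG_MAX - 1024 + period_contrib;

        /* The entity's *_avg values survive the attach unchanged. */
        unsigned long long se_util_avg = 200;
        unsigned long long se_load_avg = 300;
        unsigned long long se_weight   = 1024;  /* scale_load_down(se->load.weight) */

        /* util_sum is simply re-derived from util_avg and the new divider, */
        unsigned long long se_util_sum = se_util_avg * divider;
        /* and load_sum stores the unweighted sum, so the weight is divided out:
         * load_avg = weight * load_sum / divider  =>  */
        unsigned long long se_load_sum = se_load_avg * divider / se_weight;
        /* runnable_load_sum is then just copied from load_sum, as above. */

        printf("divider=%llu util_sum=%llu load_sum=%llu\n",
               divider, se_util_sum, se_load_sum);
        return 0;
}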
@@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3439 | */ | 3711 | */ |
3440 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3712 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3441 | { | 3713 | { |
3442 | 3714 | dequeue_load_avg(cfs_rq, se); | |
3443 | sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); | ||
3444 | sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); | ||
3445 | sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); | 3715 | sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); |
3446 | sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); | 3716 | sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); |
3447 | set_tg_cfs_propagate(cfs_rq); | 3717 | |
3718 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); | ||
3448 | 3719 | ||
3449 | cfs_rq_util_change(cfs_rq); | 3720 | cfs_rq_util_change(cfs_rq); |
3450 | } | 3721 | } |
3451 | 3722 | ||
3452 | /* Add the load generated by se into cfs_rq's load average */ | 3723 | /* |
3453 | static inline void | 3724 | * Optional action to be done while updating the load average |
3454 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3725 | */ |
3726 | #define UPDATE_TG 0x1 | ||
3727 | #define SKIP_AGE_LOAD 0x2 | ||
3728 | #define DO_ATTACH 0x4 | ||
3729 | |||
3730 | /* Update task and its cfs_rq load average */ | ||
3731 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | ||
3455 | { | 3732 | { |
3456 | struct sched_avg *sa = &se->avg; | 3733 | u64 now = cfs_rq_clock_task(cfs_rq); |
3734 | struct rq *rq = rq_of(cfs_rq); | ||
3735 | int cpu = cpu_of(rq); | ||
3736 | int decayed; | ||
3737 | |||
3738 | /* | ||
3739 | * Track task load average for carrying it to new CPU after migration, and | ||
3740 | * track group sched_entity load average for task_h_load calc in migration | ||
3741 | */ | ||
3742 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) | ||
3743 | __update_load_avg_se(now, cpu, cfs_rq, se); | ||
3457 | 3744 | ||
3458 | cfs_rq->runnable_load_avg += sa->load_avg; | 3745 | decayed = update_cfs_rq_load_avg(now, cfs_rq); |
3459 | cfs_rq->runnable_load_sum += sa->load_sum; | 3746 | decayed |= propagate_entity_load_avg(se); |
3747 | |||
3748 | if (!se->avg.last_update_time && (flags & DO_ATTACH)) { | ||
3460 | 3749 | ||
3461 | if (!sa->last_update_time) { | ||
3462 | attach_entity_load_avg(cfs_rq, se); | 3750 | attach_entity_load_avg(cfs_rq, se); |
3463 | update_tg_load_avg(cfs_rq, 0); | 3751 | update_tg_load_avg(cfs_rq, 0); |
3464 | } | ||
3465 | } | ||
3466 | 3752 | ||
3467 | /* Remove the runnable load generated by se from cfs_rq's runnable load average */ | 3753 | } else if (decayed && (flags & UPDATE_TG)) |
3468 | static inline void | 3754 | update_tg_load_avg(cfs_rq, 0); |
3469 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3470 | { | ||
3471 | cfs_rq->runnable_load_avg = | ||
3472 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); | ||
3473 | cfs_rq->runnable_load_sum = | ||
3474 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); | ||
3475 | } | 3755 | } |
3476 | 3756 | ||
3477 | #ifndef CONFIG_64BIT | 3757 | #ifndef CONFIG_64BIT |
@@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se) | |||
3515 | void remove_entity_load_avg(struct sched_entity *se) | 3795 | void remove_entity_load_avg(struct sched_entity *se) |
3516 | { | 3796 | { |
3517 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3797 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
3798 | unsigned long flags; | ||
3518 | 3799 | ||
3519 | /* | 3800 | /* |
3520 | * tasks cannot exit without having gone through wake_up_new_task() -> | 3801 | * tasks cannot exit without having gone through wake_up_new_task() -> |
@@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se) | |||
3527 | */ | 3808 | */ |
3528 | 3809 | ||
3529 | sync_entity_load_avg(se); | 3810 | sync_entity_load_avg(se); |
3530 | atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); | 3811 | |
3531 | atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); | 3812 | raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); |
3813 | ++cfs_rq->removed.nr; | ||
3814 | cfs_rq->removed.util_avg += se->avg.util_avg; | ||
3815 | cfs_rq->removed.load_avg += se->avg.load_avg; | ||
3816 | cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */ | ||
3817 | raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); | ||
3532 | } | 3818 | } |
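The removed_* atomics are replaced by a small 'removed' structure that producers fill under a lock and the consumer drains in one critical section. A userspace sketch of that pattern, with a pthread mutex standing in for the raw spinlock and simplified fields (not the kernel structures):

#include <pthread.h>
#include <stdio.h>

/* Simplified stand-in for the cfs_rq->removed structure. */
static struct {
        pthread_mutex_t lock;
        int             nr;
        unsigned long   load_avg;
        unsigned long   util_avg;
} removed = { .lock = PTHREAD_MUTEX_INITIALIZER };

/* remove_entity_load_avg() side: queue a departing entity's contribution. */
static void queue_removal(unsigned long load_avg, unsigned long util_avg)
{
        pthread_mutex_lock(&removed.lock);
        removed.nr++;
        removed.load_avg += load_avg;
        removed.util_avg += util_avg;
        pthread_mutex_unlock(&removed.lock);
}

/* update_cfs_rq_load_avg() side: drain everything queued so far in one go. */
static void drain_removals(unsigned long *load_avg, unsigned long *util_avg)
{
        *load_avg = *util_avg = 0;

        pthread_mutex_lock(&removed.lock);
        if (removed.nr) {
                *load_avg = removed.load_avg;   removed.load_avg = 0;
                *util_avg = removed.util_avg;   removed.util_avg = 0;
                removed.nr = 0;
        }
        pthread_mutex_unlock(&removed.lock);
        /* The caller then applies sub_positive() with these totals. */
}

int main(void)
{
        unsigned long l, u;

        queue_removal(300, 120);
        queue_removal(200, 80);
        drain_removals(&l, &u);
        printf("drained load=%lu util=%lu\n", l, u);    /* 500 200 */
        return 0;
}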
3533 | 3819 | ||
3534 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) | 3820 | static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) |
3535 | { | 3821 | { |
3536 | return cfs_rq->runnable_load_avg; | 3822 | return cfs_rq->avg.runnable_load_avg; |
3537 | } | 3823 | } |
3538 | 3824 | ||
3539 | static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | 3825 | static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) |
@@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
3553 | 3839 | ||
3554 | #define UPDATE_TG 0x0 | 3840 | #define UPDATE_TG 0x0 |
3555 | #define SKIP_AGE_LOAD 0x0 | 3841 | #define SKIP_AGE_LOAD 0x0 |
3842 | #define DO_ATTACH 0x0 | ||
3556 | 3843 | ||
3557 | static inline void update_load_avg(struct sched_entity *se, int not_used1) | 3844 | static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) |
3558 | { | 3845 | { |
3559 | cfs_rq_util_change(cfs_rq_of(se)); | 3846 | cfs_rq_util_change(cfs_rq); |
3560 | } | 3847 | } |
3561 | 3848 | ||
3562 | static inline void | ||
3563 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | ||
3564 | static inline void | ||
3565 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | ||
3566 | static inline void remove_entity_load_avg(struct sched_entity *se) {} | 3849 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
3567 | 3850 | ||
3568 | static inline void | 3851 | static inline void |
@@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3707 | * its group cfs_rq | 3990 | * its group cfs_rq |
3708 | * - Add its new weight to cfs_rq->load.weight | 3991 | * - Add its new weight to cfs_rq->load.weight |
3709 | */ | 3992 | */ |
3710 | update_load_avg(se, UPDATE_TG); | 3993 | update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); |
3711 | enqueue_entity_load_avg(cfs_rq, se); | 3994 | update_cfs_group(se); |
3712 | update_cfs_shares(se); | 3995 | enqueue_runnable_load_avg(cfs_rq, se); |
3713 | account_entity_enqueue(cfs_rq, se); | 3996 | account_entity_enqueue(cfs_rq, se); |
3714 | 3997 | ||
3715 | if (flags & ENQUEUE_WAKEUP) | 3998 | if (flags & ENQUEUE_WAKEUP) |
@@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3791 | * - For group entity, update its weight to reflect the new share | 4074 | * - For group entity, update its weight to reflect the new share |
3792 | * of its group cfs_rq. | 4075 | * of its group cfs_rq. |
3793 | */ | 4076 | */ |
3794 | update_load_avg(se, UPDATE_TG); | 4077 | update_load_avg(cfs_rq, se, UPDATE_TG); |
3795 | dequeue_entity_load_avg(cfs_rq, se); | 4078 | dequeue_runnable_load_avg(cfs_rq, se); |
3796 | 4079 | ||
3797 | update_stats_dequeue(cfs_rq, se, flags); | 4080 | update_stats_dequeue(cfs_rq, se, flags); |
3798 | 4081 | ||
@@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3815 | /* return excess runtime on last dequeue */ | 4098 | /* return excess runtime on last dequeue */ |
3816 | return_cfs_rq_runtime(cfs_rq); | 4099 | return_cfs_rq_runtime(cfs_rq); |
3817 | 4100 | ||
3818 | update_cfs_shares(se); | 4101 | update_cfs_group(se); |
3819 | 4102 | ||
3820 | /* | 4103 | /* |
3821 | * Now advance min_vruntime if @se was the entity holding it back, | 4104 | * Now advance min_vruntime if @se was the entity holding it back, |
@@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3879 | */ | 4162 | */ |
3880 | update_stats_wait_end(cfs_rq, se); | 4163 | update_stats_wait_end(cfs_rq, se); |
3881 | __dequeue_entity(cfs_rq, se); | 4164 | __dequeue_entity(cfs_rq, se); |
3882 | update_load_avg(se, UPDATE_TG); | 4165 | update_load_avg(cfs_rq, se, UPDATE_TG); |
3883 | } | 4166 | } |
3884 | 4167 | ||
3885 | update_stats_curr_start(cfs_rq, se); | 4168 | update_stats_curr_start(cfs_rq, se); |
@@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
3981 | /* Put 'current' back into the tree. */ | 4264 | /* Put 'current' back into the tree. */ |
3982 | __enqueue_entity(cfs_rq, prev); | 4265 | __enqueue_entity(cfs_rq, prev); |
3983 | /* in !on_rq case, update occurred at dequeue */ | 4266 | /* in !on_rq case, update occurred at dequeue */ |
3984 | update_load_avg(prev, 0); | 4267 | update_load_avg(cfs_rq, prev, 0); |
3985 | } | 4268 | } |
3986 | cfs_rq->curr = NULL; | 4269 | cfs_rq->curr = NULL; |
3987 | } | 4270 | } |
@@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3997 | /* | 4280 | /* |
3998 | * Ensure that runnable average is periodically updated. | 4281 | * Ensure that runnable average is periodically updated. |
3999 | */ | 4282 | */ |
4000 | update_load_avg(curr, UPDATE_TG); | 4283 | update_load_avg(cfs_rq, curr, UPDATE_TG); |
4001 | update_cfs_shares(curr); | 4284 | update_cfs_group(curr); |
4002 | 4285 | ||
4003 | #ifdef CONFIG_SCHED_HRTICK | 4286 | #ifdef CONFIG_SCHED_HRTICK |
4004 | /* | 4287 | /* |
@@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4915 | if (cfs_rq_throttled(cfs_rq)) | 5198 | if (cfs_rq_throttled(cfs_rq)) |
4916 | break; | 5199 | break; |
4917 | 5200 | ||
4918 | update_load_avg(se, UPDATE_TG); | 5201 | update_load_avg(cfs_rq, se, UPDATE_TG); |
4919 | update_cfs_shares(se); | 5202 | update_cfs_group(se); |
4920 | } | 5203 | } |
4921 | 5204 | ||
4922 | if (!se) | 5205 | if (!se) |
@@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4974 | if (cfs_rq_throttled(cfs_rq)) | 5257 | if (cfs_rq_throttled(cfs_rq)) |
4975 | break; | 5258 | break; |
4976 | 5259 | ||
4977 | update_load_avg(se, UPDATE_TG); | 5260 | update_load_avg(cfs_rq, se, UPDATE_TG); |
4978 | update_cfs_shares(se); | 5261 | update_cfs_group(se); |
4979 | } | 5262 | } |
4980 | 5263 | ||
4981 | if (!se) | 5264 | if (!se) |
@@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) | |||
5449 | /* | 5732 | /* |
5450 | * find_idlest_group finds and returns the least busy CPU group within the | 5733 | * find_idlest_group finds and returns the least busy CPU group within the |
5451 | * domain. | 5734 | * domain. |
5735 | * | ||
5736 | * Assumes p is allowed on at least one CPU in sd. | ||
5452 | */ | 5737 | */ |
5453 | static struct sched_group * | 5738 | static struct sched_group * |
5454 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 5739 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
@@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5456 | { | 5741 | { |
5457 | struct sched_group *idlest = NULL, *group = sd->groups; | 5742 | struct sched_group *idlest = NULL, *group = sd->groups; |
5458 | struct sched_group *most_spare_sg = NULL; | 5743 | struct sched_group *most_spare_sg = NULL; |
5459 | unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; | 5744 | unsigned long min_runnable_load = ULONG_MAX; |
5460 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; | 5745 | unsigned long this_runnable_load = ULONG_MAX; |
5746 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; | ||
5461 | unsigned long most_spare = 0, this_spare = 0; | 5747 | unsigned long most_spare = 0, this_spare = 0; |
5462 | int load_idx = sd->forkexec_idx; | 5748 | int load_idx = sd->forkexec_idx; |
5463 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; | 5749 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; |
@@ -5578,10 +5864,10 @@ skip_spare: | |||
5578 | } | 5864 | } |
5579 | 5865 | ||
5580 | /* | 5866 | /* |
5581 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 5867 | * find_idlest_group_cpu - find the idlest cpu among the cpus in group. |
5582 | */ | 5868 | */ |
5583 | static int | 5869 | static int |
5584 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 5870 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
5585 | { | 5871 | { |
5586 | unsigned long load, min_load = ULONG_MAX; | 5872 | unsigned long load, min_load = ULONG_MAX; |
5587 | unsigned int min_exit_latency = UINT_MAX; | 5873 | unsigned int min_exit_latency = UINT_MAX; |
@@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
5630 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; | 5916 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; |
5631 | } | 5917 | } |
5632 | 5918 | ||
5919 | static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, | ||
5920 | int cpu, int prev_cpu, int sd_flag) | ||
5921 | { | ||
5922 | int new_cpu = cpu; | ||
5923 | |||
5924 | if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) | ||
5925 | return prev_cpu; | ||
5926 | |||
5927 | while (sd) { | ||
5928 | struct sched_group *group; | ||
5929 | struct sched_domain *tmp; | ||
5930 | int weight; | ||
5931 | |||
5932 | if (!(sd->flags & sd_flag)) { | ||
5933 | sd = sd->child; | ||
5934 | continue; | ||
5935 | } | ||
5936 | |||
5937 | group = find_idlest_group(sd, p, cpu, sd_flag); | ||
5938 | if (!group) { | ||
5939 | sd = sd->child; | ||
5940 | continue; | ||
5941 | } | ||
5942 | |||
5943 | new_cpu = find_idlest_group_cpu(group, p, cpu); | ||
5944 | if (new_cpu == cpu) { | ||
5945 | /* Now try balancing at a lower domain level of cpu */ | ||
5946 | sd = sd->child; | ||
5947 | continue; | ||
5948 | } | ||
5949 | |||
5950 | /* Now try balancing at a lower domain level of new_cpu */ | ||
5951 | cpu = new_cpu; | ||
5952 | weight = sd->span_weight; | ||
5953 | sd = NULL; | ||
5954 | for_each_domain(cpu, tmp) { | ||
5955 | if (weight <= tmp->span_weight) | ||
5956 | break; | ||
5957 | if (tmp->flags & sd_flag) | ||
5958 | sd = tmp; | ||
5959 | } | ||
5960 | /* while loop will break here if sd == NULL */ | ||
5961 | } | ||
5962 | |||
5963 | return new_cpu; | ||
5964 | } | ||
5965 | |||
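The refactored find_idlest_cpu() keeps the old control flow: starting from the highest domain with the right flag, pick the idlest group, pick the idlest CPU in it, then descend and repeat at the lower level. A deliberately tiny illustration of that top-down narrowing, with made-up per-CPU loads and a binary split standing in for real sched domains and groups (it mirrors only the control flow, not the load metrics):

#include <stdio.h>

static const int load[8] = { 9, 7, 5, 8, 6, 1, 4, 3 }; /* made-up per-CPU load */

/* One "domain" level: split [first, first + nr) into two groups, pick the
 * less loaded group, then recurse into it as the child level. */
static int pick(int first, int nr)
{
        int half = nr / 2, l = 0, r = 0;

        if (nr == 1)
                return first;

        for (int i = 0; i < half; i++) {
                l += load[first + i];
                r += load[first + half + i];
        }
        return l <= r ? pick(first, half) : pick(first + half, half);
}

int main(void)
{
        printf("idlest cpu: %d\n", pick(0, 8));         /* CPU 5 here */
        return 0;
}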
5633 | #ifdef CONFIG_SCHED_SMT | 5966 | #ifdef CONFIG_SCHED_SMT |
5634 | 5967 | ||
5635 | static inline void set_idle_cores(int cpu, int val) | 5968 | static inline void set_idle_cores(int cpu, int val) |
@@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
5982 | new_cpu = cpu; | 6315 | new_cpu = cpu; |
5983 | } | 6316 | } |
5984 | 6317 | ||
6318 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { | ||
6319 | /* | ||
6320 | * We're going to need the task's util for capacity_spare_wake | ||
6321 | * in find_idlest_group. Sync it up to prev_cpu's | ||
6322 | * last_update_time. | ||
6323 | */ | ||
6324 | sync_entity_load_avg(&p->se); | ||
6325 | } | ||
6326 | |||
5985 | if (!sd) { | 6327 | if (!sd) { |
5986 | pick_cpu: | 6328 | pick_cpu: |
5987 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 6329 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
5988 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); | 6330 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
5989 | 6331 | ||
5990 | } else while (sd) { | 6332 | } else { |
5991 | struct sched_group *group; | 6333 | new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); |
5992 | int weight; | ||
5993 | |||
5994 | if (!(sd->flags & sd_flag)) { | ||
5995 | sd = sd->child; | ||
5996 | continue; | ||
5997 | } | ||
5998 | |||
5999 | group = find_idlest_group(sd, p, cpu, sd_flag); | ||
6000 | if (!group) { | ||
6001 | sd = sd->child; | ||
6002 | continue; | ||
6003 | } | ||
6004 | |||
6005 | new_cpu = find_idlest_cpu(group, p, cpu); | ||
6006 | if (new_cpu == -1 || new_cpu == cpu) { | ||
6007 | /* Now try balancing at a lower domain level of cpu */ | ||
6008 | sd = sd->child; | ||
6009 | continue; | ||
6010 | } | ||
6011 | |||
6012 | /* Now try balancing at a lower domain level of new_cpu */ | ||
6013 | cpu = new_cpu; | ||
6014 | weight = sd->span_weight; | ||
6015 | sd = NULL; | ||
6016 | for_each_domain(cpu, tmp) { | ||
6017 | if (weight <= tmp->span_weight) | ||
6018 | break; | ||
6019 | if (tmp->flags & sd_flag) | ||
6020 | sd = tmp; | ||
6021 | } | ||
6022 | /* while loop will break here if sd == NULL */ | ||
6023 | } | 6334 | } |
6024 | rcu_read_unlock(); | 6335 | rcu_read_unlock(); |
6025 | 6336 | ||
6026 | return new_cpu; | 6337 | return new_cpu; |
6027 | } | 6338 | } |
6028 | 6339 | ||
6340 | static void detach_entity_cfs_rq(struct sched_entity *se); | ||
6341 | |||
6029 | /* | 6342 | /* |
6030 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 6343 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and |
6031 | * cfs_rq_of(p) references at time of call are still valid and identify the | 6344 | * cfs_rq_of(p) references at time of call are still valid and identify the |
@@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p) | |||
6059 | se->vruntime -= min_vruntime; | 6372 | se->vruntime -= min_vruntime; |
6060 | } | 6373 | } |
6061 | 6374 | ||
6062 | /* | 6375 | if (p->on_rq == TASK_ON_RQ_MIGRATING) { |
6063 | * We are supposed to update the task to "current" time, then its up to date | 6376 | /* |
6064 | * and ready to go to new CPU/cfs_rq. But we have difficulty in getting | 6377 | * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' |
6065 | * what current time is, so simply throw away the out-of-date time. This | 6378 | * rq->lock and can modify state directly. |
6066 | * will result in the wakee task is less decayed, but giving the wakee more | 6379 | */ |
6067 | * load sounds not bad. | 6380 | lockdep_assert_held(&task_rq(p)->lock); |
6068 | */ | 6381 | detach_entity_cfs_rq(&p->se); |
6069 | remove_entity_load_avg(&p->se); | 6382 | |
6383 | } else { | ||
6384 | /* | ||
6385 | * We are supposed to update the task to "current" time, then | ||
6386 | * it's up to date and ready to go to new CPU/cfs_rq. But we | ||
6387 | * have difficulty in getting what current time is, so simply | ||
6388 | * throw away the out-of-date time. This will result in the | ||
6389 | * wakee task being less decayed, but giving the wakee more load | ||
6390 | * does not sound too bad. | ||
6391 | */ | ||
6392 | remove_entity_load_avg(&p->se); | ||
6393 | } | ||
6070 | 6394 | ||
6071 | /* Tell new CPU we are migrated */ | 6395 | /* Tell new CPU we are migrated */ |
6072 | p->se.avg.last_update_time = 0; | 6396 | p->se.avg.last_update_time = 0; |
@@ -6334,10 +6658,7 @@ again: | |||
6334 | set_next_entity(cfs_rq, se); | 6658 | set_next_entity(cfs_rq, se); |
6335 | } | 6659 | } |
6336 | 6660 | ||
6337 | if (hrtick_enabled(rq)) | 6661 | goto done; |
6338 | hrtick_start_fair(rq, p); | ||
6339 | |||
6340 | return p; | ||
6341 | simple: | 6662 | simple: |
6342 | #endif | 6663 | #endif |
6343 | 6664 | ||
@@ -6351,6 +6672,16 @@ simple: | |||
6351 | 6672 | ||
6352 | p = task_of(se); | 6673 | p = task_of(se); |
6353 | 6674 | ||
6675 | done: __maybe_unused | ||
6676 | #ifdef CONFIG_SMP | ||
6677 | /* | ||
6678 | * Move the next running task to the front of | ||
6679 | * the list, so our cfs_tasks list becomes an MRU | ||
6680 | * one. | ||
6681 | */ | ||
6682 | list_move(&p->se.group_node, &rq->cfs_tasks); | ||
6683 | #endif | ||
6684 | |||
6354 | if (hrtick_enabled(rq)) | 6685 | if (hrtick_enabled(rq)) |
6355 | hrtick_start_fair(rq, p); | 6686 | hrtick_start_fair(rq, p); |
6356 | 6687 | ||
@@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) | |||
6786 | */ | 7117 | */ |
6787 | static struct task_struct *detach_one_task(struct lb_env *env) | 7118 | static struct task_struct *detach_one_task(struct lb_env *env) |
6788 | { | 7119 | { |
6789 | struct task_struct *p, *n; | 7120 | struct task_struct *p; |
6790 | 7121 | ||
6791 | lockdep_assert_held(&env->src_rq->lock); | 7122 | lockdep_assert_held(&env->src_rq->lock); |
6792 | 7123 | ||
6793 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 7124 | list_for_each_entry_reverse(p, |
7125 | &env->src_rq->cfs_tasks, se.group_node) { | ||
6794 | if (!can_migrate_task(p, env)) | 7126 | if (!can_migrate_task(p, env)) |
6795 | continue; | 7127 | continue; |
6796 | 7128 | ||
@@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env) | |||
6836 | if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) | 7168 | if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) |
6837 | break; | 7169 | break; |
6838 | 7170 | ||
6839 | p = list_first_entry(tasks, struct task_struct, se.group_node); | 7171 | p = list_last_entry(tasks, struct task_struct, se.group_node); |
6840 | 7172 | ||
6841 | env->loop++; | 7173 | env->loop++; |
6842 | /* We've more or less seen every task there is, call it quits */ | 7174 | /* We've more or less seen every task there is, call it quits */ |
@@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env) | |||
6886 | 7218 | ||
6887 | continue; | 7219 | continue; |
6888 | next: | 7220 | next: |
6889 | list_move_tail(&p->se.group_node, tasks); | 7221 | list_move(&p->se.group_node, tasks); |
6890 | } | 7222 | } |
6891 | 7223 | ||
6892 | /* | 7224 | /* |
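The cfs_tasks hunks above are two halves of one change: pick_next_task_fair() now list_move()s the task it picks to the head of rq->cfs_tasks, keeping the list in MRU order, while detach_one_task()/detach_tasks() scan it from the tail (list_for_each_entry_reverse() / list_last_entry()) so the balancer tries the longest-idle, presumably cache-cold tasks first; tasks it skips are list_move()d to the head so the tail scan does not revisit them in the same pass. A minimal sketch of the same idiom with the generic list API (the demo_ names are made up, not from the patch):

#include <linux/list.h>

struct demo_task {
	struct list_head node;
};

static LIST_HEAD(demo_tasks);		/* stands in for rq->cfs_tasks */

/* "pick" side: the task chosen to run moves to the front (MRU order) */
static void demo_mark_running(struct demo_task *t)
{
	list_move(&t->node, &demo_tasks);
}

/* "balance" side: walk from the tail so cache-cold tasks are tried first */
static struct demo_task *demo_detach_one(void)
{
	struct demo_task *t;

	list_for_each_entry_reverse(t, &demo_tasks, node) {
		/* a can_migrate_task()-style filter would go here */
		list_del_init(&t->node);
		return t;
	}
	return NULL;
}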
@@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | |||
6962 | if (cfs_rq->avg.util_sum) | 7294 | if (cfs_rq->avg.util_sum) |
6963 | return false; | 7295 | return false; |
6964 | 7296 | ||
6965 | if (cfs_rq->runnable_load_sum) | 7297 | if (cfs_rq->avg.runnable_load_sum) |
6966 | return false; | 7298 | return false; |
6967 | 7299 | ||
6968 | return true; | 7300 | return true; |
@@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu) | |||
6994 | /* Propagate pending load changes to the parent, if any: */ | 7326 | /* Propagate pending load changes to the parent, if any: */ |
6995 | se = cfs_rq->tg->se[cpu]; | 7327 | se = cfs_rq->tg->se[cpu]; |
6996 | if (se && !skip_blocked_update(se)) | 7328 | if (se && !skip_blocked_update(se)) |
6997 | update_load_avg(se, 0); | 7329 | update_load_avg(cfs_rq_of(se), se, 0); |
6998 | 7330 | ||
6999 | /* | 7331 | /* |
7000 | * There can be a lot of idle CPU cgroups. Don't let fully | 7332 | * There can be a lot of idle CPU cgroups. Don't let fully |
@@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
7875 | if (busiest->group_type == group_imbalanced) | 8207 | if (busiest->group_type == group_imbalanced) |
7876 | goto force_balance; | 8208 | goto force_balance; |
7877 | 8209 | ||
7878 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 8210 | /* |
7879 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && | 8211 | * When dst_cpu is idle, prevent SMP nice and/or asymmetric group |
8212 | * capacities from resulting in underutilization due to avg_load. | ||
8213 | */ | ||
8214 | if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && | ||
7880 | busiest->group_no_capacity) | 8215 | busiest->group_no_capacity) |
7881 | goto force_balance; | 8216 | goto force_balance; |
7882 | 8217 | ||
@@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu) | |||
8693 | return; | 9028 | return; |
8694 | 9029 | ||
8695 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | 9030 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ |
8696 | if (!is_housekeeping_cpu(cpu)) | 9031 | if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) |
8697 | return; | 9032 | return; |
8698 | 9033 | ||
8699 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 9034 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
@@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) | |||
9158 | if (cfs_rq_throttled(cfs_rq)) | 9493 | if (cfs_rq_throttled(cfs_rq)) |
9159 | break; | 9494 | break; |
9160 | 9495 | ||
9161 | update_load_avg(se, UPDATE_TG); | 9496 | update_load_avg(cfs_rq, se, UPDATE_TG); |
9162 | } | 9497 | } |
9163 | } | 9498 | } |
9164 | #else | 9499 | #else |
@@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) | |||
9170 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 9505 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
9171 | 9506 | ||
9172 | /* Catch up with the cfs_rq and remove our load when we leave */ | 9507 | /* Catch up with the cfs_rq and remove our load when we leave */ |
9173 | update_load_avg(se, 0); | 9508 | update_load_avg(cfs_rq, se, 0); |
9174 | detach_entity_load_avg(cfs_rq, se); | 9509 | detach_entity_load_avg(cfs_rq, se); |
9175 | update_tg_load_avg(cfs_rq, false); | 9510 | update_tg_load_avg(cfs_rq, false); |
9176 | propagate_entity_cfs_rq(se); | 9511 | propagate_entity_cfs_rq(se); |
@@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) | |||
9189 | #endif | 9524 | #endif |
9190 | 9525 | ||
9191 | /* Synchronize entity with its cfs_rq */ | 9526 | /* Synchronize entity with its cfs_rq */ |
9192 | update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); | 9527 | update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); |
9193 | attach_entity_load_avg(cfs_rq, se); | 9528 | attach_entity_load_avg(cfs_rq, se); |
9194 | update_tg_load_avg(cfs_rq, false); | 9529 | update_tg_load_avg(cfs_rq, false); |
9195 | propagate_entity_cfs_rq(se); | 9530 | propagate_entity_cfs_rq(se); |
@@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
9271 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 9606 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
9272 | #endif | 9607 | #endif |
9273 | #ifdef CONFIG_SMP | 9608 | #ifdef CONFIG_SMP |
9274 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9609 | raw_spin_lock_init(&cfs_rq->removed.lock); |
9275 | cfs_rq->propagate_avg = 0; | ||
9276 | #endif | ||
9277 | atomic_long_set(&cfs_rq->removed_load_avg, 0); | ||
9278 | atomic_long_set(&cfs_rq->removed_util_avg, 0); | ||
9279 | #endif | 9610 | #endif |
9280 | } | 9611 | } |
9281 | 9612 | ||
@@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
9473 | rq_lock_irqsave(rq, &rf); | 9804 | rq_lock_irqsave(rq, &rf); |
9474 | update_rq_clock(rq); | 9805 | update_rq_clock(rq); |
9475 | for_each_sched_entity(se) { | 9806 | for_each_sched_entity(se) { |
9476 | update_load_avg(se, UPDATE_TG); | 9807 | update_load_avg(cfs_rq_of(se), se, UPDATE_TG); |
9477 | update_cfs_shares(se); | 9808 | update_cfs_group(se); |
9478 | } | 9809 | } |
9479 | rq_unlock_irqrestore(rq, &rf); | 9810 | rq_unlock_irqrestore(rq, &rf); |
9480 | } | 9811 | } |
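Much of the remaining fair.c churn follows one interface change: update_load_avg() now takes the cfs_rq as an explicit first argument, and update_cfs_shares() has become update_cfs_group(). A one-function sketch of the post-patch call pattern (the demo_ wrapper is hypothetical; the calls inside mirror the sched_group_set_shares() hunk above):

static void demo_update_entity(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/* New signature: the cfs_rq is passed in rather than re-derived. */
	update_load_avg(cfs_rq, se, UPDATE_TG);

	/* Renamed from update_cfs_shares(). */
	update_cfs_group(se);
}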
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 257f4f0b4532..7dae9eb8c042 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -209,6 +209,7 @@ exit_idle: | |||
209 | */ | 209 | */ |
210 | static void do_idle(void) | 210 | static void do_idle(void) |
211 | { | 211 | { |
212 | int cpu = smp_processor_id(); | ||
212 | /* | 213 | /* |
213 | * If the arch has a polling bit, we maintain an invariant: | 214 | * If the arch has a polling bit, we maintain an invariant: |
214 | * | 215 | * |
@@ -219,14 +220,13 @@ static void do_idle(void) | |||
219 | */ | 220 | */ |
220 | 221 | ||
221 | __current_set_polling(); | 222 | __current_set_polling(); |
222 | quiet_vmstat(); | ||
223 | tick_nohz_idle_enter(); | 223 | tick_nohz_idle_enter(); |
224 | 224 | ||
225 | while (!need_resched()) { | 225 | while (!need_resched()) { |
226 | check_pgt_cache(); | 226 | check_pgt_cache(); |
227 | rmb(); | 227 | rmb(); |
228 | 228 | ||
229 | if (cpu_is_offline(smp_processor_id())) { | 229 | if (cpu_is_offline(cpu)) { |
230 | cpuhp_report_idle_dead(); | 230 | cpuhp_report_idle_dead(); |
231 | arch_cpu_idle_dead(); | 231 | arch_cpu_idle_dead(); |
232 | } | 232 | } |
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c new file mode 100644 index 000000000000..b71b436f59f2 --- /dev/null +++ b/kernel/sched/isolation.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * Housekeeping management. Manage the targets for routine code that can run on | ||
3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. | ||
4 | * | ||
5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker | ||
6 | * | ||
7 | */ | ||
8 | |||
9 | #include <linux/sched/isolation.h> | ||
10 | #include <linux/tick.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/static_key.h> | ||
14 | #include <linux/ctype.h> | ||
15 | |||
16 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); | ||
17 | EXPORT_SYMBOL_GPL(housekeeping_overriden); | ||
18 | static cpumask_var_t housekeeping_mask; | ||
19 | static unsigned int housekeeping_flags; | ||
20 | |||
21 | int housekeeping_any_cpu(enum hk_flags flags) | ||
22 | { | ||
23 | if (static_branch_unlikely(&housekeeping_overriden)) | ||
24 | if (housekeeping_flags & flags) | ||
25 | return cpumask_any_and(housekeeping_mask, cpu_online_mask); | ||
26 | return smp_processor_id(); | ||
27 | } | ||
28 | EXPORT_SYMBOL_GPL(housekeeping_any_cpu); | ||
29 | |||
30 | const struct cpumask *housekeeping_cpumask(enum hk_flags flags) | ||
31 | { | ||
32 | if (static_branch_unlikely(&housekeeping_overriden)) | ||
33 | if (housekeeping_flags & flags) | ||
34 | return housekeeping_mask; | ||
35 | return cpu_possible_mask; | ||
36 | } | ||
37 | EXPORT_SYMBOL_GPL(housekeeping_cpumask); | ||
38 | |||
39 | void housekeeping_affine(struct task_struct *t, enum hk_flags flags) | ||
40 | { | ||
41 | if (static_branch_unlikely(&housekeeping_overriden)) | ||
42 | if (housekeeping_flags & flags) | ||
43 | set_cpus_allowed_ptr(t, housekeeping_mask); | ||
44 | } | ||
45 | EXPORT_SYMBOL_GPL(housekeeping_affine); | ||
46 | |||
47 | bool housekeeping_test_cpu(int cpu, enum hk_flags flags) | ||
48 | { | ||
49 | if (static_branch_unlikely(&housekeeping_overriden)) | ||
50 | if (housekeeping_flags & flags) | ||
51 | return cpumask_test_cpu(cpu, housekeeping_mask); | ||
52 | return true; | ||
53 | } | ||
54 | EXPORT_SYMBOL_GPL(housekeeping_test_cpu); | ||
55 | |||
56 | void __init housekeeping_init(void) | ||
57 | { | ||
58 | if (!housekeeping_flags) | ||
59 | return; | ||
60 | |||
61 | static_branch_enable(&housekeeping_overriden); | ||
62 | |||
63 | /* We need at least one CPU to handle housekeeping work */ | ||
64 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); | ||
65 | } | ||
66 | |||
67 | static int __init housekeeping_setup(char *str, enum hk_flags flags) | ||
68 | { | ||
69 | cpumask_var_t non_housekeeping_mask; | ||
70 | int err; | ||
71 | |||
72 | alloc_bootmem_cpumask_var(&non_housekeeping_mask); | ||
73 | err = cpulist_parse(str, non_housekeeping_mask); | ||
74 | if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) { | ||
75 | pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n"); | ||
76 | free_bootmem_cpumask_var(non_housekeeping_mask); | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | if (!housekeeping_flags) { | ||
81 | alloc_bootmem_cpumask_var(&housekeeping_mask); | ||
82 | cpumask_andnot(housekeeping_mask, | ||
83 | cpu_possible_mask, non_housekeeping_mask); | ||
84 | if (cpumask_empty(housekeeping_mask)) | ||
85 | cpumask_set_cpu(smp_processor_id(), housekeeping_mask); | ||
86 | } else { | ||
87 | cpumask_var_t tmp; | ||
88 | |||
89 | alloc_bootmem_cpumask_var(&tmp); | ||
90 | cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); | ||
91 | if (!cpumask_equal(tmp, housekeeping_mask)) { | ||
92 | pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); | ||
93 | free_bootmem_cpumask_var(tmp); | ||
94 | free_bootmem_cpumask_var(non_housekeeping_mask); | ||
95 | return 0; | ||
96 | } | ||
97 | free_bootmem_cpumask_var(tmp); | ||
98 | } | ||
99 | |||
100 | if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { | ||
101 | if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { | ||
102 | tick_nohz_full_setup(non_housekeeping_mask); | ||
103 | } else { | ||
104 | pr_warn("Housekeeping: nohz unsupported." | ||
105 | " Build with CONFIG_NO_HZ_FULL\n"); | ||
106 | free_bootmem_cpumask_var(non_housekeeping_mask); | ||
107 | return 0; | ||
108 | } | ||
109 | } | ||
110 | |||
111 | housekeeping_flags |= flags; | ||
112 | |||
113 | free_bootmem_cpumask_var(non_housekeeping_mask); | ||
114 | |||
115 | return 1; | ||
116 | } | ||
117 | |||
118 | static int __init housekeeping_nohz_full_setup(char *str) | ||
119 | { | ||
120 | unsigned int flags; | ||
121 | |||
122 | flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; | ||
123 | |||
124 | return housekeeping_setup(str, flags); | ||
125 | } | ||
126 | __setup("nohz_full=", housekeeping_nohz_full_setup); | ||
127 | |||
128 | static int __init housekeeping_isolcpus_setup(char *str) | ||
129 | { | ||
130 | unsigned int flags = 0; | ||
131 | |||
132 | while (isalpha(*str)) { | ||
133 | if (!strncmp(str, "nohz,", 5)) { | ||
134 | str += 5; | ||
135 | flags |= HK_FLAG_TICK; | ||
136 | continue; | ||
137 | } | ||
138 | |||
139 | if (!strncmp(str, "domain,", 7)) { | ||
140 | str += 7; | ||
141 | flags |= HK_FLAG_DOMAIN; | ||
142 | continue; | ||
143 | } | ||
144 | |||
145 | pr_warn("isolcpus: Error, unknown flag\n"); | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | /* Default behaviour for isolcpus without flags */ | ||
150 | if (!flags) | ||
151 | flags |= HK_FLAG_DOMAIN; | ||
152 | |||
153 | return housekeeping_setup(str, flags); | ||
154 | } | ||
155 | __setup("isolcpus=", housekeeping_isolcpus_setup); | ||
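With this parser, isolcpus= now takes an optional flag list ahead of the CPU list, e.g. "isolcpus=nohz,domain,1-7", and a bare "isolcpus=1-7" keeps the old domain-only isolation via the HK_FLAG_DOMAIN default; nohz_full= routes through the same code with the tick/timer/RCU/misc flags. In-kernel users are expected to go through the accessors rather than the now-static housekeeping_mask. A minimal usage sketch, assuming HK_FLAG_MISC is the right flag for the work in question and using made-up demo_ names:

#include <linux/sched/isolation.h>
#include <linux/kthread.h>
#include <linux/err.h>

/* Keep a maintenance kthread off the isolated CPUs. */
static int demo_spawn_housekeeper(int (*threadfn)(void *data), void *data)
{
	struct task_struct *t;

	t = kthread_run(threadfn, data, "demo_housekeeper");
	if (IS_ERR(t))
		return PTR_ERR(t);

	/* No-op unless an isolation mask was set up at boot. */
	housekeeping_affine(t, HK_FLAG_MISC);
	return 0;
}

/* Pick a CPU for offloadable work; falls back to the local CPU when no
 * isolation is configured. */
static int demo_pick_target_cpu(void)
{
	return housekeeping_any_cpu(HK_FLAG_MISC);
}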
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3c96c80e0992..d8c43d73e078 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
74 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 74 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
75 | } | 75 | } |
76 | 76 | ||
77 | #if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) | ||
78 | static void push_irq_work_func(struct irq_work *work); | ||
79 | #endif | ||
80 | |||
81 | void init_rt_rq(struct rt_rq *rt_rq) | 77 | void init_rt_rq(struct rt_rq *rt_rq) |
82 | { | 78 | { |
83 | struct rt_prio_array *array; | 79 | struct rt_prio_array *array; |
@@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq) | |||
97 | rt_rq->rt_nr_migratory = 0; | 93 | rt_rq->rt_nr_migratory = 0; |
98 | rt_rq->overloaded = 0; | 94 | rt_rq->overloaded = 0; |
99 | plist_head_init(&rt_rq->pushable_tasks); | 95 | plist_head_init(&rt_rq->pushable_tasks); |
100 | |||
101 | #ifdef HAVE_RT_PUSH_IPI | ||
102 | rt_rq->push_flags = 0; | ||
103 | rt_rq->push_cpu = nr_cpu_ids; | ||
104 | raw_spin_lock_init(&rt_rq->push_lock); | ||
105 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
106 | #endif | ||
107 | #endif /* CONFIG_SMP */ | 96 | #endif /* CONFIG_SMP */ |
108 | /* We start is dequeued state, because no RT tasks are queued */ | 97 | /* We start is dequeued state, because no RT tasks are queued */ |
109 | rt_rq->rt_queued = 0; | 98 | rt_rq->rt_queued = 0; |
@@ -1876,241 +1865,166 @@ static void push_rt_tasks(struct rq *rq) | |||
1876 | } | 1865 | } |
1877 | 1866 | ||
1878 | #ifdef HAVE_RT_PUSH_IPI | 1867 | #ifdef HAVE_RT_PUSH_IPI |
1868 | |||
1879 | /* | 1869 | /* |
1880 | * The search for the next cpu always starts at rq->cpu and ends | 1870 | * When a high priority task schedules out from a CPU and a lower priority |
1881 | * when we reach rq->cpu again. It will never return rq->cpu. | 1871 | * task is scheduled in, a check is made to see if there are any RT tasks |
1882 | * This returns the next cpu to check, or nr_cpu_ids if the loop | 1872 | * on other CPUs that are waiting to run because a higher priority RT task |
1883 | * is complete. | 1873 | * is currently running on its CPU. In this case, the CPU with multiple RT |
1874 | * tasks queued on it (overloaded) needs to be notified that a CPU has opened | ||
1875 | * up that may be able to run one of its non-running queued RT tasks. | ||
1876 | * | ||
1877 | * All CPUs with overloaded RT tasks need to be notified as there is currently | ||
1878 | * no way to know which of these CPUs has the highest priority task waiting | ||
1879 | * to run. Instead of trying to take a spinlock on each of these CPUs, | ||
1880 | * which has been shown to cause large latency on machines with many | ||
1881 | * CPUs, an IPI is sent to the CPUs to have them push off the overloaded | ||
1882 | * RT tasks waiting to run. | ||
1883 | * | ||
1884 | * Just sending an IPI to each of the CPUs is also an issue, as on machines | ||
1885 | * with large CPU counts this can cause an IPI storm on a CPU, especially | ||
1886 | * if it's the only CPU with multiple RT tasks queued, and a large number | ||
1887 | * of CPUs scheduling a lower priority task at the same time. | ||
1888 | * | ||
1889 | * Each root domain has its own irq work function that can iterate over | ||
1890 | * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT | ||
1891 | * tasks must be checked when one or many CPUs are lowering | ||
1892 | * their priority, there's a single irq work iterator that will try to | ||
1893 | * push off RT tasks that are waiting to run. | ||
1894 | * | ||
1895 | * When a CPU schedules a lower priority task, it will kick off the | ||
1896 | * irq work iterator that will jump to each CPU with overloaded RT tasks. | ||
1897 | * As it only takes the first CPU that schedules a lower priority task | ||
1898 | * to start the process, the rto_loop_start variable is incremented and if | ||
1899 | * the atomic result is one, then that CPU will try to take the rto_lock. | ||
1900 | * This prevents high contention on the lock as the process handles all | ||
1901 | * CPUs scheduling lower priority tasks. | ||
1902 | * | ||
1903 | * All CPUs that are scheduling a lower priority task will increment the | ||
1904 | * rto_loop_next variable. This will make sure that the irq work iterator | ||
1905 | * checks all RT overloaded CPUs whenever a CPU schedules a new lower | ||
1906 | * priority task, even if the iterator is in the middle of a scan. Incrementing | ||
1907 | * the rto_loop_next will cause the iterator to perform another scan. | ||
1884 | * | 1908 | * |
1885 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
1886 | * or if this is the first instance, it must hold rq->cpu. | ||
1887 | */ | 1909 | */ |
1888 | static int rto_next_cpu(struct rq *rq) | 1910 | static int rto_next_cpu(struct rq *rq) |
1889 | { | 1911 | { |
1890 | int prev_cpu = rq->rt.push_cpu; | 1912 | struct root_domain *rd = rq->rd; |
1913 | int next; | ||
1891 | int cpu; | 1914 | int cpu; |
1892 | 1915 | ||
1893 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
1894 | |||
1895 | /* | 1916 | /* |
1896 | * If the previous cpu is less than the rq's CPU, then it already | 1917 | * When starting the IPI RT pushing, the rto_cpu is set to -1, |
1897 | * passed the end of the mask, and has started from the beginning. | 1918 | * so rto_next_cpu() will simply return the first CPU found in |
1898 | * We end if the next CPU is greater or equal to rq's CPU. | 1919 | * the rto_mask. |
1920 | * | ||
1921 | * If rto_next_cpu() is called while rto_cpu is a valid CPU, it | ||
1922 | * will return the next CPU found in the rto_mask. | ||
1923 | * | ||
1924 | * If there are no more CPUs left in the rto_mask, then a check is made | ||
1925 | * against rto_loop and rto_loop_next. rto_loop is only updated with | ||
1926 | * the rto_lock held, but any CPU may increment the rto_loop_next | ||
1927 | * without any locking. | ||
1899 | */ | 1928 | */ |
1900 | if (prev_cpu < rq->cpu) { | 1929 | for (;;) { |
1901 | if (cpu >= rq->cpu) | ||
1902 | return nr_cpu_ids; | ||
1903 | 1930 | ||
1904 | } else if (cpu >= nr_cpu_ids) { | 1931 | /* When rto_cpu is -1 this acts like cpumask_first() */ |
1905 | /* | 1932 | cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); |
1906 | * We passed the end of the mask, start at the beginning. | ||
1907 | * If the result is greater or equal to the rq's CPU, then | ||
1908 | * the loop is finished. | ||
1909 | */ | ||
1910 | cpu = cpumask_first(rq->rd->rto_mask); | ||
1911 | if (cpu >= rq->cpu) | ||
1912 | return nr_cpu_ids; | ||
1913 | } | ||
1914 | rq->rt.push_cpu = cpu; | ||
1915 | 1933 | ||
1916 | /* Return cpu to let the caller know if the loop is finished or not */ | 1934 | rd->rto_cpu = cpu; |
1917 | return cpu; | ||
1918 | } | ||
1919 | 1935 | ||
1920 | static int find_next_push_cpu(struct rq *rq) | 1936 | if (cpu < nr_cpu_ids) |
1921 | { | 1937 | return cpu; |
1922 | struct rq *next_rq; | ||
1923 | int cpu; | ||
1924 | 1938 | ||
1925 | while (1) { | 1939 | rd->rto_cpu = -1; |
1926 | cpu = rto_next_cpu(rq); | 1940 | |
1927 | if (cpu >= nr_cpu_ids) | 1941 | /* |
1928 | break; | 1942 | * ACQUIRE ensures we see the @rto_mask changes |
1929 | next_rq = cpu_rq(cpu); | 1943 | * made prior to the @next value observed. |
1944 | * | ||
1945 | * Matches WMB in rt_set_overload(). | ||
1946 | */ | ||
1947 | next = atomic_read_acquire(&rd->rto_loop_next); | ||
1930 | 1948 | ||
1931 | /* Make sure the next rq can push to this rq */ | 1949 | if (rd->rto_loop == next) |
1932 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
1933 | break; | 1950 | break; |
1951 | |||
1952 | rd->rto_loop = next; | ||
1934 | } | 1953 | } |
1935 | 1954 | ||
1936 | return cpu; | 1955 | return -1; |
1937 | } | 1956 | } |
1938 | 1957 | ||
1939 | #define RT_PUSH_IPI_EXECUTING 1 | 1958 | static inline bool rto_start_trylock(atomic_t *v) |
1940 | #define RT_PUSH_IPI_RESTART 2 | 1959 | { |
1960 | return !atomic_cmpxchg_acquire(v, 0, 1); | ||
1961 | } | ||
1941 | 1962 | ||
1942 | /* | 1963 | static inline void rto_start_unlock(atomic_t *v) |
1943 | * When a high priority task schedules out from a CPU and a lower priority | ||
1944 | * task is scheduled in, a check is made to see if there's any RT tasks | ||
1945 | * on other CPUs that are waiting to run because a higher priority RT task | ||
1946 | * is currently running on its CPU. In this case, the CPU with multiple RT | ||
1947 | * tasks queued on it (overloaded) needs to be notified that a CPU has opened | ||
1948 | * up that may be able to run one of its non-running queued RT tasks. | ||
1949 | * | ||
1950 | * On large CPU boxes, there's the case that several CPUs could schedule | ||
1951 | * a lower priority task at the same time, in which case it will look for | ||
1952 | * any overloaded CPUs that it could pull a task from. To do this, the runqueue | ||
1953 | * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting | ||
1954 | * for a single overloaded CPU's runqueue lock can produce a large latency. | ||
1955 | * (This has actually been observed on large boxes running cyclictest). | ||
1956 | * Instead of taking the runqueue lock of the overloaded CPU, each of the | ||
1957 | * CPUs that scheduled a lower priority task simply sends an IPI to the | ||
1958 | * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with | ||
1959 | * lots of contention. The overloaded CPU will look to push its non-running | ||
1960 | * RT task off, and if it does, it can then ignore the other IPIs coming | ||
1961 | * in, and just pass those IPIs off to any other overloaded CPU. | ||
1962 | * | ||
1963 | * When a CPU schedules a lower priority task, it only sends an IPI to | ||
1964 | * the "next" CPU that has overloaded RT tasks. This prevents IPI storms, | ||
1965 | * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with | ||
1966 | * RT overloaded tasks, would cause 100 IPIs to go out at once. | ||
1967 | * | ||
1968 | * The overloaded RT CPU, when receiving an IPI, will try to push off its | ||
1969 | * overloaded RT tasks and then send an IPI to the next CPU that has | ||
1970 | * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks | ||
1971 | * have completed. Just because a CPU may have pushed off its own overloaded | ||
1972 | * RT task does not mean it should stop sending the IPI around to other | ||
1973 | * overloaded CPUs. There may be another RT task waiting to run on one of | ||
1974 | * those CPUs that are of higher priority than the one that was just | ||
1975 | * pushed. | ||
1976 | * | ||
1977 | * An optimization that could possibly be made is to make a CPU array similar | ||
1978 | * to the cpupri array mask of all running RT tasks, but for the overloaded | ||
1979 | * case, then the IPI could be sent to only the CPU with the highest priority | ||
1980 | * RT task waiting, and that CPU could send off further IPIs to the CPU with | ||
1981 | * the next highest waiting task. Since the overloaded case is much less likely | ||
1982 | * to happen, the complexity of this implementation may not be worth it. | ||
1983 | * Instead, just send an IPI around to all overloaded CPUs. | ||
1984 | * | ||
1985 | * The rq->rt.push_flags holds the status of the IPI that is going around. | ||
1986 | * A run queue can only send out a single IPI at a time. The possible flags | ||
1987 | * for rq->rt.push_flags are: | ||
1988 | * | ||
1989 | * (None or zero): No IPI is going around for the current rq | ||
1990 | * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around | ||
1991 | * RT_PUSH_IPI_RESTART: The priority of the running task for the rq | ||
1992 | * has changed, and the IPI should restart | ||
1993 | * circulating the overloaded CPUs again. | ||
1994 | * | ||
1995 | * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated | ||
1996 | * before sending to the next CPU. | ||
1997 | * | ||
1998 | * Instead of having all CPUs that schedule a lower priority task send | ||
1999 | * an IPI to the same "first" CPU in the RT overload mask, they send it | ||
2000 | * to the next overloaded CPU after their own CPU. This helps distribute | ||
2001 | * the work when there's more than one overloaded CPU and multiple CPUs | ||
2002 | * scheduling in lower priority tasks. | ||
2003 | * | ||
2004 | * When a rq schedules a lower priority task than what was currently | ||
2005 | * running, the next CPU with overloaded RT tasks is examined first. | ||
2006 | * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower | ||
2007 | * priority task, it will send an IPI first to CPU 5, then CPU 5 will | ||
2008 | * send to CPU 1 if it is still overloaded. CPU 1 will clear the | ||
2009 | * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set. | ||
2010 | * | ||
2011 | * The first CPU to notice IPI_RESTART is set, will clear that flag and then | ||
2012 | * send an IPI to the next overloaded CPU after the rq->cpu and not the next | ||
2013 | * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3 | ||
2014 | * schedules a lower priority task, and the IPI_RESTART gets set while the | ||
2015 | * handling is being done on CPU 5, it will clear the flag and send it back to | ||
2016 | * CPU 4 instead of CPU 1. | ||
2017 | * | ||
2018 | * Note, the above logic can be disabled by turning off the sched_feature | ||
2019 | * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be | ||
2020 | * taken by the CPU requesting a pull and the waiting RT task will be pulled | ||
2021 | * by that CPU. This may be fine for machines with few CPUs. | ||
2022 | */ | ||
2023 | static void tell_cpu_to_push(struct rq *rq) | ||
2024 | { | 1964 | { |
2025 | int cpu; | 1965 | atomic_set_release(v, 0); |
1966 | } | ||
2026 | 1967 | ||
2027 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | 1968 | static void tell_cpu_to_push(struct rq *rq) |
2028 | raw_spin_lock(&rq->rt.push_lock); | 1969 | { |
2029 | /* Make sure it's still executing */ | 1970 | int cpu = -1; |
2030 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
2031 | /* | ||
2032 | * Tell the IPI to restart the loop as things have | ||
2033 | * changed since it started. | ||
2034 | */ | ||
2035 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
2036 | raw_spin_unlock(&rq->rt.push_lock); | ||
2037 | return; | ||
2038 | } | ||
2039 | raw_spin_unlock(&rq->rt.push_lock); | ||
2040 | } | ||
2041 | 1971 | ||
2042 | /* When here, there's no IPI going around */ | 1972 | /* Keep the loop going if the IPI is currently active */ |
1973 | atomic_inc(&rq->rd->rto_loop_next); | ||
2043 | 1974 | ||
2044 | rq->rt.push_cpu = rq->cpu; | 1975 | /* Only one CPU can initiate a loop at a time */ |
2045 | cpu = find_next_push_cpu(rq); | 1976 | if (!rto_start_trylock(&rq->rd->rto_loop_start)) |
2046 | if (cpu >= nr_cpu_ids) | ||
2047 | return; | 1977 | return; |
2048 | 1978 | ||
2049 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | 1979 | raw_spin_lock(&rq->rd->rto_lock); |
1980 | |||
1981 | /* | ||
1982 | * The rto_cpu is updated under the lock; if it has a valid CPU | ||
1983 | * then the IPI is still running and will continue due to the | ||
1984 | * update to loop_next, and nothing needs to be done here. | ||
1985 | * Otherwise it is finishing up and an ipi needs to be sent. | ||
1986 | */ | ||
1987 | if (rq->rd->rto_cpu < 0) | ||
1988 | cpu = rto_next_cpu(rq); | ||
2050 | 1989 | ||
2051 | irq_work_queue_on(&rq->rt.push_work, cpu); | 1990 | raw_spin_unlock(&rq->rd->rto_lock); |
1991 | |||
1992 | rto_start_unlock(&rq->rd->rto_loop_start); | ||
1993 | |||
1994 | if (cpu >= 0) | ||
1995 | irq_work_queue_on(&rq->rd->rto_push_work, cpu); | ||
2052 | } | 1996 | } |
2053 | 1997 | ||
2054 | /* Called from hardirq context */ | 1998 | /* Called from hardirq context */ |
2055 | static void try_to_push_tasks(void *arg) | 1999 | void rto_push_irq_work_func(struct irq_work *work) |
2056 | { | 2000 | { |
2057 | struct rt_rq *rt_rq = arg; | 2001 | struct rq *rq; |
2058 | struct rq *rq, *src_rq; | ||
2059 | int this_cpu; | ||
2060 | int cpu; | 2002 | int cpu; |
2061 | 2003 | ||
2062 | this_cpu = rt_rq->push_cpu; | 2004 | rq = this_rq(); |
2063 | 2005 | ||
2064 | /* Paranoid check */ | 2006 | /* |
2065 | BUG_ON(this_cpu != smp_processor_id()); | 2007 | * We do not need to grab the lock to check for has_pushable_tasks. |
2066 | 2008 | * When it gets updated, a check is made if a push is possible. | |
2067 | rq = cpu_rq(this_cpu); | 2009 | */ |
2068 | src_rq = rq_of_rt_rq(rt_rq); | ||
2069 | |||
2070 | again: | ||
2071 | if (has_pushable_tasks(rq)) { | 2010 | if (has_pushable_tasks(rq)) { |
2072 | raw_spin_lock(&rq->lock); | 2011 | raw_spin_lock(&rq->lock); |
2073 | push_rt_task(rq); | 2012 | push_rt_tasks(rq); |
2074 | raw_spin_unlock(&rq->lock); | 2013 | raw_spin_unlock(&rq->lock); |
2075 | } | 2014 | } |
2076 | 2015 | ||
2077 | /* Pass the IPI to the next rt overloaded queue */ | 2016 | raw_spin_lock(&rq->rd->rto_lock); |
2078 | raw_spin_lock(&rt_rq->push_lock); | ||
2079 | /* | ||
2080 | * If the source queue changed since the IPI went out, | ||
2081 | * we need to restart the search from that CPU again. | ||
2082 | */ | ||
2083 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
2084 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
2085 | rt_rq->push_cpu = src_rq->cpu; | ||
2086 | } | ||
2087 | 2017 | ||
2088 | cpu = find_next_push_cpu(src_rq); | 2018 | /* Pass the IPI to the next rt overloaded queue */ |
2019 | cpu = rto_next_cpu(rq); | ||
2089 | 2020 | ||
2090 | if (cpu >= nr_cpu_ids) | 2021 | raw_spin_unlock(&rq->rd->rto_lock); |
2091 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
2092 | raw_spin_unlock(&rt_rq->push_lock); | ||
2093 | 2022 | ||
2094 | if (cpu >= nr_cpu_ids) | 2023 | if (cpu < 0) |
2095 | return; | 2024 | return; |
2096 | 2025 | ||
2097 | /* | ||
2098 | * It is possible that a restart caused this CPU to be | ||
2099 | * chosen again. Don't bother with an IPI, just see if we | ||
2100 | * have more to push. | ||
2101 | */ | ||
2102 | if (unlikely(cpu == rq->cpu)) | ||
2103 | goto again; | ||
2104 | |||
2105 | /* Try the next RT overloaded CPU */ | 2026 | /* Try the next RT overloaded CPU */ |
2106 | irq_work_queue_on(&rt_rq->push_work, cpu); | 2027 | irq_work_queue_on(&rq->rd->rto_push_work, cpu); |
2107 | } | ||
2108 | |||
2109 | static void push_irq_work_func(struct irq_work *work) | ||
2110 | { | ||
2111 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
2112 | |||
2113 | try_to_push_tasks(rt_rq); | ||
2114 | } | 2028 | } |
2115 | #endif /* HAVE_RT_PUSH_IPI */ | 2029 | #endif /* HAVE_RT_PUSH_IPI */ |
2116 | 2030 | ||
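The push state that used to live in each rt_rq now lives in the root domain, and the protocol is a lock-light generation counter: every CPU that drops to a lower-priority task bumps rto_loop_next, rto_loop_start makes sure only one CPU kicks off the irq-work iterator, and rto_cpu/rto_loop advance only under rto_lock until rto_loop has caught up with rto_loop_next. Stripped of the scheduler specifics, the idiom looks roughly like this (a sketch with made-up demo_ names, not the patch itself):

#include <linux/atomic.h>
#include <linux/spinlock.h>

static atomic_t demo_loop_start = ATOMIC_INIT(0);	/* one iterator at a time */
static atomic_t demo_loop_next  = ATOMIC_INIT(0);	/* bumped by requesters   */
static int demo_loop;					/* only under demo_lock   */
static DEFINE_RAW_SPINLOCK(demo_lock);

static void demo_request_pass(void)
{
	/* Ask for (at least) one more full pass, even mid-scan. */
	atomic_inc(&demo_loop_next);

	/* Only the first concurrent requester kicks off the iterator. */
	if (atomic_cmpxchg_acquire(&demo_loop_start, 0, 1))
		return;

	raw_spin_lock(&demo_lock);
	/* start or re-arm the iterator here (irq_work + IPI in the patch) */
	raw_spin_unlock(&demo_lock);

	atomic_set_release(&demo_loop_start, 0);
}

/* Iterator side, called with demo_lock held once the CPU list is exhausted. */
static bool demo_more_passes_requested(void)
{
	int next = atomic_read_acquire(&demo_loop_next);

	if (demo_loop == next)
		return false;	/* nobody asked since this pass started */

	demo_loop = next;	/* consume the request and scan again */
	return true;
}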
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3b448ba82225..45ab0bf564e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -227,7 +227,7 @@ struct dl_bw { | |||
227 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); | 227 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); |
228 | 228 | ||
229 | static inline | 229 | static inline |
230 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) | 230 | void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) |
231 | { | 231 | { |
232 | dl_b->total_bw -= tsk_bw; | 232 | dl_b->total_bw -= tsk_bw; |
233 | __dl_update(dl_b, (s32)tsk_bw / cpus); | 233 | __dl_update(dl_b, (s32)tsk_bw / cpus); |
@@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy, | |||
256 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); | 256 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); |
257 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); | 257 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); |
258 | extern bool __checkparam_dl(const struct sched_attr *attr); | 258 | extern bool __checkparam_dl(const struct sched_attr *attr); |
259 | extern void __dl_clear_params(struct task_struct *p); | ||
260 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); | 259 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); |
261 | extern int dl_task_can_attach(struct task_struct *p, | 260 | extern int dl_task_can_attach(struct task_struct *p, |
262 | const struct cpumask *cs_cpus_allowed); | 261 | const struct cpumask *cs_cpus_allowed); |
@@ -419,6 +418,7 @@ struct cfs_bandwidth { }; | |||
419 | /* CFS-related fields in a runqueue */ | 418 | /* CFS-related fields in a runqueue */ |
420 | struct cfs_rq { | 419 | struct cfs_rq { |
421 | struct load_weight load; | 420 | struct load_weight load; |
421 | unsigned long runnable_weight; | ||
422 | unsigned int nr_running, h_nr_running; | 422 | unsigned int nr_running, h_nr_running; |
423 | 423 | ||
424 | u64 exec_clock; | 424 | u64 exec_clock; |
@@ -444,18 +444,22 @@ struct cfs_rq { | |||
444 | * CFS load tracking | 444 | * CFS load tracking |
445 | */ | 445 | */ |
446 | struct sched_avg avg; | 446 | struct sched_avg avg; |
447 | u64 runnable_load_sum; | ||
448 | unsigned long runnable_load_avg; | ||
449 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
450 | unsigned long tg_load_avg_contrib; | ||
451 | unsigned long propagate_avg; | ||
452 | #endif | ||
453 | atomic_long_t removed_load_avg, removed_util_avg; | ||
454 | #ifndef CONFIG_64BIT | 447 | #ifndef CONFIG_64BIT |
455 | u64 load_last_update_time_copy; | 448 | u64 load_last_update_time_copy; |
456 | #endif | 449 | #endif |
450 | struct { | ||
451 | raw_spinlock_t lock ____cacheline_aligned; | ||
452 | int nr; | ||
453 | unsigned long load_avg; | ||
454 | unsigned long util_avg; | ||
455 | unsigned long runnable_sum; | ||
456 | } removed; | ||
457 | 457 | ||
458 | #ifdef CONFIG_FAIR_GROUP_SCHED | 458 | #ifdef CONFIG_FAIR_GROUP_SCHED |
459 | unsigned long tg_load_avg_contrib; | ||
460 | long propagate; | ||
461 | long prop_runnable_sum; | ||
462 | |||
459 | /* | 463 | /* |
460 | * h_load = weight * f(tg) | 464 | * h_load = weight * f(tg) |
461 | * | 465 | * |
@@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void) | |||
502 | } | 506 | } |
503 | 507 | ||
504 | /* RT IPI pull logic requires IRQ_WORK */ | 508 | /* RT IPI pull logic requires IRQ_WORK */ |
505 | #ifdef CONFIG_IRQ_WORK | 509 | #if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) |
506 | # define HAVE_RT_PUSH_IPI | 510 | # define HAVE_RT_PUSH_IPI |
507 | #endif | 511 | #endif |
508 | 512 | ||
@@ -524,12 +528,6 @@ struct rt_rq { | |||
524 | unsigned long rt_nr_total; | 528 | unsigned long rt_nr_total; |
525 | int overloaded; | 529 | int overloaded; |
526 | struct plist_head pushable_tasks; | 530 | struct plist_head pushable_tasks; |
527 | #ifdef HAVE_RT_PUSH_IPI | ||
528 | int push_flags; | ||
529 | int push_cpu; | ||
530 | struct irq_work push_work; | ||
531 | raw_spinlock_t push_lock; | ||
532 | #endif | ||
533 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
534 | int rt_queued; | 532 | int rt_queued; |
535 | 533 | ||
@@ -638,6 +636,19 @@ struct root_domain { | |||
638 | struct dl_bw dl_bw; | 636 | struct dl_bw dl_bw; |
639 | struct cpudl cpudl; | 637 | struct cpudl cpudl; |
640 | 638 | ||
639 | #ifdef HAVE_RT_PUSH_IPI | ||
640 | /* | ||
641 | * For IPI pull requests, loop across the rto_mask. | ||
642 | */ | ||
643 | struct irq_work rto_push_work; | ||
644 | raw_spinlock_t rto_lock; | ||
645 | /* These are only updated and read within rto_lock */ | ||
646 | int rto_loop; | ||
647 | int rto_cpu; | ||
648 | /* These atomics are updated outside of a lock */ | ||
649 | atomic_t rto_loop_next; | ||
650 | atomic_t rto_loop_start; | ||
651 | #endif | ||
641 | /* | 652 | /* |
642 | * The "RT overload" flag: it gets set if a CPU has more than | 653 | * The "RT overload" flag: it gets set if a CPU has more than |
643 | * one runnable RT task. | 654 | * one runnable RT task. |
@@ -655,6 +666,9 @@ extern void init_defrootdomain(void); | |||
655 | extern int sched_init_domains(const struct cpumask *cpu_map); | 666 | extern int sched_init_domains(const struct cpumask *cpu_map); |
656 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); | 667 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); |
657 | 668 | ||
669 | #ifdef HAVE_RT_PUSH_IPI | ||
670 | extern void rto_push_irq_work_func(struct irq_work *work); | ||
671 | #endif | ||
658 | #endif /* CONFIG_SMP */ | 672 | #endif /* CONFIG_SMP */ |
659 | 673 | ||
660 | /* | 674 | /* |
@@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1219 | # define const_debug const | 1233 | # define const_debug const |
1220 | #endif | 1234 | #endif |
1221 | 1235 | ||
1222 | extern const_debug unsigned int sysctl_sched_features; | ||
1223 | |||
1224 | #define SCHED_FEAT(name, enabled) \ | 1236 | #define SCHED_FEAT(name, enabled) \ |
1225 | __SCHED_FEAT_##name , | 1237 | __SCHED_FEAT_##name , |
1226 | 1238 | ||
@@ -1232,6 +1244,13 @@ enum { | |||
1232 | #undef SCHED_FEAT | 1244 | #undef SCHED_FEAT |
1233 | 1245 | ||
1234 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | 1246 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) |
1247 | |||
1248 | /* | ||
1249 | * To support run-time toggling of sched features, all the translation units | ||
1250 | * (but core.c) reference the sysctl_sched_features defined in core.c. | ||
1251 | */ | ||
1252 | extern const_debug unsigned int sysctl_sched_features; | ||
1253 | |||
1235 | #define SCHED_FEAT(name, enabled) \ | 1254 | #define SCHED_FEAT(name, enabled) \ |
1236 | static __always_inline bool static_branch_##name(struct static_key *key) \ | 1255 | static __always_inline bool static_branch_##name(struct static_key *key) \ |
1237 | { \ | 1256 | { \ |
@@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ | |||
1239 | } | 1258 | } |
1240 | 1259 | ||
1241 | #include "features.h" | 1260 | #include "features.h" |
1242 | |||
1243 | #undef SCHED_FEAT | 1261 | #undef SCHED_FEAT |
1244 | 1262 | ||
1245 | extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | 1263 | extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; |
1246 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) | 1264 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) |
1265 | |||
1247 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ | 1266 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ |
1267 | |||
1268 | /* | ||
1269 | * Each translation unit has its own copy of sysctl_sched_features to allow | ||
1270 | * constants propagation at compile time and compiler optimization based on | ||
1271 | * features default. | ||
1272 | */ | ||
1273 | #define SCHED_FEAT(name, enabled) \ | ||
1274 | (1UL << __SCHED_FEAT_##name) * enabled | | ||
1275 | static const_debug __maybe_unused unsigned int sysctl_sched_features = | ||
1276 | #include "features.h" | ||
1277 | 0; | ||
1278 | #undef SCHED_FEAT | ||
1279 | |||
1248 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 1280 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
1281 | |||
1249 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 1282 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
1250 | 1283 | ||
1251 | extern struct static_key_false sched_numa_balancing; | 1284 | extern struct static_key_false sched_numa_balancing; |
@@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void); | |||
1530 | extern void init_sched_rt_class(void); | 1563 | extern void init_sched_rt_class(void); |
1531 | extern void init_sched_fair_class(void); | 1564 | extern void init_sched_fair_class(void); |
1532 | 1565 | ||
1566 | extern void reweight_task(struct task_struct *p, int prio); | ||
1567 | |||
1533 | extern void resched_curr(struct rq *rq); | 1568 | extern void resched_curr(struct rq *rq); |
1534 | extern void resched_cpu(int cpu); | 1569 | extern void resched_cpu(int cpu); |
1535 | 1570 | ||
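The sched_feat() split deserves a note: with SCHED_DEBUG and jump labels the features stay runtime-toggleable static branches, but in the !SCHED_DEBUG case every translation unit now gets its own const copy of the feature bitmask built straight from features.h, so the compiler can constant-fold sched_feat() and drop dead branches. A standalone sketch of what that boils down to, for a hypothetical two-entry features.h (demo_ names are made up):

/* Imagine features.h contained only:
 *	SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 *	SCHED_FEAT(TTWU_QUEUE, false)
 */
enum {
	DEMO_FEAT_GENTLE_FAIR_SLEEPERS,
	DEMO_FEAT_TTWU_QUEUE,
	DEMO_FEAT_NR,
};

static const unsigned int demo_sched_features =
	(1UL << DEMO_FEAT_GENTLE_FAIR_SLEEPERS) * 1 |	/* enabled  */
	(1UL << DEMO_FEAT_TTWU_QUEUE) * 0 |		/* disabled */
	0;

#define demo_sched_feat(x) \
	(demo_sched_features & (1UL << DEMO_FEAT_##x))

/*
 * demo_sched_feat(TTWU_QUEUE) is the compile-time constant 0, so an
 * 'if (demo_sched_feat(TTWU_QUEUE)) { ... }' body can be eliminated
 * entirely, which is the point of dropping the extern for !SCHED_DEBUG
 * builds.
 */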
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6798276d29af..034cbed7f88b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
@@ -4,6 +4,7 @@ | |||
4 | */ | 4 | */ |
5 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
7 | #include <linux/sched/isolation.h> | ||
7 | 8 | ||
8 | #include "sched.h" | 9 | #include "sched.h" |
9 | 10 | ||
@@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd) | |||
269 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | 270 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
270 | goto free_dlo_mask; | 271 | goto free_dlo_mask; |
271 | 272 | ||
273 | #ifdef HAVE_RT_PUSH_IPI | ||
274 | rd->rto_cpu = -1; | ||
275 | raw_spin_lock_init(&rd->rto_lock); | ||
276 | init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); | ||
277 | #endif | ||
278 | |||
272 | init_dl_bw(&rd->dl_bw); | 279 | init_dl_bw(&rd->dl_bw); |
273 | if (cpudl_init(&rd->cpudl) != 0) | 280 | if (cpudl_init(&rd->cpudl) != 0) |
274 | goto free_rto_mask; | 281 | goto free_rto_mask; |
@@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
464 | update_top_cache_domain(cpu); | 471 | update_top_cache_domain(cpu); |
465 | } | 472 | } |
466 | 473 | ||
467 | /* Setup the mask of CPUs configured for isolated domains */ | ||
468 | static int __init isolated_cpu_setup(char *str) | ||
469 | { | ||
470 | int ret; | ||
471 | |||
472 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
473 | ret = cpulist_parse(str, cpu_isolated_map); | ||
474 | if (ret) { | ||
475 | pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); | ||
476 | return 0; | ||
477 | } | ||
478 | return 1; | ||
479 | } | ||
480 | __setup("isolcpus=", isolated_cpu_setup); | ||
481 | |||
482 | struct s_data { | 474 | struct s_data { |
483 | struct sched_domain ** __percpu sd; | 475 | struct sched_domain ** __percpu sd; |
484 | struct root_domain *rd; | 476 | struct root_domain *rd; |
@@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl, | |||
1158 | sd->smt_gain = 1178; /* ~15% */ | 1150 | sd->smt_gain = 1178; /* ~15% */ |
1159 | 1151 | ||
1160 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | 1152 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { |
1153 | sd->flags |= SD_PREFER_SIBLING; | ||
1161 | sd->imbalance_pct = 117; | 1154 | sd->imbalance_pct = 117; |
1162 | sd->cache_nice_tries = 1; | 1155 | sd->cache_nice_tries = 1; |
1163 | sd->busy_idx = 2; | 1156 | sd->busy_idx = 2; |
@@ -1332,6 +1325,10 @@ void sched_init_numa(void) | |||
1332 | if (!sched_domains_numa_distance) | 1325 | if (!sched_domains_numa_distance) |
1333 | return; | 1326 | return; |
1334 | 1327 | ||
1328 | /* Includes NUMA identity node at level 0. */ | ||
1329 | sched_domains_numa_distance[level++] = curr_distance; | ||
1330 | sched_domains_numa_levels = level; | ||
1331 | |||
1335 | /* | 1332 | /* |
1336 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | 1333 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the |
1337 | * unique distances in the node_distance() table. | 1334 | * unique distances in the node_distance() table. |
@@ -1379,8 +1376,7 @@ void sched_init_numa(void) | |||
1379 | return; | 1376 | return; |
1380 | 1377 | ||
1381 | /* | 1378 | /* |
1382 | * 'level' contains the number of unique distances, excluding the | 1379 | * 'level' contains the number of unique distances |
1383 | * identity distance node_distance(i,i). | ||
1384 | * | 1380 | * |
1385 | * The sched_domains_numa_distance[] array includes the actual distance | 1381 | * The sched_domains_numa_distance[] array includes the actual distance |
1386 | * numbers. | 1382 | * numbers. |
@@ -1442,9 +1438,18 @@ void sched_init_numa(void) | |||
1442 | tl[i] = sched_domain_topology[i]; | 1438 | tl[i] = sched_domain_topology[i]; |
1443 | 1439 | ||
1444 | /* | 1440 | /* |
1441 | * Add the NUMA identity distance, aka single NODE. | ||
1442 | */ | ||
1443 | tl[i++] = (struct sched_domain_topology_level){ | ||
1444 | .mask = sd_numa_mask, | ||
1445 | .numa_level = 0, | ||
1446 | SD_INIT_NAME(NODE) | ||
1447 | }; | ||
1448 | |||
1449 | /* | ||
1445 | * .. and append 'j' levels of NUMA goodness. | 1450 | * .. and append 'j' levels of NUMA goodness. |
1446 | */ | 1451 | */ |
1447 | for (j = 0; j < level; i++, j++) { | 1452 | for (j = 1; j < level; i++, j++) { |
1448 | tl[i] = (struct sched_domain_topology_level){ | 1453 | tl[i] = (struct sched_domain_topology_level){ |
1449 | .mask = sd_numa_mask, | 1454 | .mask = sd_numa_mask, |
1450 | .sd_flags = cpu_numa_flags, | 1455 | .sd_flags = cpu_numa_flags, |
@@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map) | |||
1774 | doms_cur = alloc_sched_domains(ndoms_cur); | 1779 | doms_cur = alloc_sched_domains(ndoms_cur); |
1775 | if (!doms_cur) | 1780 | if (!doms_cur) |
1776 | doms_cur = &fallback_doms; | 1781 | doms_cur = &fallback_doms; |
1777 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 1782 | cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN)); |
1778 | err = build_sched_domains(doms_cur[0], NULL); | 1783 | err = build_sched_domains(doms_cur[0], NULL); |
1779 | register_sched_domain_sysctl(); | 1784 | register_sched_domain_sysctl(); |
1780 | 1785 | ||
@@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | |||
1857 | doms_new = alloc_sched_domains(1); | 1862 | doms_new = alloc_sched_domains(1); |
1858 | if (doms_new) { | 1863 | if (doms_new) { |
1859 | n = 1; | 1864 | n = 1; |
1860 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 1865 | cpumask_and(doms_new[0], cpu_active_mask, |
1866 | housekeeping_cpumask(HK_FLAG_DOMAIN)); | ||
1861 | } | 1867 | } |
1862 | } else { | 1868 | } else { |
1863 | n = ndoms_new; | 1869 | n = ndoms_new; |
@@ -1880,7 +1886,8 @@ match1: | |||
1880 | if (!doms_new) { | 1886 | if (!doms_new) { |
1881 | n = 0; | 1887 | n = 0; |
1882 | doms_new = &fallback_doms; | 1888 | doms_new = &fallback_doms; |
1883 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 1889 | cpumask_and(doms_new[0], cpu_active_mask, |
1890 | housekeeping_cpumask(HK_FLAG_DOMAIN)); | ||
1884 | } | 1891 | } |
1885 | 1892 | ||
1886 | /* Build new domains: */ | 1893 | /* Build new domains: */ |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index dd4b7b492c9b..99578f06c8d4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/irq_work.h> | 27 | #include <linux/irq_work.h> |
28 | #include <linux/posix-timers.h> | 28 | #include <linux/posix-timers.h> |
29 | #include <linux/context_tracking.h> | 29 | #include <linux/context_tracking.h> |
30 | #include <linux/mm.h> | ||
30 | 31 | ||
31 | #include <asm/irq_regs.h> | 32 | #include <asm/irq_regs.h> |
32 | 33 | ||
@@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
165 | 166 | ||
166 | #ifdef CONFIG_NO_HZ_FULL | 167 | #ifdef CONFIG_NO_HZ_FULL |
167 | cpumask_var_t tick_nohz_full_mask; | 168 | cpumask_var_t tick_nohz_full_mask; |
168 | cpumask_var_t housekeeping_mask; | ||
169 | bool tick_nohz_full_running; | 169 | bool tick_nohz_full_running; |
170 | static atomic_t tick_dep_mask; | 170 | static atomic_t tick_dep_mask; |
171 | 171 | ||
@@ -385,20 +385,13 @@ out: | |||
385 | local_irq_restore(flags); | 385 | local_irq_restore(flags); |
386 | } | 386 | } |
387 | 387 | ||
388 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 388 | /* Get the boot-time nohz CPU list from the kernel parameters. */ |
389 | static int __init tick_nohz_full_setup(char *str) | 389 | void __init tick_nohz_full_setup(cpumask_var_t cpumask) |
390 | { | 390 | { |
391 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); | 391 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
392 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { | 392 | cpumask_copy(tick_nohz_full_mask, cpumask); |
393 | pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); | ||
394 | free_bootmem_cpumask_var(tick_nohz_full_mask); | ||
395 | return 1; | ||
396 | } | ||
397 | tick_nohz_full_running = true; | 393 | tick_nohz_full_running = true; |
398 | |||
399 | return 1; | ||
400 | } | 394 | } |
401 | __setup("nohz_full=", tick_nohz_full_setup); | ||
402 | 395 | ||
403 | static int tick_nohz_cpu_down(unsigned int cpu) | 396 | static int tick_nohz_cpu_down(unsigned int cpu) |
404 | { | 397 | { |
@@ -437,13 +430,6 @@ void __init tick_nohz_init(void) | |||
437 | return; | 430 | return; |
438 | } | 431 | } |
439 | 432 | ||
440 | if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { | ||
441 | WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); | ||
442 | cpumask_clear(tick_nohz_full_mask); | ||
443 | tick_nohz_full_running = false; | ||
444 | return; | ||
445 | } | ||
446 | |||
447 | /* | 433 | /* |
448 | * Full dynticks uses irq work to drive the tick rescheduling on safe | 434 | * Full dynticks uses irq work to drive the tick rescheduling on safe |
449 | * locking contexts. But then we need irq work to raise its own | 435 | * locking contexts. But then we need irq work to raise its own |
@@ -452,7 +438,6 @@ void __init tick_nohz_init(void) | |||
452 | if (!arch_irq_work_has_interrupt()) { | 438 | if (!arch_irq_work_has_interrupt()) { |
453 | pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); | 439 | pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); |
454 | cpumask_clear(tick_nohz_full_mask); | 440 | cpumask_clear(tick_nohz_full_mask); |
455 | cpumask_copy(housekeeping_mask, cpu_possible_mask); | ||
456 | tick_nohz_full_running = false; | 441 | tick_nohz_full_running = false; |
457 | return; | 442 | return; |
458 | } | 443 | } |
@@ -465,9 +450,6 @@ void __init tick_nohz_init(void) | |||
465 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); | 450 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
466 | } | 451 | } |
467 | 452 | ||
468 | cpumask_andnot(housekeeping_mask, | ||
469 | cpu_possible_mask, tick_nohz_full_mask); | ||
470 | |||
471 | for_each_cpu(cpu, tick_nohz_full_mask) | 453 | for_each_cpu(cpu, tick_nohz_full_mask) |
472 | context_tracking_cpu_set(cpu); | 454 | context_tracking_cpu_set(cpu); |
473 | 455 | ||
@@ -477,12 +459,6 @@ void __init tick_nohz_init(void) | |||
477 | WARN_ON(ret < 0); | 459 | WARN_ON(ret < 0); |
478 | pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", | 460 | pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", |
479 | cpumask_pr_args(tick_nohz_full_mask)); | 461 | cpumask_pr_args(tick_nohz_full_mask)); |
480 | |||
481 | /* | ||
482 | * We need at least one CPU to handle housekeeping work such | ||
483 | * as timekeeping, unbound timers, workqueues, ... | ||
484 | */ | ||
485 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); | ||
486 | } | 462 | } |
487 | #endif | 463 | #endif |
488 | 464 | ||
@@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
787 | if (!ts->tick_stopped) { | 763 | if (!ts->tick_stopped) { |
788 | calc_load_nohz_start(); | 764 | calc_load_nohz_start(); |
789 | cpu_load_update_nohz_start(); | 765 | cpu_load_update_nohz_start(); |
766 | quiet_vmstat(); | ||
790 | 767 | ||
791 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 768 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
792 | ts->tick_stopped = 1; | 769 | ts->tick_stopped = 1; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c738e764e2a5..90db994ac900 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, | |||
921 | 921 | ||
922 | trace_assign_type(field, iter->ent); | 922 | trace_assign_type(field, iter->ent); |
923 | 923 | ||
924 | T = __task_state_to_char(field->next_state); | 924 | T = task_index_to_char(field->next_state); |
925 | S = __task_state_to_char(field->prev_state); | 925 | S = task_index_to_char(field->prev_state); |
926 | trace_find_cmdline(field->next_pid, comm); | 926 | trace_find_cmdline(field->next_pid, comm); |
927 | trace_seq_printf(&iter->seq, | 927 | trace_seq_printf(&iter->seq, |
928 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", | 928 | " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", |
@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) | |||
957 | trace_assign_type(field, iter->ent); | 957 | trace_assign_type(field, iter->ent); |
958 | 958 | ||
959 | if (!S) | 959 | if (!S) |
960 | S = __task_state_to_char(field->prev_state); | 960 | S = task_index_to_char(field->prev_state); |
961 | T = __task_state_to_char(field->next_state); | 961 | T = task_index_to_char(field->next_state); |
962 | trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", | 962 | trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", |
963 | field->prev_pid, | 963 | field->prev_pid, |
964 | field->prev_prio, | 964 | field->prev_prio, |
@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) | |||
993 | trace_assign_type(field, iter->ent); | 993 | trace_assign_type(field, iter->ent); |
994 | 994 | ||
995 | if (!S) | 995 | if (!S) |
996 | S = __task_state_to_char(field->prev_state); | 996 | S = task_index_to_char(field->prev_state); |
997 | T = __task_state_to_char(field->next_state); | 997 | T = task_index_to_char(field->next_state); |
998 | 998 | ||
999 | SEQ_PUT_HEX_FIELD(s, field->prev_pid); | 999 | SEQ_PUT_HEX_FIELD(s, field->prev_pid); |
1000 | SEQ_PUT_HEX_FIELD(s, field->prev_prio); | 1000 | SEQ_PUT_HEX_FIELD(s, field->prev_prio); |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7d461dcd4831..a86b303e6c67 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr, | |||
398 | entry = ring_buffer_event_data(event); | 398 | entry = ring_buffer_event_data(event); |
399 | entry->prev_pid = prev->pid; | 399 | entry->prev_pid = prev->pid; |
400 | entry->prev_prio = prev->prio; | 400 | entry->prev_prio = prev->prio; |
401 | entry->prev_state = __get_task_state(prev); | 401 | entry->prev_state = task_state_index(prev); |
402 | entry->next_pid = next->pid; | 402 | entry->next_pid = next->pid; |
403 | entry->next_prio = next->prio; | 403 | entry->next_prio = next->prio; |
404 | entry->next_state = __get_task_state(next); | 404 | entry->next_state = task_state_index(next); |
405 | entry->next_cpu = task_cpu(next); | 405 | entry->next_cpu = task_cpu(next); |
406 | 406 | ||
407 | if (!call_filter_check_discard(call, entry, buffer, event)) | 407 | if (!call_filter_check_discard(call, entry, buffer, event)) |
@@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
426 | entry = ring_buffer_event_data(event); | 426 | entry = ring_buffer_event_data(event); |
427 | entry->prev_pid = curr->pid; | 427 | entry->prev_pid = curr->pid; |
428 | entry->prev_prio = curr->prio; | 428 | entry->prev_prio = curr->prio; |
429 | entry->prev_state = __get_task_state(curr); | 429 | entry->prev_state = task_state_index(curr); |
430 | entry->next_pid = wakee->pid; | 430 | entry->next_pid = wakee->pid; |
431 | entry->next_prio = wakee->prio; | 431 | entry->next_prio = wakee->prio; |
432 | entry->next_state = __get_task_state(wakee); | 432 | entry->next_state = task_state_index(wakee); |
433 | entry->next_cpu = task_cpu(wakee); | 433 | entry->next_cpu = task_cpu(wakee); |
434 | 434 | ||
435 | if (!call_filter_check_discard(call, entry, buffer, event)) | 435 | if (!call_filter_check_discard(call, entry, buffer, event)) |
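The two tracing files above pick up the renamed task-state helpers: __get_task_state() becomes task_state_index() and __task_state_to_char() becomes task_index_to_char(), so the sched_switch/sched_wakeup entries record a compact state index and the output paths turn it back into the usual one-letter codes. A paraphrased sketch of that contract (the *_sketch names and bodies are my approximation, not code from this commit):

#include <linux/bitops.h>
#include <linux/sched.h>

/* Hypothetical sketch of task_state_index(): compress the reportable
 * TASK_* bits into a dense index, 0 meaning runnable. */
static inline unsigned int task_state_index_sketch(struct task_struct *tsk)
{
	return fls(tsk->state & TASK_REPORT);
}

/* Hypothetical sketch of task_index_to_char(): map that index back to
 * the familiar letters (Running, Sleeping, Disk sleep, sTopped, traced,
 * eXited, Zombie, Parked, Idle). */
static inline char task_index_to_char_sketch(unsigned int state)
{
	static const char state_char[] = "RSDTtXZPI";

	return state_char[state];
}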
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c8e06703e44c..576d18045811 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/workqueue.h> | 25 | #include <linux/workqueue.h> |
26 | #include <linux/sched/clock.h> | 26 | #include <linux/sched/clock.h> |
27 | #include <linux/sched/debug.h> | 27 | #include <linux/sched/debug.h> |
28 | #include <linux/sched/isolation.h> | ||
28 | 29 | ||
29 | #include <asm/irq_regs.h> | 30 | #include <asm/irq_regs.h> |
30 | #include <linux/kvm_para.h> | 31 | #include <linux/kvm_para.h> |
@@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
774 | 775 | ||
775 | void __init lockup_detector_init(void) | 776 | void __init lockup_detector_init(void) |
776 | { | 777 | { |
777 | #ifdef CONFIG_NO_HZ_FULL | 778 | if (tick_nohz_full_enabled()) |
778 | if (tick_nohz_full_enabled()) { | ||
779 | pr_info("Disabling watchdog on nohz_full cores by default\n"); | 779 | pr_info("Disabling watchdog on nohz_full cores by default\n"); |
780 | cpumask_copy(&watchdog_cpumask, housekeeping_mask); | 780 | |
781 | } else | 781 | cpumask_copy(&watchdog_cpumask, |
782 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | 782 | housekeeping_cpumask(HK_FLAG_TIMER)); |
783 | #else | ||
784 | cpumask_copy(&watchdog_cpumask, cpu_possible_mask); | ||
785 | #endif | ||
786 | 783 | ||
787 | if (!watchdog_nmi_probe()) | 784 | if (!watchdog_nmi_probe()) |
788 | nmi_watchdog_available = true; | 785 | nmi_watchdog_available = true; |
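With the isolation rework, lockup_detector_init() no longer needs a CONFIG_NO_HZ_FULL #ifdef: housekeeping_cpumask(HK_FLAG_TIMER) hands back cpu_possible_mask whenever no CPUs are isolated for timer work, so a single unconditional cpumask_copy() covers both configurations. A paraphrased sketch of that fallback behaviour (the *_sketch names and internal state below are stand-ins; only the accessor and HK_FLAG_TIMER appear in this diff):

#include <linux/cpumask.h>

static struct cpumask housekeeping_mask_sketch;
static unsigned int housekeeping_flags_sketch;	/* e.g. HK_FLAG_TIMER when isolation is active */

/* Hypothetical sketch of housekeeping_cpumask(). */
static const struct cpumask *housekeeping_cpumask_sketch(unsigned int flags)
{
	/*
	 * No isolation configured for this kind of work: every possible
	 * CPU is a housekeeping CPU, which is exactly what the old #else
	 * branch in lockup_detector_init() used to express.
	 */
	if (!(housekeeping_flags_sketch & flags))
		return cpu_possible_mask;

	return &housekeeping_mask_sketch;
}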