diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 4 | ||||
-rw-r--r-- | kernel/sched/autogroup.c (renamed from kernel/sched/auto_group.c) | 0 | ||||
-rw-r--r-- | kernel/sched/autogroup.h (renamed from kernel/sched/auto_group.h) | 0 | ||||
-rw-r--r-- | kernel/sched/clock.c | 158 | ||||
-rw-r--r-- | kernel/sched/completion.c | 10 | ||||
-rw-r--r-- | kernel/sched/core.c | 2333 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 2 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 178 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 13 | ||||
-rw-r--r-- | kernel/sched/debug.c | 4 | ||||
-rw-r--r-- | kernel/sched/fair.c | 94 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 2 | ||||
-rw-r--r-- | kernel/sched/rt.c | 10 | ||||
-rw-r--r-- | kernel/sched/sched.h | 137 | ||||
-rw-r--r-- | kernel/sched/stats.h | 4 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 2 | ||||
-rw-r--r-- | kernel/sched/topology.c | 1658 |
17 files changed, 2406 insertions, 2203 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b832ae2b..89ab6758667b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -18,8 +18,8 @@ endif | |||
18 | obj-y += core.o loadavg.o clock.o cputime.o | 18 | obj-y += core.o loadavg.o clock.o cputime.o |
19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
20 | obj-y += wait.o swait.o completion.o idle.o | 20 | obj-y += wait.o swait.o completion.o idle.o |
21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o |
22 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 22 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
23 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 23 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
25 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | 25 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/autogroup.c index da39489d2d80..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/autogroup.c | |||
diff --git a/kernel/sched/auto_group.h b/kernel/sched/autogroup.h index 890c95f2587a..890c95f2587a 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/autogroup.h | |||
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e85a725e5c34..ad64efe41722 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -77,41 +77,88 @@ EXPORT_SYMBOL_GPL(sched_clock); | |||
77 | 77 | ||
78 | __read_mostly int sched_clock_running; | 78 | __read_mostly int sched_clock_running; |
79 | 79 | ||
80 | void sched_clock_init(void) | ||
81 | { | ||
82 | sched_clock_running = 1; | ||
83 | } | ||
84 | |||
80 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 85 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
81 | static struct static_key __sched_clock_stable = STATIC_KEY_INIT; | 86 | /* |
82 | static int __sched_clock_stable_early; | 87 | * We must start with !__sched_clock_stable because the unstable -> stable |
88 | * transition is accurate, while the stable -> unstable transition is not. | ||
89 | * | ||
90 | * Similarly we start with __sched_clock_stable_early, thereby assuming we | ||
91 | * will become stable, such that there's only a single 1 -> 0 transition. | ||
92 | */ | ||
93 | static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); | ||
94 | static int __sched_clock_stable_early = 1; | ||
83 | 95 | ||
84 | int sched_clock_stable(void) | 96 | /* |
97 | * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset | ||
98 | */ | ||
99 | static __read_mostly u64 raw_offset; | ||
100 | static __read_mostly u64 gtod_offset; | ||
101 | |||
102 | struct sched_clock_data { | ||
103 | u64 tick_raw; | ||
104 | u64 tick_gtod; | ||
105 | u64 clock; | ||
106 | }; | ||
107 | |||
108 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | ||
109 | |||
110 | static inline struct sched_clock_data *this_scd(void) | ||
85 | { | 111 | { |
86 | return static_key_false(&__sched_clock_stable); | 112 | return this_cpu_ptr(&sched_clock_data); |
87 | } | 113 | } |
88 | 114 | ||
89 | static void __set_sched_clock_stable(void) | 115 | static inline struct sched_clock_data *cpu_sdc(int cpu) |
90 | { | 116 | { |
91 | if (!sched_clock_stable()) | 117 | return &per_cpu(sched_clock_data, cpu); |
92 | static_key_slow_inc(&__sched_clock_stable); | 118 | } |
93 | 119 | ||
94 | tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); | 120 | int sched_clock_stable(void) |
121 | { | ||
122 | return static_branch_likely(&__sched_clock_stable); | ||
95 | } | 123 | } |
96 | 124 | ||
97 | void set_sched_clock_stable(void) | 125 | static void __set_sched_clock_stable(void) |
98 | { | 126 | { |
99 | __sched_clock_stable_early = 1; | 127 | struct sched_clock_data *scd = this_scd(); |
100 | 128 | ||
101 | smp_mb(); /* matches sched_clock_init() */ | 129 | /* |
130 | * Attempt to make the (initial) unstable->stable transition continuous. | ||
131 | */ | ||
132 | raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); | ||
102 | 133 | ||
103 | if (!sched_clock_running) | 134 | printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", |
104 | return; | 135 | scd->tick_gtod, gtod_offset, |
136 | scd->tick_raw, raw_offset); | ||
105 | 137 | ||
106 | __set_sched_clock_stable(); | 138 | static_branch_enable(&__sched_clock_stable); |
139 | tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); | ||
107 | } | 140 | } |
108 | 141 | ||
109 | static void __clear_sched_clock_stable(struct work_struct *work) | 142 | static void __clear_sched_clock_stable(struct work_struct *work) |
110 | { | 143 | { |
111 | /* XXX worry about clock continuity */ | 144 | struct sched_clock_data *scd = this_scd(); |
112 | if (sched_clock_stable()) | 145 | |
113 | static_key_slow_dec(&__sched_clock_stable); | 146 | /* |
147 | * Attempt to make the stable->unstable transition continuous. | ||
148 | * | ||
149 | * Trouble is, this is typically called from the TSC watchdog | ||
150 | * timer, which is late per definition. This means the tick | ||
151 | * values can already be screwy. | ||
152 | * | ||
153 | * Still do what we can. | ||
154 | */ | ||
155 | gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); | ||
156 | |||
157 | printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", | ||
158 | scd->tick_gtod, gtod_offset, | ||
159 | scd->tick_raw, raw_offset); | ||
114 | 160 | ||
161 | static_branch_disable(&__sched_clock_stable); | ||
115 | tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); | 162 | tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); |
116 | } | 163 | } |
117 | 164 | ||
@@ -121,47 +168,15 @@ void clear_sched_clock_stable(void) | |||
121 | { | 168 | { |
122 | __sched_clock_stable_early = 0; | 169 | __sched_clock_stable_early = 0; |
123 | 170 | ||
124 | smp_mb(); /* matches sched_clock_init() */ | 171 | smp_mb(); /* matches sched_clock_init_late() */ |
125 | |||
126 | if (!sched_clock_running) | ||
127 | return; | ||
128 | 172 | ||
129 | schedule_work(&sched_clock_work); | 173 | if (sched_clock_running == 2) |
174 | schedule_work(&sched_clock_work); | ||
130 | } | 175 | } |
131 | 176 | ||
132 | struct sched_clock_data { | 177 | void sched_clock_init_late(void) |
133 | u64 tick_raw; | ||
134 | u64 tick_gtod; | ||
135 | u64 clock; | ||
136 | }; | ||
137 | |||
138 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | ||
139 | |||
140 | static inline struct sched_clock_data *this_scd(void) | ||
141 | { | 178 | { |
142 | return this_cpu_ptr(&sched_clock_data); | 179 | sched_clock_running = 2; |
143 | } | ||
144 | |||
145 | static inline struct sched_clock_data *cpu_sdc(int cpu) | ||
146 | { | ||
147 | return &per_cpu(sched_clock_data, cpu); | ||
148 | } | ||
149 | |||
150 | void sched_clock_init(void) | ||
151 | { | ||
152 | u64 ktime_now = ktime_to_ns(ktime_get()); | ||
153 | int cpu; | ||
154 | |||
155 | for_each_possible_cpu(cpu) { | ||
156 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
157 | |||
158 | scd->tick_raw = 0; | ||
159 | scd->tick_gtod = ktime_now; | ||
160 | scd->clock = ktime_now; | ||
161 | } | ||
162 | |||
163 | sched_clock_running = 1; | ||
164 | |||
165 | /* | 180 | /* |
166 | * Ensure that it is impossible to not do a static_key update. | 181 | * Ensure that it is impossible to not do a static_key update. |
167 | * | 182 | * |
@@ -173,8 +188,6 @@ void sched_clock_init(void) | |||
173 | 188 | ||
174 | if (__sched_clock_stable_early) | 189 | if (__sched_clock_stable_early) |
175 | __set_sched_clock_stable(); | 190 | __set_sched_clock_stable(); |
176 | else | ||
177 | __clear_sched_clock_stable(NULL); | ||
178 | } | 191 | } |
179 | 192 | ||
180 | /* | 193 | /* |
@@ -216,7 +229,7 @@ again: | |||
216 | * scd->tick_gtod + TICK_NSEC); | 229 | * scd->tick_gtod + TICK_NSEC); |
217 | */ | 230 | */ |
218 | 231 | ||
219 | clock = scd->tick_gtod + delta; | 232 | clock = scd->tick_gtod + gtod_offset + delta; |
220 | min_clock = wrap_max(scd->tick_gtod, old_clock); | 233 | min_clock = wrap_max(scd->tick_gtod, old_clock); |
221 | max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); | 234 | max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); |
222 | 235 | ||
@@ -302,7 +315,7 @@ u64 sched_clock_cpu(int cpu) | |||
302 | u64 clock; | 315 | u64 clock; |
303 | 316 | ||
304 | if (sched_clock_stable()) | 317 | if (sched_clock_stable()) |
305 | return sched_clock(); | 318 | return sched_clock() + raw_offset; |
306 | 319 | ||
307 | if (unlikely(!sched_clock_running)) | 320 | if (unlikely(!sched_clock_running)) |
308 | return 0ull; | 321 | return 0ull; |
@@ -323,23 +336,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu); | |||
323 | void sched_clock_tick(void) | 336 | void sched_clock_tick(void) |
324 | { | 337 | { |
325 | struct sched_clock_data *scd; | 338 | struct sched_clock_data *scd; |
326 | u64 now, now_gtod; | ||
327 | |||
328 | if (sched_clock_stable()) | ||
329 | return; | ||
330 | |||
331 | if (unlikely(!sched_clock_running)) | ||
332 | return; | ||
333 | 339 | ||
334 | WARN_ON_ONCE(!irqs_disabled()); | 340 | WARN_ON_ONCE(!irqs_disabled()); |
335 | 341 | ||
342 | /* | ||
343 | * Update these values even if sched_clock_stable(), because it can | ||
344 | * become unstable at any point in time at which point we need some | ||
345 | * values to fall back on. | ||
346 | * | ||
347 | * XXX arguably we can skip this if we expose tsc_clocksource_reliable | ||
348 | */ | ||
336 | scd = this_scd(); | 349 | scd = this_scd(); |
337 | now_gtod = ktime_to_ns(ktime_get()); | 350 | scd->tick_raw = sched_clock(); |
338 | now = sched_clock(); | 351 | scd->tick_gtod = ktime_get_ns(); |
339 | 352 | ||
340 | scd->tick_raw = now; | 353 | if (!sched_clock_stable() && likely(sched_clock_running)) |
341 | scd->tick_gtod = now_gtod; | 354 | sched_clock_local(scd); |
342 | sched_clock_local(scd); | ||
343 | } | 355 | } |
344 | 356 | ||
345 | /* | 357 | /* |
@@ -366,11 +378,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
366 | 378 | ||
367 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 379 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
368 | 380 | ||
369 | void sched_clock_init(void) | ||
370 | { | ||
371 | sched_clock_running = 1; | ||
372 | } | ||
373 | |||
374 | u64 sched_clock_cpu(int cpu) | 381 | u64 sched_clock_cpu(int cpu) |
375 | { | 382 | { |
376 | if (unlikely(!sched_clock_running)) | 383 | if (unlikely(!sched_clock_running)) |
@@ -378,6 +385,7 @@ u64 sched_clock_cpu(int cpu) | |||
378 | 385 | ||
379 | return sched_clock(); | 386 | return sched_clock(); |
380 | } | 387 | } |
388 | |||
381 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 389 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
382 | 390 | ||
383 | /* | 391 | /* |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 8d0f35debf35..f063a25d4449 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
@@ -31,7 +31,8 @@ void complete(struct completion *x) | |||
31 | unsigned long flags; | 31 | unsigned long flags; |
32 | 32 | ||
33 | spin_lock_irqsave(&x->wait.lock, flags); | 33 | spin_lock_irqsave(&x->wait.lock, flags); |
34 | x->done++; | 34 | if (x->done != UINT_MAX) |
35 | x->done++; | ||
35 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); | 36 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); |
36 | spin_unlock_irqrestore(&x->wait.lock, flags); | 37 | spin_unlock_irqrestore(&x->wait.lock, flags); |
37 | } | 38 | } |
@@ -51,7 +52,7 @@ void complete_all(struct completion *x) | |||
51 | unsigned long flags; | 52 | unsigned long flags; |
52 | 53 | ||
53 | spin_lock_irqsave(&x->wait.lock, flags); | 54 | spin_lock_irqsave(&x->wait.lock, flags); |
54 | x->done += UINT_MAX/2; | 55 | x->done = UINT_MAX; |
55 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); | 56 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); |
56 | spin_unlock_irqrestore(&x->wait.lock, flags); | 57 | spin_unlock_irqrestore(&x->wait.lock, flags); |
57 | } | 58 | } |
@@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x, | |||
79 | if (!x->done) | 80 | if (!x->done) |
80 | return timeout; | 81 | return timeout; |
81 | } | 82 | } |
82 | x->done--; | 83 | if (x->done != UINT_MAX) |
84 | x->done--; | ||
83 | return timeout ?: 1; | 85 | return timeout ?: 1; |
84 | } | 86 | } |
85 | 87 | ||
@@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x) | |||
280 | spin_lock_irqsave(&x->wait.lock, flags); | 282 | spin_lock_irqsave(&x->wait.lock, flags); |
281 | if (!x->done) | 283 | if (!x->done) |
282 | ret = 0; | 284 | ret = 0; |
283 | else | 285 | else if (x->done != UINT_MAX) |
284 | x->done--; | 286 | x->done--; |
285 | spin_unlock_irqrestore(&x->wait.lock, flags); | 287 | spin_unlock_irqrestore(&x->wait.lock, flags); |
286 | return ret; | 288 | return ret; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..34e2291a9a6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1,88 +1,28 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/core.c | 2 | * kernel/sched/core.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Core kernel scheduler code and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | ||
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | ||
9 | * make semaphores SMP safe | ||
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | ||
11 | * by Andrea Arcangeli | ||
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | ||
13 | * hybrid priority-list and round-robin design with | ||
14 | * an array-switch method of distributing timeslices | ||
15 | * and per-CPU runqueues. Cleanups and useful suggestions | ||
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | ||
20 | * fair scheduling design by Con Kolivas. | ||
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | ||
22 | * by Peter Williams | ||
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | ||
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | ||
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
27 | */ | 7 | */ |
28 | 8 | #include <linux/sched.h> | |
29 | #include <linux/kasan.h> | ||
30 | #include <linux/mm.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/nmi.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/uaccess.h> | ||
35 | #include <linux/highmem.h> | ||
36 | #include <linux/mmu_context.h> | ||
37 | #include <linux/interrupt.h> | ||
38 | #include <linux/capability.h> | ||
39 | #include <linux/completion.h> | ||
40 | #include <linux/kernel_stat.h> | ||
41 | #include <linux/debug_locks.h> | ||
42 | #include <linux/perf_event.h> | ||
43 | #include <linux/security.h> | ||
44 | #include <linux/notifier.h> | ||
45 | #include <linux/profile.h> | ||
46 | #include <linux/freezer.h> | ||
47 | #include <linux/vmalloc.h> | ||
48 | #include <linux/blkdev.h> | ||
49 | #include <linux/delay.h> | ||
50 | #include <linux/pid_namespace.h> | ||
51 | #include <linux/smp.h> | ||
52 | #include <linux/threads.h> | ||
53 | #include <linux/timer.h> | ||
54 | #include <linux/rcupdate.h> | ||
55 | #include <linux/cpu.h> | ||
56 | #include <linux/cpuset.h> | 9 | #include <linux/cpuset.h> |
57 | #include <linux/percpu.h> | ||
58 | #include <linux/proc_fs.h> | ||
59 | #include <linux/seq_file.h> | ||
60 | #include <linux/sysctl.h> | ||
61 | #include <linux/syscalls.h> | ||
62 | #include <linux/times.h> | ||
63 | #include <linux/tsacct_kern.h> | ||
64 | #include <linux/kprobes.h> | ||
65 | #include <linux/delayacct.h> | 10 | #include <linux/delayacct.h> |
66 | #include <linux/unistd.h> | ||
67 | #include <linux/pagemap.h> | ||
68 | #include <linux/hrtimer.h> | ||
69 | #include <linux/tick.h> | ||
70 | #include <linux/ctype.h> | ||
71 | #include <linux/ftrace.h> | ||
72 | #include <linux/slab.h> | ||
73 | #include <linux/init_task.h> | 11 | #include <linux/init_task.h> |
74 | #include <linux/context_tracking.h> | 12 | #include <linux/context_tracking.h> |
75 | #include <linux/compiler.h> | 13 | |
76 | #include <linux/frame.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/kprobes.h> | ||
16 | #include <linux/mmu_context.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/nmi.h> | ||
77 | #include <linux/prefetch.h> | 19 | #include <linux/prefetch.h> |
78 | #include <linux/mutex.h> | 20 | #include <linux/profile.h> |
21 | #include <linux/security.h> | ||
22 | #include <linux/syscalls.h> | ||
79 | 23 | ||
80 | #include <asm/switch_to.h> | 24 | #include <asm/switch_to.h> |
81 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
82 | #include <asm/irq_regs.h> | ||
83 | #ifdef CONFIG_PARAVIRT | ||
84 | #include <asm/paravirt.h> | ||
85 | #endif | ||
86 | 26 | ||
87 | #include "sched.h" | 27 | #include "sched.h" |
88 | #include "../workqueue_internal.h" | 28 | #include "../workqueue_internal.h" |
@@ -91,27 +31,8 @@ | |||
91 | #define CREATE_TRACE_POINTS | 31 | #define CREATE_TRACE_POINTS |
92 | #include <trace/events/sched.h> | 32 | #include <trace/events/sched.h> |
93 | 33 | ||
94 | DEFINE_MUTEX(sched_domains_mutex); | ||
95 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 34 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
96 | 35 | ||
97 | static void update_rq_clock_task(struct rq *rq, s64 delta); | ||
98 | |||
99 | void update_rq_clock(struct rq *rq) | ||
100 | { | ||
101 | s64 delta; | ||
102 | |||
103 | lockdep_assert_held(&rq->lock); | ||
104 | |||
105 | if (rq->clock_skip_update & RQCF_ACT_SKIP) | ||
106 | return; | ||
107 | |||
108 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
109 | if (delta < 0) | ||
110 | return; | ||
111 | rq->clock += delta; | ||
112 | update_rq_clock_task(rq, delta); | ||
113 | } | ||
114 | |||
115 | /* | 36 | /* |
116 | * Debugging: various feature bits | 37 | * Debugging: various feature bits |
117 | */ | 38 | */ |
@@ -140,7 +61,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
140 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | 61 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; |
141 | 62 | ||
142 | /* | 63 | /* |
143 | * period over which we measure -rt task cpu usage in us. | 64 | * period over which we measure -rt task CPU usage in us. |
144 | * default: 1s | 65 | * default: 1s |
145 | */ | 66 | */ |
146 | unsigned int sysctl_sched_rt_period = 1000000; | 67 | unsigned int sysctl_sched_rt_period = 1000000; |
@@ -153,7 +74,7 @@ __read_mostly int scheduler_running; | |||
153 | */ | 74 | */ |
154 | int sysctl_sched_rt_runtime = 950000; | 75 | int sysctl_sched_rt_runtime = 950000; |
155 | 76 | ||
156 | /* cpus with isolated domains */ | 77 | /* CPUs with isolated domains */ |
157 | cpumask_var_t cpu_isolated_map; | 78 | cpumask_var_t cpu_isolated_map; |
158 | 79 | ||
159 | /* | 80 | /* |
@@ -185,7 +106,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
185 | rq = task_rq(p); | 106 | rq = task_rq(p); |
186 | raw_spin_lock(&rq->lock); | 107 | raw_spin_lock(&rq->lock); |
187 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 108 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
188 | rf->cookie = lockdep_pin_lock(&rq->lock); | 109 | rq_pin_lock(rq, rf); |
189 | return rq; | 110 | return rq; |
190 | } | 111 | } |
191 | raw_spin_unlock(&rq->lock); | 112 | raw_spin_unlock(&rq->lock); |
@@ -221,11 +142,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
221 | * If we observe the old cpu in task_rq_lock, the acquire of | 142 | * If we observe the old cpu in task_rq_lock, the acquire of |
222 | * the old rq->lock will fully serialize against the stores. | 143 | * the old rq->lock will fully serialize against the stores. |
223 | * | 144 | * |
224 | * If we observe the new cpu in task_rq_lock, the acquire will | 145 | * If we observe the new CPU in task_rq_lock, the acquire will |
225 | * pair with the WMB to ensure we must then also see migrating. | 146 | * pair with the WMB to ensure we must then also see migrating. |
226 | */ | 147 | */ |
227 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 148 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
228 | rf->cookie = lockdep_pin_lock(&rq->lock); | 149 | rq_pin_lock(rq, rf); |
229 | return rq; | 150 | return rq; |
230 | } | 151 | } |
231 | raw_spin_unlock(&rq->lock); | 152 | raw_spin_unlock(&rq->lock); |
@@ -236,6 +157,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
236 | } | 157 | } |
237 | } | 158 | } |
238 | 159 | ||
160 | /* | ||
161 | * RQ-clock updating methods: | ||
162 | */ | ||
163 | |||
164 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
165 | { | ||
166 | /* | ||
167 | * In theory, the compile should just see 0 here, and optimize out the call | ||
168 | * to sched_rt_avg_update. But I don't trust it... | ||
169 | */ | ||
170 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
171 | s64 steal = 0, irq_delta = 0; | ||
172 | #endif | ||
173 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
174 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
175 | |||
176 | /* | ||
177 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
178 | * this case when a previous update_rq_clock() happened inside a | ||
179 | * {soft,}irq region. | ||
180 | * | ||
181 | * When this happens, we stop ->clock_task and only update the | ||
182 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
183 | * update will consume the rest. This ensures ->clock_task is | ||
184 | * monotonic. | ||
185 | * | ||
186 | * It does however cause some slight miss-attribution of {soft,}irq | ||
187 | * time, a more accurate solution would be to update the irq_time using | ||
188 | * the current rq->clock timestamp, except that would require using | ||
189 | * atomic ops. | ||
190 | */ | ||
191 | if (irq_delta > delta) | ||
192 | irq_delta = delta; | ||
193 | |||
194 | rq->prev_irq_time += irq_delta; | ||
195 | delta -= irq_delta; | ||
196 | #endif | ||
197 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
198 | if (static_key_false((¶virt_steal_rq_enabled))) { | ||
199 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
200 | steal -= rq->prev_steal_time_rq; | ||
201 | |||
202 | if (unlikely(steal > delta)) | ||
203 | steal = delta; | ||
204 | |||
205 | rq->prev_steal_time_rq += steal; | ||
206 | delta -= steal; | ||
207 | } | ||
208 | #endif | ||
209 | |||
210 | rq->clock_task += delta; | ||
211 | |||
212 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
213 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | ||
214 | sched_rt_avg_update(rq, irq_delta + steal); | ||
215 | #endif | ||
216 | } | ||
217 | |||
218 | void update_rq_clock(struct rq *rq) | ||
219 | { | ||
220 | s64 delta; | ||
221 | |||
222 | lockdep_assert_held(&rq->lock); | ||
223 | |||
224 | if (rq->clock_update_flags & RQCF_ACT_SKIP) | ||
225 | return; | ||
226 | |||
227 | #ifdef CONFIG_SCHED_DEBUG | ||
228 | rq->clock_update_flags |= RQCF_UPDATED; | ||
229 | #endif | ||
230 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
231 | if (delta < 0) | ||
232 | return; | ||
233 | rq->clock += delta; | ||
234 | update_rq_clock_task(rq, delta); | ||
235 | } | ||
236 | |||
237 | |||
239 | #ifdef CONFIG_SCHED_HRTICK | 238 | #ifdef CONFIG_SCHED_HRTICK |
240 | /* | 239 | /* |
241 | * Use HR-timers to deliver accurate preemption points. | 240 | * Use HR-timers to deliver accurate preemption points. |
@@ -458,7 +457,7 @@ void wake_up_q(struct wake_q_head *head) | |||
458 | 457 | ||
459 | task = container_of(node, struct task_struct, wake_q); | 458 | task = container_of(node, struct task_struct, wake_q); |
460 | BUG_ON(!task); | 459 | BUG_ON(!task); |
461 | /* task can safely be re-inserted now */ | 460 | /* Task can safely be re-inserted now: */ |
462 | node = node->next; | 461 | node = node->next; |
463 | task->wake_q.next = NULL; | 462 | task->wake_q.next = NULL; |
464 | 463 | ||
@@ -516,12 +515,12 @@ void resched_cpu(int cpu) | |||
516 | #ifdef CONFIG_SMP | 515 | #ifdef CONFIG_SMP |
517 | #ifdef CONFIG_NO_HZ_COMMON | 516 | #ifdef CONFIG_NO_HZ_COMMON |
518 | /* | 517 | /* |
519 | * In the semi idle case, use the nearest busy cpu for migrating timers | 518 | * In the semi idle case, use the nearest busy CPU for migrating timers |
520 | * from an idle cpu. This is good for power-savings. | 519 | * from an idle CPU. This is good for power-savings. |
521 | * | 520 | * |
522 | * We don't do similar optimization for completely idle system, as | 521 | * We don't do similar optimization for completely idle system, as |
523 | * selecting an idle cpu will add more delays to the timers than intended | 522 | * selecting an idle CPU will add more delays to the timers than intended |
524 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | 523 | * (as that CPU's timer base may not be uptodate wrt jiffies etc). |
525 | */ | 524 | */ |
526 | int get_nohz_timer_target(void) | 525 | int get_nohz_timer_target(void) |
527 | { | 526 | { |
@@ -550,6 +549,7 @@ unlock: | |||
550 | rcu_read_unlock(); | 549 | rcu_read_unlock(); |
551 | return cpu; | 550 | return cpu; |
552 | } | 551 | } |
552 | |||
553 | /* | 553 | /* |
554 | * When add_timer_on() enqueues a timer into the timer wheel of an | 554 | * When add_timer_on() enqueues a timer into the timer wheel of an |
555 | * idle CPU then this timer might expire before the next timer event | 555 | * idle CPU then this timer might expire before the next timer event |
@@ -784,60 +784,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
784 | dequeue_task(rq, p, flags); | 784 | dequeue_task(rq, p, flags); |
785 | } | 785 | } |
786 | 786 | ||
787 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
788 | { | ||
789 | /* | ||
790 | * In theory, the compile should just see 0 here, and optimize out the call | ||
791 | * to sched_rt_avg_update. But I don't trust it... | ||
792 | */ | ||
793 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
794 | s64 steal = 0, irq_delta = 0; | ||
795 | #endif | ||
796 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
797 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
798 | |||
799 | /* | ||
800 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
801 | * this case when a previous update_rq_clock() happened inside a | ||
802 | * {soft,}irq region. | ||
803 | * | ||
804 | * When this happens, we stop ->clock_task and only update the | ||
805 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
806 | * update will consume the rest. This ensures ->clock_task is | ||
807 | * monotonic. | ||
808 | * | ||
809 | * It does however cause some slight miss-attribution of {soft,}irq | ||
810 | * time, a more accurate solution would be to update the irq_time using | ||
811 | * the current rq->clock timestamp, except that would require using | ||
812 | * atomic ops. | ||
813 | */ | ||
814 | if (irq_delta > delta) | ||
815 | irq_delta = delta; | ||
816 | |||
817 | rq->prev_irq_time += irq_delta; | ||
818 | delta -= irq_delta; | ||
819 | #endif | ||
820 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
821 | if (static_key_false((¶virt_steal_rq_enabled))) { | ||
822 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
823 | steal -= rq->prev_steal_time_rq; | ||
824 | |||
825 | if (unlikely(steal > delta)) | ||
826 | steal = delta; | ||
827 | |||
828 | rq->prev_steal_time_rq += steal; | ||
829 | delta -= steal; | ||
830 | } | ||
831 | #endif | ||
832 | |||
833 | rq->clock_task += delta; | ||
834 | |||
835 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
836 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | ||
837 | sched_rt_avg_update(rq, irq_delta + steal); | ||
838 | #endif | ||
839 | } | ||
840 | |||
841 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 787 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
842 | { | 788 | { |
843 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 789 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -1018,7 +964,7 @@ struct migration_arg { | |||
1018 | }; | 964 | }; |
1019 | 965 | ||
1020 | /* | 966 | /* |
1021 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 967 | * Move (not current) task off this CPU, onto the destination CPU. We're doing |
1022 | * this because either it can't run here any more (set_cpus_allowed() | 968 | * this because either it can't run here any more (set_cpus_allowed() |
1023 | * away from this CPU, or CPU going down), or because we're | 969 | * away from this CPU, or CPU going down), or because we're |
1024 | * attempting to rebalance this task on exec (sched_exec). | 970 | * attempting to rebalance this task on exec (sched_exec). |
@@ -1052,8 +998,8 @@ static int migration_cpu_stop(void *data) | |||
1052 | struct rq *rq = this_rq(); | 998 | struct rq *rq = this_rq(); |
1053 | 999 | ||
1054 | /* | 1000 | /* |
1055 | * The original target cpu might have gone down and we might | 1001 | * The original target CPU might have gone down and we might |
1056 | * be on another cpu but it doesn't matter. | 1002 | * be on another CPU but it doesn't matter. |
1057 | */ | 1003 | */ |
1058 | local_irq_disable(); | 1004 | local_irq_disable(); |
1059 | /* | 1005 | /* |
@@ -1171,7 +1117,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1171 | if (p->flags & PF_KTHREAD) { | 1117 | if (p->flags & PF_KTHREAD) { |
1172 | /* | 1118 | /* |
1173 | * For kernel threads that do indeed end up on online && | 1119 | * For kernel threads that do indeed end up on online && |
1174 | * !active we want to ensure they are strict per-cpu threads. | 1120 | * !active we want to ensure they are strict per-CPU threads. |
1175 | */ | 1121 | */ |
1176 | WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && | 1122 | WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && |
1177 | !cpumask_intersects(new_mask, cpu_active_mask) && | 1123 | !cpumask_intersects(new_mask, cpu_active_mask) && |
@@ -1195,9 +1141,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1195 | * OK, since we're going to drop the lock immediately | 1141 | * OK, since we're going to drop the lock immediately |
1196 | * afterwards anyway. | 1142 | * afterwards anyway. |
1197 | */ | 1143 | */ |
1198 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 1144 | rq_unpin_lock(rq, &rf); |
1199 | rq = move_queued_task(rq, p, dest_cpu); | 1145 | rq = move_queued_task(rq, p, dest_cpu); |
1200 | lockdep_repin_lock(&rq->lock, rf.cookie); | 1146 | rq_repin_lock(rq, &rf); |
1201 | } | 1147 | } |
1202 | out: | 1148 | out: |
1203 | task_rq_unlock(rq, p, &rf); | 1149 | task_rq_unlock(rq, p, &rf); |
@@ -1276,7 +1222,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1276 | /* | 1222 | /* |
1277 | * Task isn't running anymore; make it appear like we migrated | 1223 | * Task isn't running anymore; make it appear like we migrated |
1278 | * it before it went to sleep. This means on wakeup we make the | 1224 | * it before it went to sleep. This means on wakeup we make the |
1279 | * previous cpu our target instead of where it really is. | 1225 | * previous CPU our target instead of where it really is. |
1280 | */ | 1226 | */ |
1281 | p->wake_cpu = cpu; | 1227 | p->wake_cpu = cpu; |
1282 | } | 1228 | } |
@@ -1508,12 +1454,12 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1508 | * | 1454 | * |
1509 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1455 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, |
1510 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1456 | * see __set_cpus_allowed_ptr(). At this point the newly online |
1511 | * cpu isn't yet part of the sched domains, and balancing will not | 1457 | * CPU isn't yet part of the sched domains, and balancing will not |
1512 | * see it. | 1458 | * see it. |
1513 | * | 1459 | * |
1514 | * - on cpu-down we clear cpu_active() to mask the sched domains and | 1460 | * - on CPU-down we clear cpu_active() to mask the sched domains and |
1515 | * avoid the load balancer to place new tasks on the to be removed | 1461 | * avoid the load balancer to place new tasks on the to be removed |
1516 | * cpu. Existing tasks will remain running there and will be taken | 1462 | * CPU. Existing tasks will remain running there and will be taken |
1517 | * off. | 1463 | * off. |
1518 | * | 1464 | * |
1519 | * This means that fallback selection must not select !active CPUs. | 1465 | * This means that fallback selection must not select !active CPUs. |
@@ -1529,9 +1475,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1529 | int dest_cpu; | 1475 | int dest_cpu; |
1530 | 1476 | ||
1531 | /* | 1477 | /* |
1532 | * If the node that the cpu is on has been offlined, cpu_to_node() | 1478 | * If the node that the CPU is on has been offlined, cpu_to_node() |
1533 | * will return -1. There is no cpu on the node, and we should | 1479 | * will return -1. There is no CPU on the node, and we should |
1534 | * select the cpu on the other node. | 1480 | * select the CPU on the other node. |
1535 | */ | 1481 | */ |
1536 | if (nid != -1) { | 1482 | if (nid != -1) { |
1537 | nodemask = cpumask_of_node(nid); | 1483 | nodemask = cpumask_of_node(nid); |
@@ -1563,7 +1509,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1563 | state = possible; | 1509 | state = possible; |
1564 | break; | 1510 | break; |
1565 | } | 1511 | } |
1566 | /* fall-through */ | 1512 | /* Fall-through */ |
1567 | case possible: | 1513 | case possible: |
1568 | do_set_cpus_allowed(p, cpu_possible_mask); | 1514 | do_set_cpus_allowed(p, cpu_possible_mask); |
1569 | state = fail; | 1515 | state = fail; |
@@ -1607,7 +1553,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | |||
1607 | /* | 1553 | /* |
1608 | * In order not to call set_task_cpu() on a blocking task we need | 1554 | * In order not to call set_task_cpu() on a blocking task we need |
1609 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | 1555 | * to rely on ttwu() to place the task on a valid ->cpus_allowed |
1610 | * cpu. | 1556 | * CPU. |
1611 | * | 1557 | * |
1612 | * Since this is common to all placement strategies, this lives here. | 1558 | * Since this is common to all placement strategies, this lives here. |
1613 | * | 1559 | * |
@@ -1681,7 +1627,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl | |||
1681 | activate_task(rq, p, en_flags); | 1627 | activate_task(rq, p, en_flags); |
1682 | p->on_rq = TASK_ON_RQ_QUEUED; | 1628 | p->on_rq = TASK_ON_RQ_QUEUED; |
1683 | 1629 | ||
1684 | /* if a worker is waking up, notify workqueue */ | 1630 | /* If a worker is waking up, notify the workqueue: */ |
1685 | if (p->flags & PF_WQ_WORKER) | 1631 | if (p->flags & PF_WQ_WORKER) |
1686 | wq_worker_waking_up(p, cpu_of(rq)); | 1632 | wq_worker_waking_up(p, cpu_of(rq)); |
1687 | } | 1633 | } |
@@ -1690,7 +1636,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl | |||
1690 | * Mark the task runnable and perform wakeup-preemption. | 1636 | * Mark the task runnable and perform wakeup-preemption. |
1691 | */ | 1637 | */ |
1692 | static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | 1638 | static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, |
1693 | struct pin_cookie cookie) | 1639 | struct rq_flags *rf) |
1694 | { | 1640 | { |
1695 | check_preempt_curr(rq, p, wake_flags); | 1641 | check_preempt_curr(rq, p, wake_flags); |
1696 | p->state = TASK_RUNNING; | 1642 | p->state = TASK_RUNNING; |
@@ -1702,9 +1648,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1702 | * Our task @p is fully woken up and running; so its safe to | 1648 | * Our task @p is fully woken up and running; so its safe to |
1703 | * drop the rq->lock, hereafter rq is only used for statistics. | 1649 | * drop the rq->lock, hereafter rq is only used for statistics. |
1704 | */ | 1650 | */ |
1705 | lockdep_unpin_lock(&rq->lock, cookie); | 1651 | rq_unpin_lock(rq, rf); |
1706 | p->sched_class->task_woken(rq, p); | 1652 | p->sched_class->task_woken(rq, p); |
1707 | lockdep_repin_lock(&rq->lock, cookie); | 1653 | rq_repin_lock(rq, rf); |
1708 | } | 1654 | } |
1709 | 1655 | ||
1710 | if (rq->idle_stamp) { | 1656 | if (rq->idle_stamp) { |
@@ -1723,7 +1669,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1723 | 1669 | ||
1724 | static void | 1670 | static void |
1725 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | 1671 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, |
1726 | struct pin_cookie cookie) | 1672 | struct rq_flags *rf) |
1727 | { | 1673 | { |
1728 | int en_flags = ENQUEUE_WAKEUP; | 1674 | int en_flags = ENQUEUE_WAKEUP; |
1729 | 1675 | ||
@@ -1738,7 +1684,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1738 | #endif | 1684 | #endif |
1739 | 1685 | ||
1740 | ttwu_activate(rq, p, en_flags); | 1686 | ttwu_activate(rq, p, en_flags); |
1741 | ttwu_do_wakeup(rq, p, wake_flags, cookie); | 1687 | ttwu_do_wakeup(rq, p, wake_flags, rf); |
1742 | } | 1688 | } |
1743 | 1689 | ||
1744 | /* | 1690 | /* |
@@ -1757,7 +1703,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1757 | if (task_on_rq_queued(p)) { | 1703 | if (task_on_rq_queued(p)) { |
1758 | /* check_preempt_curr() may use rq clock */ | 1704 | /* check_preempt_curr() may use rq clock */ |
1759 | update_rq_clock(rq); | 1705 | update_rq_clock(rq); |
1760 | ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); | 1706 | ttwu_do_wakeup(rq, p, wake_flags, &rf); |
1761 | ret = 1; | 1707 | ret = 1; |
1762 | } | 1708 | } |
1763 | __task_rq_unlock(rq, &rf); | 1709 | __task_rq_unlock(rq, &rf); |
@@ -1770,15 +1716,15 @@ void sched_ttwu_pending(void) | |||
1770 | { | 1716 | { |
1771 | struct rq *rq = this_rq(); | 1717 | struct rq *rq = this_rq(); |
1772 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1718 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
1773 | struct pin_cookie cookie; | ||
1774 | struct task_struct *p; | 1719 | struct task_struct *p; |
1775 | unsigned long flags; | 1720 | unsigned long flags; |
1721 | struct rq_flags rf; | ||
1776 | 1722 | ||
1777 | if (!llist) | 1723 | if (!llist) |
1778 | return; | 1724 | return; |
1779 | 1725 | ||
1780 | raw_spin_lock_irqsave(&rq->lock, flags); | 1726 | raw_spin_lock_irqsave(&rq->lock, flags); |
1781 | cookie = lockdep_pin_lock(&rq->lock); | 1727 | rq_pin_lock(rq, &rf); |
1782 | 1728 | ||
1783 | while (llist) { | 1729 | while (llist) { |
1784 | int wake_flags = 0; | 1730 | int wake_flags = 0; |
@@ -1789,10 +1735,10 @@ void sched_ttwu_pending(void) | |||
1789 | if (p->sched_remote_wakeup) | 1735 | if (p->sched_remote_wakeup) |
1790 | wake_flags = WF_MIGRATED; | 1736 | wake_flags = WF_MIGRATED; |
1791 | 1737 | ||
1792 | ttwu_do_activate(rq, p, wake_flags, cookie); | 1738 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1793 | } | 1739 | } |
1794 | 1740 | ||
1795 | lockdep_unpin_lock(&rq->lock, cookie); | 1741 | rq_unpin_lock(rq, &rf); |
1796 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1742 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1797 | } | 1743 | } |
1798 | 1744 | ||
@@ -1864,7 +1810,7 @@ void wake_up_if_idle(int cpu) | |||
1864 | raw_spin_lock_irqsave(&rq->lock, flags); | 1810 | raw_spin_lock_irqsave(&rq->lock, flags); |
1865 | if (is_idle_task(rq->curr)) | 1811 | if (is_idle_task(rq->curr)) |
1866 | smp_send_reschedule(cpu); | 1812 | smp_send_reschedule(cpu); |
1867 | /* Else cpu is not in idle, do nothing here */ | 1813 | /* Else CPU is not idle, do nothing here: */ |
1868 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1814 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1869 | } | 1815 | } |
1870 | 1816 | ||
@@ -1881,20 +1827,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) | |||
1881 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | 1827 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) |
1882 | { | 1828 | { |
1883 | struct rq *rq = cpu_rq(cpu); | 1829 | struct rq *rq = cpu_rq(cpu); |
1884 | struct pin_cookie cookie; | 1830 | struct rq_flags rf; |
1885 | 1831 | ||
1886 | #if defined(CONFIG_SMP) | 1832 | #if defined(CONFIG_SMP) |
1887 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { | 1833 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { |
1888 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1834 | sched_clock_cpu(cpu); /* Sync clocks across CPUs */ |
1889 | ttwu_queue_remote(p, cpu, wake_flags); | 1835 | ttwu_queue_remote(p, cpu, wake_flags); |
1890 | return; | 1836 | return; |
1891 | } | 1837 | } |
1892 | #endif | 1838 | #endif |
1893 | 1839 | ||
1894 | raw_spin_lock(&rq->lock); | 1840 | raw_spin_lock(&rq->lock); |
1895 | cookie = lockdep_pin_lock(&rq->lock); | 1841 | rq_pin_lock(rq, &rf); |
1896 | ttwu_do_activate(rq, p, wake_flags, cookie); | 1842 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1897 | lockdep_unpin_lock(&rq->lock, cookie); | 1843 | rq_unpin_lock(rq, &rf); |
1898 | raw_spin_unlock(&rq->lock); | 1844 | raw_spin_unlock(&rq->lock); |
1899 | } | 1845 | } |
1900 | 1846 | ||
@@ -1904,8 +1850,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1904 | * MIGRATION | 1850 | * MIGRATION |
1905 | * | 1851 | * |
1906 | * The basic program-order guarantee on SMP systems is that when a task [t] | 1852 | * The basic program-order guarantee on SMP systems is that when a task [t] |
1907 | * migrates, all its activity on its old cpu [c0] happens-before any subsequent | 1853 | * migrates, all its activity on its old CPU [c0] happens-before any subsequent |
1908 | * execution on its new cpu [c1]. | 1854 | * execution on its new CPU [c1]. |
1909 | * | 1855 | * |
1910 | * For migration (of runnable tasks) this is provided by the following means: | 1856 | * For migration (of runnable tasks) this is provided by the following means: |
1911 | * | 1857 | * |
@@ -1916,7 +1862,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1916 | * | 1862 | * |
1917 | * Transitivity guarantees that B happens after A and C after B. | 1863 | * Transitivity guarantees that B happens after A and C after B. |
1918 | * Note: we only require RCpc transitivity. | 1864 | * Note: we only require RCpc transitivity. |
1919 | * Note: the cpu doing B need not be c0 or c1 | 1865 | * Note: the CPU doing B need not be c0 or c1 |
1920 | * | 1866 | * |
1921 | * Example: | 1867 | * Example: |
1922 | * | 1868 | * |
@@ -2024,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2024 | 1970 | ||
2025 | trace_sched_waking(p); | 1971 | trace_sched_waking(p); |
2026 | 1972 | ||
2027 | success = 1; /* we're going to change ->state */ | 1973 | /* We're going to change ->state: */ |
1974 | success = 1; | ||
2028 | cpu = task_cpu(p); | 1975 | cpu = task_cpu(p); |
2029 | 1976 | ||
2030 | /* | 1977 | /* |
@@ -2073,7 +2020,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2073 | smp_rmb(); | 2020 | smp_rmb(); |
2074 | 2021 | ||
2075 | /* | 2022 | /* |
2076 | * If the owning (remote) cpu is still in the middle of schedule() with | 2023 | * If the owning (remote) CPU is still in the middle of schedule() with |
2077 | * this task as prev, wait until its done referencing the task. | 2024 | * this task as prev, wait until its done referencing the task. |
2078 | * | 2025 | * |
2079 | * Pairs with the smp_store_release() in finish_lock_switch(). | 2026 | * Pairs with the smp_store_release() in finish_lock_switch(). |
@@ -2086,11 +2033,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2086 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2033 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2087 | p->state = TASK_WAKING; | 2034 | p->state = TASK_WAKING; |
2088 | 2035 | ||
2036 | if (p->in_iowait) { | ||
2037 | delayacct_blkio_end(); | ||
2038 | atomic_dec(&task_rq(p)->nr_iowait); | ||
2039 | } | ||
2040 | |||
2089 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); | 2041 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
2090 | if (task_cpu(p) != cpu) { | 2042 | if (task_cpu(p) != cpu) { |
2091 | wake_flags |= WF_MIGRATED; | 2043 | wake_flags |= WF_MIGRATED; |
2092 | set_task_cpu(p, cpu); | 2044 | set_task_cpu(p, cpu); |
2093 | } | 2045 | } |
2046 | |||
2047 | #else /* CONFIG_SMP */ | ||
2048 | |||
2049 | if (p->in_iowait) { | ||
2050 | delayacct_blkio_end(); | ||
2051 | atomic_dec(&task_rq(p)->nr_iowait); | ||
2052 | } | ||
2053 | |||
2094 | #endif /* CONFIG_SMP */ | 2054 | #endif /* CONFIG_SMP */ |
2095 | 2055 | ||
2096 | ttwu_queue(p, cpu, wake_flags); | 2056 | ttwu_queue(p, cpu, wake_flags); |
@@ -2111,7 +2071,7 @@ out: | |||
2111 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2071 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2112 | * the current task. | 2072 | * the current task. |
2113 | */ | 2073 | */ |
2114 | static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) | 2074 | static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) |
2115 | { | 2075 | { |
2116 | struct rq *rq = task_rq(p); | 2076 | struct rq *rq = task_rq(p); |
2117 | 2077 | ||
@@ -2128,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2128 | * disabled avoiding further scheduler activity on it and we've | 2088 | * disabled avoiding further scheduler activity on it and we've |
2129 | * not yet picked a replacement task. | 2089 | * not yet picked a replacement task. |
2130 | */ | 2090 | */ |
2131 | lockdep_unpin_lock(&rq->lock, cookie); | 2091 | rq_unpin_lock(rq, rf); |
2132 | raw_spin_unlock(&rq->lock); | 2092 | raw_spin_unlock(&rq->lock); |
2133 | raw_spin_lock(&p->pi_lock); | 2093 | raw_spin_lock(&p->pi_lock); |
2134 | raw_spin_lock(&rq->lock); | 2094 | raw_spin_lock(&rq->lock); |
2135 | lockdep_repin_lock(&rq->lock, cookie); | 2095 | rq_repin_lock(rq, rf); |
2136 | } | 2096 | } |
2137 | 2097 | ||
2138 | if (!(p->state & TASK_NORMAL)) | 2098 | if (!(p->state & TASK_NORMAL)) |
@@ -2140,10 +2100,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2140 | 2100 | ||
2141 | trace_sched_waking(p); | 2101 | trace_sched_waking(p); |
2142 | 2102 | ||
2143 | if (!task_on_rq_queued(p)) | 2103 | if (!task_on_rq_queued(p)) { |
2104 | if (p->in_iowait) { | ||
2105 | delayacct_blkio_end(); | ||
2106 | atomic_dec(&rq->nr_iowait); | ||
2107 | } | ||
2144 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2108 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2109 | } | ||
2145 | 2110 | ||
2146 | ttwu_do_wakeup(rq, p, 0, cookie); | 2111 | ttwu_do_wakeup(rq, p, 0, rf); |
2147 | ttwu_stat(p, smp_processor_id(), 0); | 2112 | ttwu_stat(p, smp_processor_id(), 0); |
2148 | out: | 2113 | out: |
2149 | raw_spin_unlock(&p->pi_lock); | 2114 | raw_spin_unlock(&p->pi_lock); |
@@ -2427,7 +2392,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2427 | */ | 2392 | */ |
2428 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2393 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2429 | /* | 2394 | /* |
2430 | * We're setting the cpu for the first time, we don't migrate, | 2395 | * We're setting the CPU for the first time, we don't migrate, |
2431 | * so use __set_task_cpu(). | 2396 | * so use __set_task_cpu(). |
2432 | */ | 2397 | */ |
2433 | __set_task_cpu(p, cpu); | 2398 | __set_task_cpu(p, cpu); |
@@ -2570,7 +2535,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2570 | /* | 2535 | /* |
2571 | * Fork balancing, do it here and not earlier because: | 2536 | * Fork balancing, do it here and not earlier because: |
2572 | * - cpus_allowed can change in the fork path | 2537 | * - cpus_allowed can change in the fork path |
2573 | * - any previously selected cpu might disappear through hotplug | 2538 | * - any previously selected CPU might disappear through hotplug |
2574 | * | 2539 | * |
2575 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | 2540 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, |
2576 | * as we're not fully set-up yet. | 2541 | * as we're not fully set-up yet. |
@@ -2578,6 +2543,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2578 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2543 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2579 | #endif | 2544 | #endif |
2580 | rq = __task_rq_lock(p, &rf); | 2545 | rq = __task_rq_lock(p, &rf); |
2546 | update_rq_clock(rq); | ||
2581 | post_init_entity_util_avg(&p->se); | 2547 | post_init_entity_util_avg(&p->se); |
2582 | 2548 | ||
2583 | activate_task(rq, p, 0); | 2549 | activate_task(rq, p, 0); |
@@ -2590,9 +2556,9 @@ void wake_up_new_task(struct task_struct *p) | |||
2590 | * Nothing relies on rq->lock after this, so its fine to | 2556 | * Nothing relies on rq->lock after this, so its fine to |
2591 | * drop it. | 2557 | * drop it. |
2592 | */ | 2558 | */ |
2593 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 2559 | rq_unpin_lock(rq, &rf); |
2594 | p->sched_class->task_woken(rq, p); | 2560 | p->sched_class->task_woken(rq, p); |
2595 | lockdep_repin_lock(&rq->lock, rf.cookie); | 2561 | rq_repin_lock(rq, &rf); |
2596 | } | 2562 | } |
2597 | #endif | 2563 | #endif |
2598 | task_rq_unlock(rq, p, &rf); | 2564 | task_rq_unlock(rq, p, &rf); |
@@ -2861,7 +2827,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
2861 | */ | 2827 | */ |
2862 | static __always_inline struct rq * | 2828 | static __always_inline struct rq * |
2863 | context_switch(struct rq *rq, struct task_struct *prev, | 2829 | context_switch(struct rq *rq, struct task_struct *prev, |
2864 | struct task_struct *next, struct pin_cookie cookie) | 2830 | struct task_struct *next, struct rq_flags *rf) |
2865 | { | 2831 | { |
2866 | struct mm_struct *mm, *oldmm; | 2832 | struct mm_struct *mm, *oldmm; |
2867 | 2833 | ||
@@ -2887,13 +2853,16 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2887 | prev->active_mm = NULL; | 2853 | prev->active_mm = NULL; |
2888 | rq->prev_mm = oldmm; | 2854 | rq->prev_mm = oldmm; |
2889 | } | 2855 | } |
2856 | |||
2857 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); | ||
2858 | |||
2890 | /* | 2859 | /* |
2891 | * Since the runqueue lock will be released by the next | 2860 | * Since the runqueue lock will be released by the next |
2892 | * task (which is an invalid locking op but in the case | 2861 | * task (which is an invalid locking op but in the case |
2893 | * of the scheduler it's an obvious special-case), so we | 2862 | * of the scheduler it's an obvious special-case), so we |
2894 | * do an early lockdep release here: | 2863 | * do an early lockdep release here: |
2895 | */ | 2864 | */ |
2896 | lockdep_unpin_lock(&rq->lock, cookie); | 2865 | rq_unpin_lock(rq, rf); |
2897 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2866 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
2898 | 2867 | ||
2899 | /* Here we just switch the register state and the stack. */ | 2868 | /* Here we just switch the register state and the stack. */ |
@@ -2920,7 +2889,7 @@ unsigned long nr_running(void) | |||
2920 | } | 2889 | } |
2921 | 2890 | ||
2922 | /* | 2891 | /* |
2923 | * Check if only the current task is running on the cpu. | 2892 | * Check if only the current task is running on the CPU. |
2924 | * | 2893 | * |
2925 | * Caution: this function does not check that the caller has disabled | 2894 | * Caution: this function does not check that the caller has disabled |
2926 | * preemption, thus the result might have a time-of-check-to-time-of-use | 2895 | * preemption, thus the result might have a time-of-check-to-time-of-use |
@@ -2949,6 +2918,36 @@ unsigned long long nr_context_switches(void) | |||
2949 | return sum; | 2918 | return sum; |
2950 | } | 2919 | } |
2951 | 2920 | ||
2921 | /* | ||
2922 | * IO-wait accounting, and how its mostly bollocks (on SMP). | ||
2923 | * | ||
2924 | * The idea behind IO-wait account is to account the idle time that we could | ||
2925 | * have spend running if it were not for IO. That is, if we were to improve the | ||
2926 | * storage performance, we'd have a proportional reduction in IO-wait time. | ||
2927 | * | ||
2928 | * This all works nicely on UP, where, when a task blocks on IO, we account | ||
2929 | * idle time as IO-wait, because if the storage were faster, it could've been | ||
2930 | * running and we'd not be idle. | ||
2931 | * | ||
2932 | * This has been extended to SMP, by doing the same for each CPU. This however | ||
2933 | * is broken. | ||
2934 | * | ||
2935 | * Imagine for instance the case where two tasks block on one CPU, only the one | ||
2936 | * CPU will have IO-wait accounted, while the other has regular idle. Even | ||
2937 | * though, if the storage were faster, both could've ran at the same time, | ||
2938 | * utilising both CPUs. | ||
2939 | * | ||
2940 | * This means, that when looking globally, the current IO-wait accounting on | ||
2941 | * SMP is a lower bound, by reason of under accounting. | ||
2942 | * | ||
2943 | * Worse, since the numbers are provided per CPU, they are sometimes | ||
2944 | * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly | ||
2945 | * associated with any one particular CPU, it can wake to another CPU than it | ||
2946 | * blocked on. This means the per CPU IO-wait number is meaningless. | ||
2947 | * | ||
2948 | * Task CPU affinities can make all that even more 'interesting'. | ||
2949 | */ | ||
2950 | |||
2952 | unsigned long nr_iowait(void) | 2951 | unsigned long nr_iowait(void) |
2953 | { | 2952 | { |
2954 | unsigned long i, sum = 0; | 2953 | unsigned long i, sum = 0; |
@@ -2959,6 +2958,13 @@ unsigned long nr_iowait(void) | |||
2959 | return sum; | 2958 | return sum; |
2960 | } | 2959 | } |
2961 | 2960 | ||
2961 | /* | ||
2962 | * Consumers of these two interfaces, like for example the cpufreq menu | ||
2963 | * governor are using nonsensical data. Boosting frequency for a CPU that has | ||
2964 | * IO-wait which might not even end up running the task when it does become | ||
2965 | * runnable. | ||
2966 | */ | ||
2967 | |||
2962 | unsigned long nr_iowait_cpu(int cpu) | 2968 | unsigned long nr_iowait_cpu(int cpu) |
2963 | { | 2969 | { |
2964 | struct rq *this = cpu_rq(cpu); | 2970 | struct rq *this = cpu_rq(cpu); |
@@ -3042,8 +3048,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3042 | * So we have a optimization chance when the task's delta_exec is 0. | 3048 | * So we have a optimization chance when the task's delta_exec is 0. |
3043 | * Reading ->on_cpu is racy, but this is ok. | 3049 | * Reading ->on_cpu is racy, but this is ok. |
3044 | * | 3050 | * |
3045 | * If we race with it leaving cpu, we'll take a lock. So we're correct. | 3051 | * If we race with it leaving CPU, we'll take a lock. So we're correct. |
3046 | * If we race with it entering cpu, unaccounted time is 0. This is | 3052 | * If we race with it entering CPU, unaccounted time is 0. This is |
3047 | * indistinguishable from the read occurring a few cycles earlier. | 3053 | * indistinguishable from the read occurring a few cycles earlier. |
3048 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 3054 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
3049 | * been accounted, so we're correct here as well. | 3055 | * been accounted, so we're correct here as well. |
@@ -3257,31 +3263,30 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3257 | * Pick up the highest-prio task: | 3263 | * Pick up the highest-prio task: |
3258 | */ | 3264 | */ |
3259 | static inline struct task_struct * | 3265 | static inline struct task_struct * |
3260 | pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 3266 | pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
3261 | { | 3267 | { |
3262 | const struct sched_class *class = &fair_sched_class; | 3268 | const struct sched_class *class; |
3263 | struct task_struct *p; | 3269 | struct task_struct *p; |
3264 | 3270 | ||
3265 | /* | 3271 | /* |
3266 | * Optimization: we know that if all tasks are in | 3272 | * Optimization: we know that if all tasks are in |
3267 | * the fair class we can call that function directly: | 3273 | * the fair class we can call that function directly: |
3268 | */ | 3274 | */ |
3269 | if (likely(prev->sched_class == class && | 3275 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
3270 | rq->nr_running == rq->cfs.h_nr_running)) { | 3276 | p = fair_sched_class.pick_next_task(rq, prev, rf); |
3271 | p = fair_sched_class.pick_next_task(rq, prev, cookie); | ||
3272 | if (unlikely(p == RETRY_TASK)) | 3277 | if (unlikely(p == RETRY_TASK)) |
3273 | goto again; | 3278 | goto again; |
3274 | 3279 | ||
3275 | /* assumes fair_sched_class->next == idle_sched_class */ | 3280 | /* Assumes fair_sched_class->next == idle_sched_class */ |
3276 | if (unlikely(!p)) | 3281 | if (unlikely(!p)) |
3277 | p = idle_sched_class.pick_next_task(rq, prev, cookie); | 3282 | p = idle_sched_class.pick_next_task(rq, prev, rf); |
3278 | 3283 | ||
3279 | return p; | 3284 | return p; |
3280 | } | 3285 | } |
3281 | 3286 | ||
3282 | again: | 3287 | again: |
3283 | for_each_class(class) { | 3288 | for_each_class(class) { |
3284 | p = class->pick_next_task(rq, prev, cookie); | 3289 | p = class->pick_next_task(rq, prev, rf); |
3285 | if (p) { | 3290 | if (p) { |
3286 | if (unlikely(p == RETRY_TASK)) | 3291 | if (unlikely(p == RETRY_TASK)) |
3287 | goto again; | 3292 | goto again; |
@@ -3289,7 +3294,8 @@ again: | |||
3289 | } | 3294 | } |
3290 | } | 3295 | } |
3291 | 3296 | ||
3292 | BUG(); /* the idle class will always have a runnable task */ | 3297 | /* The idle class should always have a runnable task: */ |
3298 | BUG(); | ||
3293 | } | 3299 | } |
3294 | 3300 | ||
3295 | /* | 3301 | /* |
@@ -3335,7 +3341,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3335 | { | 3341 | { |
3336 | struct task_struct *prev, *next; | 3342 | struct task_struct *prev, *next; |
3337 | unsigned long *switch_count; | 3343 | unsigned long *switch_count; |
3338 | struct pin_cookie cookie; | 3344 | struct rq_flags rf; |
3339 | struct rq *rq; | 3345 | struct rq *rq; |
3340 | int cpu; | 3346 | int cpu; |
3341 | 3347 | ||
@@ -3358,9 +3364,10 @@ static void __sched notrace __schedule(bool preempt) | |||
3358 | */ | 3364 | */ |
3359 | smp_mb__before_spinlock(); | 3365 | smp_mb__before_spinlock(); |
3360 | raw_spin_lock(&rq->lock); | 3366 | raw_spin_lock(&rq->lock); |
3361 | cookie = lockdep_pin_lock(&rq->lock); | 3367 | rq_pin_lock(rq, &rf); |
3362 | 3368 | ||
3363 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | 3369 | /* Promote REQ to ACT */ |
3370 | rq->clock_update_flags <<= 1; | ||
3364 | 3371 | ||
3365 | switch_count = &prev->nivcsw; | 3372 | switch_count = &prev->nivcsw; |
3366 | if (!preempt && prev->state) { | 3373 | if (!preempt && prev->state) { |
@@ -3370,6 +3377,11 @@ static void __sched notrace __schedule(bool preempt) | |||
3370 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3377 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
3371 | prev->on_rq = 0; | 3378 | prev->on_rq = 0; |
3372 | 3379 | ||
3380 | if (prev->in_iowait) { | ||
3381 | atomic_inc(&rq->nr_iowait); | ||
3382 | delayacct_blkio_start(); | ||
3383 | } | ||
3384 | |||
3373 | /* | 3385 | /* |
3374 | * If a worker went to sleep, notify and ask workqueue | 3386 | * If a worker went to sleep, notify and ask workqueue |
3375 | * whether it wants to wake up a task to maintain | 3387 | * whether it wants to wake up a task to maintain |
@@ -3380,7 +3392,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3380 | 3392 | ||
3381 | to_wakeup = wq_worker_sleeping(prev); | 3393 | to_wakeup = wq_worker_sleeping(prev); |
3382 | if (to_wakeup) | 3394 | if (to_wakeup) |
3383 | try_to_wake_up_local(to_wakeup, cookie); | 3395 | try_to_wake_up_local(to_wakeup, &rf); |
3384 | } | 3396 | } |
3385 | } | 3397 | } |
3386 | switch_count = &prev->nvcsw; | 3398 | switch_count = &prev->nvcsw; |
@@ -3389,10 +3401,9 @@ static void __sched notrace __schedule(bool preempt) | |||
3389 | if (task_on_rq_queued(prev)) | 3401 | if (task_on_rq_queued(prev)) |
3390 | update_rq_clock(rq); | 3402 | update_rq_clock(rq); |
3391 | 3403 | ||
3392 | next = pick_next_task(rq, prev, cookie); | 3404 | next = pick_next_task(rq, prev, &rf); |
3393 | clear_tsk_need_resched(prev); | 3405 | clear_tsk_need_resched(prev); |
3394 | clear_preempt_need_resched(); | 3406 | clear_preempt_need_resched(); |
3395 | rq->clock_skip_update = 0; | ||
3396 | 3407 | ||
3397 | if (likely(prev != next)) { | 3408 | if (likely(prev != next)) { |
3398 | rq->nr_switches++; | 3409 | rq->nr_switches++; |
@@ -3400,9 +3411,12 @@ static void __sched notrace __schedule(bool preempt) | |||
3400 | ++*switch_count; | 3411 | ++*switch_count; |
3401 | 3412 | ||
3402 | trace_sched_switch(preempt, prev, next); | 3413 | trace_sched_switch(preempt, prev, next); |
3403 | rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ | 3414 | |
3415 | /* Also unlocks the rq: */ | ||
3416 | rq = context_switch(rq, prev, next, &rf); | ||
3404 | } else { | 3417 | } else { |
3405 | lockdep_unpin_lock(&rq->lock, cookie); | 3418 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); |
3419 | rq_unpin_lock(rq, &rf); | ||
3406 | raw_spin_unlock_irq(&rq->lock); | 3420 | raw_spin_unlock_irq(&rq->lock); |
3407 | } | 3421 | } |
3408 | 3422 | ||
@@ -3426,14 +3440,18 @@ void __noreturn do_task_dead(void) | |||
3426 | smp_mb(); | 3440 | smp_mb(); |
3427 | raw_spin_unlock_wait(¤t->pi_lock); | 3441 | raw_spin_unlock_wait(¤t->pi_lock); |
3428 | 3442 | ||
3429 | /* causes final put_task_struct in finish_task_switch(). */ | 3443 | /* Causes final put_task_struct in finish_task_switch(): */ |
3430 | __set_current_state(TASK_DEAD); | 3444 | __set_current_state(TASK_DEAD); |
3431 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | 3445 | |
3446 | /* Tell freezer to ignore us: */ | ||
3447 | current->flags |= PF_NOFREEZE; | ||
3448 | |||
3432 | __schedule(false); | 3449 | __schedule(false); |
3433 | BUG(); | 3450 | BUG(); |
3434 | /* Avoid "noreturn function does return". */ | 3451 | |
3452 | /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ | ||
3435 | for (;;) | 3453 | for (;;) |
3436 | cpu_relax(); /* For when BUG is null */ | 3454 | cpu_relax(); |
3437 | } | 3455 | } |
3438 | 3456 | ||
3439 | static inline void sched_submit_work(struct task_struct *tsk) | 3457 | static inline void sched_submit_work(struct task_struct *tsk) |
@@ -3651,6 +3669,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3651 | BUG_ON(prio > MAX_PRIO); | 3669 | BUG_ON(prio > MAX_PRIO); |
3652 | 3670 | ||
3653 | rq = __task_rq_lock(p, &rf); | 3671 | rq = __task_rq_lock(p, &rf); |
3672 | update_rq_clock(rq); | ||
3654 | 3673 | ||
3655 | /* | 3674 | /* |
3656 | * Idle task boosting is a nono in general. There is one | 3675 | * Idle task boosting is a nono in general. There is one |
@@ -3725,7 +3744,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3725 | 3744 | ||
3726 | check_class_changed(rq, p, prev_class, oldprio); | 3745 | check_class_changed(rq, p, prev_class, oldprio); |
3727 | out_unlock: | 3746 | out_unlock: |
3728 | preempt_disable(); /* avoid rq from going away on us */ | 3747 | /* Avoid rq from going away on us: */ |
3748 | preempt_disable(); | ||
3729 | __task_rq_unlock(rq, &rf); | 3749 | __task_rq_unlock(rq, &rf); |
3730 | 3750 | ||
3731 | balance_callback(rq); | 3751 | balance_callback(rq); |
@@ -3747,6 +3767,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3747 | * the task might be in the middle of scheduling on another CPU. | 3767 | * the task might be in the middle of scheduling on another CPU. |
3748 | */ | 3768 | */ |
3749 | rq = task_rq_lock(p, &rf); | 3769 | rq = task_rq_lock(p, &rf); |
3770 | update_rq_clock(rq); | ||
3771 | |||
3750 | /* | 3772 | /* |
3751 | * The RT priorities are set via sched_setscheduler(), but we still | 3773 | * The RT priorities are set via sched_setscheduler(), but we still |
3752 | * allow the 'normal' nice value to be set - but as expected | 3774 | * allow the 'normal' nice value to be set - but as expected |
@@ -3793,7 +3815,7 @@ EXPORT_SYMBOL(set_user_nice); | |||
3793 | */ | 3815 | */ |
3794 | int can_nice(const struct task_struct *p, const int nice) | 3816 | int can_nice(const struct task_struct *p, const int nice) |
3795 | { | 3817 | { |
3796 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3818 | /* Convert nice value [19,-20] to rlimit style value [1,40]: */ |
3797 | int nice_rlim = nice_to_rlimit(nice); | 3819 | int nice_rlim = nice_to_rlimit(nice); |
3798 | 3820 | ||
3799 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || | 3821 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
@@ -3849,7 +3871,7 @@ int task_prio(const struct task_struct *p) | |||
3849 | } | 3871 | } |
3850 | 3872 | ||
3851 | /** | 3873 | /** |
3852 | * idle_cpu - is a given cpu idle currently? | 3874 | * idle_cpu - is a given CPU idle currently? |
3853 | * @cpu: the processor in question. | 3875 | * @cpu: the processor in question. |
3854 | * | 3876 | * |
3855 | * Return: 1 if the CPU is currently idle. 0 otherwise. | 3877 | * Return: 1 if the CPU is currently idle. 0 otherwise. |
@@ -3873,10 +3895,10 @@ int idle_cpu(int cpu) | |||
3873 | } | 3895 | } |
3874 | 3896 | ||
3875 | /** | 3897 | /** |
3876 | * idle_task - return the idle task for a given cpu. | 3898 | * idle_task - return the idle task for a given CPU. |
3877 | * @cpu: the processor in question. | 3899 | * @cpu: the processor in question. |
3878 | * | 3900 | * |
3879 | * Return: The idle task for the cpu @cpu. | 3901 | * Return: The idle task for the CPU @cpu. |
3880 | */ | 3902 | */ |
3881 | struct task_struct *idle_task(int cpu) | 3903 | struct task_struct *idle_task(int cpu) |
3882 | { | 3904 | { |
@@ -4042,7 +4064,7 @@ __checkparam_dl(const struct sched_attr *attr) | |||
4042 | } | 4064 | } |
4043 | 4065 | ||
4044 | /* | 4066 | /* |
4045 | * check the target process has a UID that matches the current process's | 4067 | * Check the target process has a UID that matches the current process's: |
4046 | */ | 4068 | */ |
4047 | static bool check_same_owner(struct task_struct *p) | 4069 | static bool check_same_owner(struct task_struct *p) |
4048 | { | 4070 | { |
@@ -4057,8 +4079,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4057 | return match; | 4079 | return match; |
4058 | } | 4080 | } |
4059 | 4081 | ||
4060 | static bool dl_param_changed(struct task_struct *p, | 4082 | static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
4061 | const struct sched_attr *attr) | ||
4062 | { | 4083 | { |
4063 | struct sched_dl_entity *dl_se = &p->dl; | 4084 | struct sched_dl_entity *dl_se = &p->dl; |
4064 | 4085 | ||
@@ -4085,10 +4106,10 @@ static int __sched_setscheduler(struct task_struct *p, | |||
4085 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; | 4106 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; |
4086 | struct rq *rq; | 4107 | struct rq *rq; |
4087 | 4108 | ||
4088 | /* may grab non-irq protected spin_locks */ | 4109 | /* May grab non-irq protected spin_locks: */ |
4089 | BUG_ON(in_interrupt()); | 4110 | BUG_ON(in_interrupt()); |
4090 | recheck: | 4111 | recheck: |
4091 | /* double check policy once rq lock held */ | 4112 | /* Double check policy once rq lock held: */ |
4092 | if (policy < 0) { | 4113 | if (policy < 0) { |
4093 | reset_on_fork = p->sched_reset_on_fork; | 4114 | reset_on_fork = p->sched_reset_on_fork; |
4094 | policy = oldpolicy = p->policy; | 4115 | policy = oldpolicy = p->policy; |
@@ -4128,11 +4149,11 @@ recheck: | |||
4128 | unsigned long rlim_rtprio = | 4149 | unsigned long rlim_rtprio = |
4129 | task_rlimit(p, RLIMIT_RTPRIO); | 4150 | task_rlimit(p, RLIMIT_RTPRIO); |
4130 | 4151 | ||
4131 | /* can't set/change the rt policy */ | 4152 | /* Can't set/change the rt policy: */ |
4132 | if (policy != p->policy && !rlim_rtprio) | 4153 | if (policy != p->policy && !rlim_rtprio) |
4133 | return -EPERM; | 4154 | return -EPERM; |
4134 | 4155 | ||
4135 | /* can't increase priority */ | 4156 | /* Can't increase priority: */ |
4136 | if (attr->sched_priority > p->rt_priority && | 4157 | if (attr->sched_priority > p->rt_priority && |
4137 | attr->sched_priority > rlim_rtprio) | 4158 | attr->sched_priority > rlim_rtprio) |
4138 | return -EPERM; | 4159 | return -EPERM; |
@@ -4156,11 +4177,11 @@ recheck: | |||
4156 | return -EPERM; | 4177 | return -EPERM; |
4157 | } | 4178 | } |
4158 | 4179 | ||
4159 | /* can't change other user's priorities */ | 4180 | /* Can't change other user's priorities: */ |
4160 | if (!check_same_owner(p)) | 4181 | if (!check_same_owner(p)) |
4161 | return -EPERM; | 4182 | return -EPERM; |
4162 | 4183 | ||
4163 | /* Normal users shall not reset the sched_reset_on_fork flag */ | 4184 | /* Normal users shall not reset the sched_reset_on_fork flag: */ |
4164 | if (p->sched_reset_on_fork && !reset_on_fork) | 4185 | if (p->sched_reset_on_fork && !reset_on_fork) |
4165 | return -EPERM; | 4186 | return -EPERM; |
4166 | } | 4187 | } |
@@ -4172,16 +4193,17 @@ recheck: | |||
4172 | } | 4193 | } |
4173 | 4194 | ||
4174 | /* | 4195 | /* |
4175 | * make sure no PI-waiters arrive (or leave) while we are | 4196 | * Make sure no PI-waiters arrive (or leave) while we are |
4176 | * changing the priority of the task: | 4197 | * changing the priority of the task: |
4177 | * | 4198 | * |
4178 | * To be able to change p->policy safely, the appropriate | 4199 | * To be able to change p->policy safely, the appropriate |
4179 | * runqueue lock must be held. | 4200 | * runqueue lock must be held. |
4180 | */ | 4201 | */ |
4181 | rq = task_rq_lock(p, &rf); | 4202 | rq = task_rq_lock(p, &rf); |
4203 | update_rq_clock(rq); | ||
4182 | 4204 | ||
4183 | /* | 4205 | /* |
4184 | * Changing the policy of the stop threads its a very bad idea | 4206 | * Changing the policy of the stop threads its a very bad idea: |
4185 | */ | 4207 | */ |
4186 | if (p == rq->stop) { | 4208 | if (p == rq->stop) { |
4187 | task_rq_unlock(rq, p, &rf); | 4209 | task_rq_unlock(rq, p, &rf); |
@@ -4237,7 +4259,7 @@ change: | |||
4237 | #endif | 4259 | #endif |
4238 | } | 4260 | } |
4239 | 4261 | ||
4240 | /* recheck policy now with rq lock held */ | 4262 | /* Re-check policy now with rq lock held: */ |
4241 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4263 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4242 | policy = oldpolicy = -1; | 4264 | policy = oldpolicy = -1; |
4243 | task_rq_unlock(rq, p, &rf); | 4265 | task_rq_unlock(rq, p, &rf); |
@@ -4294,15 +4316,15 @@ change: | |||
4294 | set_curr_task(rq, p); | 4316 | set_curr_task(rq, p); |
4295 | 4317 | ||
4296 | check_class_changed(rq, p, prev_class, oldprio); | 4318 | check_class_changed(rq, p, prev_class, oldprio); |
4297 | preempt_disable(); /* avoid rq from going away on us */ | 4319 | |
4320 | /* Avoid rq from going away on us: */ | ||
4321 | preempt_disable(); | ||
4298 | task_rq_unlock(rq, p, &rf); | 4322 | task_rq_unlock(rq, p, &rf); |
4299 | 4323 | ||
4300 | if (pi) | 4324 | if (pi) |
4301 | rt_mutex_adjust_pi(p); | 4325 | rt_mutex_adjust_pi(p); |
4302 | 4326 | ||
4303 | /* | 4327 | /* Run balance callbacks after we've adjusted the PI chain: */ |
4304 | * Run balance callbacks after we've adjusted the PI chain. | ||
4305 | */ | ||
4306 | balance_callback(rq); | 4328 | balance_callback(rq); |
4307 | preempt_enable(); | 4329 | preempt_enable(); |
4308 | 4330 | ||
@@ -4395,8 +4417,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
4395 | /* | 4417 | /* |
4396 | * Mimics kernel/events/core.c perf_copy_attr(). | 4418 | * Mimics kernel/events/core.c perf_copy_attr(). |
4397 | */ | 4419 | */ |
4398 | static int sched_copy_attr(struct sched_attr __user *uattr, | 4420 | static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) |
4399 | struct sched_attr *attr) | ||
4400 | { | 4421 | { |
4401 | u32 size; | 4422 | u32 size; |
4402 | int ret; | 4423 | int ret; |
@@ -4404,19 +4425,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
4404 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | 4425 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) |
4405 | return -EFAULT; | 4426 | return -EFAULT; |
4406 | 4427 | ||
4407 | /* | 4428 | /* Zero the full structure, so that a short copy will be nice: */ |
4408 | * zero the full structure, so that a short copy will be nice. | ||
4409 | */ | ||
4410 | memset(attr, 0, sizeof(*attr)); | 4429 | memset(attr, 0, sizeof(*attr)); |
4411 | 4430 | ||
4412 | ret = get_user(size, &uattr->size); | 4431 | ret = get_user(size, &uattr->size); |
4413 | if (ret) | 4432 | if (ret) |
4414 | return ret; | 4433 | return ret; |
4415 | 4434 | ||
4416 | if (size > PAGE_SIZE) /* silly large */ | 4435 | /* Bail out on silly large: */ |
4436 | if (size > PAGE_SIZE) | ||
4417 | goto err_size; | 4437 | goto err_size; |
4418 | 4438 | ||
4419 | if (!size) /* abi compat */ | 4439 | /* ABI compatibility quirk: */ |
4440 | if (!size) | ||
4420 | size = SCHED_ATTR_SIZE_VER0; | 4441 | size = SCHED_ATTR_SIZE_VER0; |
4421 | 4442 | ||
4422 | if (size < SCHED_ATTR_SIZE_VER0) | 4443 | if (size < SCHED_ATTR_SIZE_VER0) |
@@ -4451,7 +4472,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
4451 | return -EFAULT; | 4472 | return -EFAULT; |
4452 | 4473 | ||
4453 | /* | 4474 | /* |
4454 | * XXX: do we want to be lenient like existing syscalls; or do we want | 4475 | * XXX: Do we want to be lenient like existing syscalls; or do we want |
4455 | * to be strict and return an error on out-of-bounds values? | 4476 | * to be strict and return an error on out-of-bounds values? |
4456 | */ | 4477 | */ |
4457 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); | 4478 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); |
@@ -4471,10 +4492,8 @@ err_size: | |||
4471 | * | 4492 | * |
4472 | * Return: 0 on success. An error code otherwise. | 4493 | * Return: 0 on success. An error code otherwise. |
4473 | */ | 4494 | */ |
4474 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 4495 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) |
4475 | struct sched_param __user *, param) | ||
4476 | { | 4496 | { |
4477 | /* negative values for policy are not valid */ | ||
4478 | if (policy < 0) | 4497 | if (policy < 0) |
4479 | return -EINVAL; | 4498 | return -EINVAL; |
4480 | 4499 | ||
@@ -4784,10 +4803,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | |||
4784 | } | 4803 | } |
4785 | 4804 | ||
4786 | /** | 4805 | /** |
4787 | * sys_sched_setaffinity - set the cpu affinity of a process | 4806 | * sys_sched_setaffinity - set the CPU affinity of a process |
4788 | * @pid: pid of the process | 4807 | * @pid: pid of the process |
4789 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4808 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4790 | * @user_mask_ptr: user-space pointer to the new cpu mask | 4809 | * @user_mask_ptr: user-space pointer to the new CPU mask |
4791 | * | 4810 | * |
4792 | * Return: 0 on success. An error code otherwise. | 4811 | * Return: 0 on success. An error code otherwise. |
4793 | */ | 4812 | */ |
@@ -4835,10 +4854,10 @@ out_unlock: | |||
4835 | } | 4854 | } |
4836 | 4855 | ||
4837 | /** | 4856 | /** |
4838 | * sys_sched_getaffinity - get the cpu affinity of a process | 4857 | * sys_sched_getaffinity - get the CPU affinity of a process |
4839 | * @pid: pid of the process | 4858 | * @pid: pid of the process |
4840 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4859 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4841 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4860 | * @user_mask_ptr: user-space pointer to hold the current CPU mask |
4842 | * | 4861 | * |
4843 | * Return: size of CPU mask copied to user_mask_ptr on success. An | 4862 | * Return: size of CPU mask copied to user_mask_ptr on success. An |
4844 | * error code otherwise. | 4863 | * error code otherwise. |
@@ -4966,7 +4985,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
4966 | * Typical broken usage is: | 4985 | * Typical broken usage is: |
4967 | * | 4986 | * |
4968 | * while (!event) | 4987 | * while (!event) |
4969 | * yield(); | 4988 | * yield(); |
4970 | * | 4989 | * |
4971 | * where one assumes that yield() will let 'the other' process run that will | 4990 | * where one assumes that yield() will let 'the other' process run that will |
4972 | * make event true. If the current task is a SCHED_FIFO task that will never | 4991 | * make event true. If the current task is a SCHED_FIFO task that will never |
@@ -5057,31 +5076,48 @@ out_irq: | |||
5057 | } | 5076 | } |
5058 | EXPORT_SYMBOL_GPL(yield_to); | 5077 | EXPORT_SYMBOL_GPL(yield_to); |
5059 | 5078 | ||
5079 | int io_schedule_prepare(void) | ||
5080 | { | ||
5081 | int old_iowait = current->in_iowait; | ||
5082 | |||
5083 | current->in_iowait = 1; | ||
5084 | blk_schedule_flush_plug(current); | ||
5085 | |||
5086 | return old_iowait; | ||
5087 | } | ||
5088 | |||
5089 | void io_schedule_finish(int token) | ||
5090 | { | ||
5091 | current->in_iowait = token; | ||
5092 | } | ||
5093 | |||
5060 | /* | 5094 | /* |
5061 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5095 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5062 | * that process accounting knows that this is a task in IO wait state. | 5096 | * that process accounting knows that this is a task in IO wait state. |
5063 | */ | 5097 | */ |
5064 | long __sched io_schedule_timeout(long timeout) | 5098 | long __sched io_schedule_timeout(long timeout) |
5065 | { | 5099 | { |
5066 | int old_iowait = current->in_iowait; | 5100 | int token; |
5067 | struct rq *rq; | ||
5068 | long ret; | 5101 | long ret; |
5069 | 5102 | ||
5070 | current->in_iowait = 1; | 5103 | token = io_schedule_prepare(); |
5071 | blk_schedule_flush_plug(current); | ||
5072 | |||
5073 | delayacct_blkio_start(); | ||
5074 | rq = raw_rq(); | ||
5075 | atomic_inc(&rq->nr_iowait); | ||
5076 | ret = schedule_timeout(timeout); | 5104 | ret = schedule_timeout(timeout); |
5077 | current->in_iowait = old_iowait; | 5105 | io_schedule_finish(token); |
5078 | atomic_dec(&rq->nr_iowait); | ||
5079 | delayacct_blkio_end(); | ||
5080 | 5106 | ||
5081 | return ret; | 5107 | return ret; |
5082 | } | 5108 | } |
5083 | EXPORT_SYMBOL(io_schedule_timeout); | 5109 | EXPORT_SYMBOL(io_schedule_timeout); |
5084 | 5110 | ||
5111 | void io_schedule(void) | ||
5112 | { | ||
5113 | int token; | ||
5114 | |||
5115 | token = io_schedule_prepare(); | ||
5116 | schedule(); | ||
5117 | io_schedule_finish(token); | ||
5118 | } | ||
5119 | EXPORT_SYMBOL(io_schedule); | ||
5120 | |||
5085 | /** | 5121 | /** |
5086 | * sys_sched_get_priority_max - return maximum RT priority. | 5122 | * sys_sched_get_priority_max - return maximum RT priority. |
5087 | * @policy: scheduling class. | 5123 | * @policy: scheduling class. |
@@ -5264,7 +5300,7 @@ void init_idle_bootup_task(struct task_struct *idle) | |||
5264 | /** | 5300 | /** |
5265 | * init_idle - set up an idle thread for a given CPU | 5301 | * init_idle - set up an idle thread for a given CPU |
5266 | * @idle: task in question | 5302 | * @idle: task in question |
5267 | * @cpu: cpu the idle task belongs to | 5303 | * @cpu: CPU the idle task belongs to |
5268 | * | 5304 | * |
5269 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 5305 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
5270 | * flag, to make booting more robust. | 5306 | * flag, to make booting more robust. |
@@ -5295,7 +5331,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
5295 | #endif | 5331 | #endif |
5296 | /* | 5332 | /* |
5297 | * We're having a chicken and egg problem, even though we are | 5333 | * We're having a chicken and egg problem, even though we are |
5298 | * holding rq->lock, the cpu isn't yet set to this cpu so the | 5334 | * holding rq->lock, the CPU isn't yet set to this CPU so the |
5299 | * lockdep check in task_group() will fail. | 5335 | * lockdep check in task_group() will fail. |
5300 | * | 5336 | * |
5301 | * Similar case to sched_fork(). / Alternatively we could | 5337 | * Similar case to sched_fork(). / Alternatively we could |
@@ -5360,7 +5396,7 @@ int task_can_attach(struct task_struct *p, | |||
5360 | 5396 | ||
5361 | /* | 5397 | /* |
5362 | * Kthreads which disallow setaffinity shouldn't be moved | 5398 | * Kthreads which disallow setaffinity shouldn't be moved |
5363 | * to a new cpuset; we don't want to change their cpu | 5399 | * to a new cpuset; we don't want to change their CPU |
5364 | * affinity and isolating such threads by their set of | 5400 | * affinity and isolating such threads by their set of |
5365 | * allowed nodes is unnecessary. Thus, cpusets are not | 5401 | * allowed nodes is unnecessary. Thus, cpusets are not |
5366 | * applicable for such threads. This prevents checking for | 5402 | * applicable for such threads. This prevents checking for |
@@ -5409,7 +5445,7 @@ out: | |||
5409 | 5445 | ||
5410 | #ifdef CONFIG_SMP | 5446 | #ifdef CONFIG_SMP |
5411 | 5447 | ||
5412 | static bool sched_smp_initialized __read_mostly; | 5448 | bool sched_smp_initialized __read_mostly; |
5413 | 5449 | ||
5414 | #ifdef CONFIG_NUMA_BALANCING | 5450 | #ifdef CONFIG_NUMA_BALANCING |
5415 | /* Migrate current task p to target_cpu */ | 5451 | /* Migrate current task p to target_cpu */ |
@@ -5461,7 +5497,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5461 | 5497 | ||
5462 | #ifdef CONFIG_HOTPLUG_CPU | 5498 | #ifdef CONFIG_HOTPLUG_CPU |
5463 | /* | 5499 | /* |
5464 | * Ensures that the idle task is using init_mm right before its cpu goes | 5500 | * Ensure that the idle task is using init_mm right before its CPU goes |
5465 | * offline. | 5501 | * offline. |
5466 | */ | 5502 | */ |
5467 | void idle_task_exit(void) | 5503 | void idle_task_exit(void) |
@@ -5521,7 +5557,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5521 | { | 5557 | { |
5522 | struct rq *rq = dead_rq; | 5558 | struct rq *rq = dead_rq; |
5523 | struct task_struct *next, *stop = rq->stop; | 5559 | struct task_struct *next, *stop = rq->stop; |
5524 | struct pin_cookie cookie; | 5560 | struct rq_flags rf, old_rf; |
5525 | int dest_cpu; | 5561 | int dest_cpu; |
5526 | 5562 | ||
5527 | /* | 5563 | /* |
@@ -5545,16 +5581,16 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5545 | for (;;) { | 5581 | for (;;) { |
5546 | /* | 5582 | /* |
5547 | * There's this thread running, bail when that's the only | 5583 | * There's this thread running, bail when that's the only |
5548 | * remaining thread. | 5584 | * remaining thread: |
5549 | */ | 5585 | */ |
5550 | if (rq->nr_running == 1) | 5586 | if (rq->nr_running == 1) |
5551 | break; | 5587 | break; |
5552 | 5588 | ||
5553 | /* | 5589 | /* |
5554 | * pick_next_task assumes pinned rq->lock. | 5590 | * pick_next_task() assumes pinned rq->lock: |
5555 | */ | 5591 | */ |
5556 | cookie = lockdep_pin_lock(&rq->lock); | 5592 | rq_pin_lock(rq, &rf); |
5557 | next = pick_next_task(rq, &fake_task, cookie); | 5593 | next = pick_next_task(rq, &fake_task, &rf); |
5558 | BUG_ON(!next); | 5594 | BUG_ON(!next); |
5559 | next->sched_class->put_prev_task(rq, next); | 5595 | next->sched_class->put_prev_task(rq, next); |
5560 | 5596 | ||
@@ -5567,7 +5603,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5567 | * because !cpu_active at this point, which means load-balance | 5603 | * because !cpu_active at this point, which means load-balance |
5568 | * will not interfere. Also, stop-machine. | 5604 | * will not interfere. Also, stop-machine. |
5569 | */ | 5605 | */ |
5570 | lockdep_unpin_lock(&rq->lock, cookie); | 5606 | rq_unpin_lock(rq, &rf); |
5571 | raw_spin_unlock(&rq->lock); | 5607 | raw_spin_unlock(&rq->lock); |
5572 | raw_spin_lock(&next->pi_lock); | 5608 | raw_spin_lock(&next->pi_lock); |
5573 | raw_spin_lock(&rq->lock); | 5609 | raw_spin_lock(&rq->lock); |
@@ -5582,6 +5618,13 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5582 | continue; | 5618 | continue; |
5583 | } | 5619 | } |
5584 | 5620 | ||
5621 | /* | ||
5622 | * __migrate_task() may return with a different | ||
5623 | * rq->lock held and a new cookie in 'rf', but we need | ||
5624 | * to preserve rf::clock_update_flags for 'dead_rq'. | ||
5625 | */ | ||
5626 | old_rf = rf; | ||
5627 | |||
5585 | /* Find suitable destination for @next, with force if needed. */ | 5628 | /* Find suitable destination for @next, with force if needed. */ |
5586 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); | 5629 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); |
5587 | 5630 | ||
@@ -5590,6 +5633,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5590 | raw_spin_unlock(&rq->lock); | 5633 | raw_spin_unlock(&rq->lock); |
5591 | rq = dead_rq; | 5634 | rq = dead_rq; |
5592 | raw_spin_lock(&rq->lock); | 5635 | raw_spin_lock(&rq->lock); |
5636 | rf = old_rf; | ||
5593 | } | 5637 | } |
5594 | raw_spin_unlock(&next->pi_lock); | 5638 | raw_spin_unlock(&next->pi_lock); |
5595 | } | 5639 | } |
@@ -5598,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5598 | } | 5642 | } |
5599 | #endif /* CONFIG_HOTPLUG_CPU */ | 5643 | #endif /* CONFIG_HOTPLUG_CPU */ |
5600 | 5644 | ||
5601 | static void set_rq_online(struct rq *rq) | 5645 | void set_rq_online(struct rq *rq) |
5602 | { | 5646 | { |
5603 | if (!rq->online) { | 5647 | if (!rq->online) { |
5604 | const struct sched_class *class; | 5648 | const struct sched_class *class; |
@@ -5613,7 +5657,7 @@ static void set_rq_online(struct rq *rq) | |||
5613 | } | 5657 | } |
5614 | } | 5658 | } |
5615 | 5659 | ||
5616 | static void set_rq_offline(struct rq *rq) | 5660 | void set_rq_offline(struct rq *rq) |
5617 | { | 5661 | { |
5618 | if (rq->online) { | 5662 | if (rq->online) { |
5619 | const struct sched_class *class; | 5663 | const struct sched_class *class; |
@@ -5635,1647 +5679,10 @@ static void set_cpu_rq_start_time(unsigned int cpu) | |||
5635 | rq->age_stamp = sched_clock_cpu(cpu); | 5679 | rq->age_stamp = sched_clock_cpu(cpu); |
5636 | } | 5680 | } |
5637 | 5681 | ||
5638 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
5639 | |||
5640 | #ifdef CONFIG_SCHED_DEBUG | ||
5641 | |||
5642 | static __read_mostly int sched_debug_enabled; | ||
5643 | |||
5644 | static int __init sched_debug_setup(char *str) | ||
5645 | { | ||
5646 | sched_debug_enabled = 1; | ||
5647 | |||
5648 | return 0; | ||
5649 | } | ||
5650 | early_param("sched_debug", sched_debug_setup); | ||
5651 | |||
5652 | static inline bool sched_debug(void) | ||
5653 | { | ||
5654 | return sched_debug_enabled; | ||
5655 | } | ||
5656 | |||
5657 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
5658 | struct cpumask *groupmask) | ||
5659 | { | ||
5660 | struct sched_group *group = sd->groups; | ||
5661 | |||
5662 | cpumask_clear(groupmask); | ||
5663 | |||
5664 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
5665 | |||
5666 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
5667 | printk("does not load-balance\n"); | ||
5668 | if (sd->parent) | ||
5669 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
5670 | " has parent"); | ||
5671 | return -1; | ||
5672 | } | ||
5673 | |||
5674 | printk(KERN_CONT "span %*pbl level %s\n", | ||
5675 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
5676 | |||
5677 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
5678 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
5679 | "CPU%d\n", cpu); | ||
5680 | } | ||
5681 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
5682 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
5683 | " CPU%d\n", cpu); | ||
5684 | } | ||
5685 | |||
5686 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
5687 | do { | ||
5688 | if (!group) { | ||
5689 | printk("\n"); | ||
5690 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
5691 | break; | ||
5692 | } | ||
5693 | |||
5694 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
5695 | printk(KERN_CONT "\n"); | ||
5696 | printk(KERN_ERR "ERROR: empty group\n"); | ||
5697 | break; | ||
5698 | } | ||
5699 | |||
5700 | if (!(sd->flags & SD_OVERLAP) && | ||
5701 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5702 | printk(KERN_CONT "\n"); | ||
5703 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
5704 | break; | ||
5705 | } | ||
5706 | |||
5707 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
5708 | |||
5709 | printk(KERN_CONT " %*pbl", | ||
5710 | cpumask_pr_args(sched_group_cpus(group))); | ||
5711 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
5712 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
5713 | group->sgc->capacity); | ||
5714 | } | ||
5715 | |||
5716 | group = group->next; | ||
5717 | } while (group != sd->groups); | ||
5718 | printk(KERN_CONT "\n"); | ||
5719 | |||
5720 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
5721 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
5722 | |||
5723 | if (sd->parent && | ||
5724 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
5725 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
5726 | "of domain->span\n"); | ||
5727 | return 0; | ||
5728 | } | ||
5729 | |||
5730 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
5731 | { | ||
5732 | int level = 0; | ||
5733 | |||
5734 | if (!sched_debug_enabled) | ||
5735 | return; | ||
5736 | |||
5737 | if (!sd) { | ||
5738 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
5739 | return; | ||
5740 | } | ||
5741 | |||
5742 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
5743 | |||
5744 | for (;;) { | ||
5745 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
5746 | break; | ||
5747 | level++; | ||
5748 | sd = sd->parent; | ||
5749 | if (!sd) | ||
5750 | break; | ||
5751 | } | ||
5752 | } | ||
5753 | #else /* !CONFIG_SCHED_DEBUG */ | ||
5754 | |||
5755 | # define sched_debug_enabled 0 | ||
5756 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
5757 | static inline bool sched_debug(void) | ||
5758 | { | ||
5759 | return false; | ||
5760 | } | ||
5761 | #endif /* CONFIG_SCHED_DEBUG */ | ||
5762 | |||
5763 | static int sd_degenerate(struct sched_domain *sd) | ||
5764 | { | ||
5765 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
5766 | return 1; | ||
5767 | |||
5768 | /* Following flags need at least 2 groups */ | ||
5769 | if (sd->flags & (SD_LOAD_BALANCE | | ||
5770 | SD_BALANCE_NEWIDLE | | ||
5771 | SD_BALANCE_FORK | | ||
5772 | SD_BALANCE_EXEC | | ||
5773 | SD_SHARE_CPUCAPACITY | | ||
5774 | SD_ASYM_CPUCAPACITY | | ||
5775 | SD_SHARE_PKG_RESOURCES | | ||
5776 | SD_SHARE_POWERDOMAIN)) { | ||
5777 | if (sd->groups != sd->groups->next) | ||
5778 | return 0; | ||
5779 | } | ||
5780 | |||
5781 | /* Following flags don't use groups */ | ||
5782 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
5783 | return 0; | ||
5784 | |||
5785 | return 1; | ||
5786 | } | ||
5787 | |||
5788 | static int | ||
5789 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
5790 | { | ||
5791 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
5792 | |||
5793 | if (sd_degenerate(parent)) | ||
5794 | return 1; | ||
5795 | |||
5796 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
5797 | return 0; | ||
5798 | |||
5799 | /* Flags needing groups don't count if only 1 group in parent */ | ||
5800 | if (parent->groups == parent->groups->next) { | ||
5801 | pflags &= ~(SD_LOAD_BALANCE | | ||
5802 | SD_BALANCE_NEWIDLE | | ||
5803 | SD_BALANCE_FORK | | ||
5804 | SD_BALANCE_EXEC | | ||
5805 | SD_ASYM_CPUCAPACITY | | ||
5806 | SD_SHARE_CPUCAPACITY | | ||
5807 | SD_SHARE_PKG_RESOURCES | | ||
5808 | SD_PREFER_SIBLING | | ||
5809 | SD_SHARE_POWERDOMAIN); | ||
5810 | if (nr_node_ids == 1) | ||
5811 | pflags &= ~SD_SERIALIZE; | ||
5812 | } | ||
5813 | if (~cflags & pflags) | ||
5814 | return 0; | ||
5815 | |||
5816 | return 1; | ||
5817 | } | ||
5818 | |||
5819 | static void free_rootdomain(struct rcu_head *rcu) | ||
5820 | { | ||
5821 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
5822 | |||
5823 | cpupri_cleanup(&rd->cpupri); | ||
5824 | cpudl_cleanup(&rd->cpudl); | ||
5825 | free_cpumask_var(rd->dlo_mask); | ||
5826 | free_cpumask_var(rd->rto_mask); | ||
5827 | free_cpumask_var(rd->online); | ||
5828 | free_cpumask_var(rd->span); | ||
5829 | kfree(rd); | ||
5830 | } | ||
5831 | |||
/*
 * Attach @rq to root domain @rd, detaching it from its previous root
 * domain (if any).  The old root domain is freed via RCU once its
 * refcount drops to zero.  Runs under rq->lock with interrupts disabled.
 */
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		/* Take the rq offline in the old root domain first */
		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we dont want to free the old_rd yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_unlock_irqrestore(&rq->lock, flags);

	/* Last reference dropped: free outside the lock, after a grace period */
	if (old_rd)
		call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
5868 | |||
5869 | static int init_rootdomain(struct root_domain *rd) | ||
5870 | { | ||
5871 | memset(rd, 0, sizeof(*rd)); | ||
5872 | |||
5873 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
5874 | goto out; | ||
5875 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
5876 | goto free_span; | ||
5877 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
5878 | goto free_online; | ||
5879 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5880 | goto free_dlo_mask; | ||
5881 | |||
5882 | init_dl_bw(&rd->dl_bw); | ||
5883 | if (cpudl_init(&rd->cpudl) != 0) | ||
5884 | goto free_dlo_mask; | ||
5885 | |||
5886 | if (cpupri_init(&rd->cpupri) != 0) | ||
5887 | goto free_rto_mask; | ||
5888 | return 0; | ||
5889 | |||
5890 | free_rto_mask: | ||
5891 | free_cpumask_var(rd->rto_mask); | ||
5892 | free_dlo_mask: | ||
5893 | free_cpumask_var(rd->dlo_mask); | ||
5894 | free_online: | ||
5895 | free_cpumask_var(rd->online); | ||
5896 | free_span: | ||
5897 | free_cpumask_var(rd->span); | ||
5898 | out: | ||
5899 | return -ENOMEM; | ||
5900 | } | ||
5901 | |||
/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
struct root_domain def_root_domain;

/* Set up the default root domain; it is never freed, hence refcount 1. */
static void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);

	atomic_set(&def_root_domain.refcount, 1);
}
5914 | |||
5915 | static struct root_domain *alloc_rootdomain(void) | ||
5916 | { | ||
5917 | struct root_domain *rd; | ||
5918 | |||
5919 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
5920 | if (!rd) | ||
5921 | return NULL; | ||
5922 | |||
5923 | if (init_rootdomain(rd) != 0) { | ||
5924 | kfree(rd); | ||
5925 | return NULL; | ||
5926 | } | ||
5927 | |||
5928 | return rd; | ||
5929 | } | ||
5930 | |||
5931 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
5932 | { | ||
5933 | struct sched_group *tmp, *first; | ||
5934 | |||
5935 | if (!sg) | ||
5936 | return; | ||
5937 | |||
5938 | first = sg; | ||
5939 | do { | ||
5940 | tmp = sg->next; | ||
5941 | |||
5942 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
5943 | kfree(sg->sgc); | ||
5944 | |||
5945 | kfree(sg); | ||
5946 | sg = tmp; | ||
5947 | } while (sg != first); | ||
5948 | } | ||
5949 | |||
/*
 * Free a single sched_domain together with its groups and shared state,
 * honouring the respective reference counts.
 */
static void destroy_sched_domain(struct sched_domain *sd)
{
	/*
	 * If its an overlapping domain it has private groups, iterate and
	 * nuke them all.
	 */
	if (sd->flags & SD_OVERLAP) {
		free_sched_groups(sd->groups, 1);
	} else if (atomic_dec_and_test(&sd->groups->ref)) {
		/* Last reference to the shared group: free it and its capacity */
		kfree(sd->groups->sgc);
		kfree(sd->groups);
	}
	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
		kfree(sd->shared);
	kfree(sd);
}
5966 | |||
5967 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
5968 | { | ||
5969 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
5970 | |||
5971 | while (sd) { | ||
5972 | struct sched_domain *parent = sd->parent; | ||
5973 | destroy_sched_domain(sd); | ||
5974 | sd = parent; | ||
5975 | } | ||
5976 | } | ||
5977 | |||
5978 | static void destroy_sched_domains(struct sched_domain *sd) | ||
5979 | { | ||
5980 | if (sd) | ||
5981 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
5982 | } | ||
5983 | |||
/*
 * Keep a special pointer to the highest sched_domain that has
 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this
 * allows us to avoid some pointer chasing in select_idle_sibling().
 *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
 */
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5999 | |||
/*
 * Re-derive the per-cpu cached domain pointers (LLC, NUMA, ASYM packing)
 * for @cpu from its freshly attached sched_domain hierarchy.
 */
static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	/* Highest domain sharing package resources == last level cache domain */
	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}
6025 | |||
/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			/* Unlink the redundant parent from the chain */
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	/* The base domain itself may be degenerate as well */
	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;
		destroy_sched_domain(tmp);
		if (sd)
			sd->child = NULL;
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	/* Old hierarchy is torn down only after an RCU grace period */
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}
6075 | |||
6076 | /* Setup the mask of cpus configured for isolated domains */ | ||
6077 | static int __init isolated_cpu_setup(char *str) | ||
6078 | { | ||
6079 | int ret; | ||
6080 | |||
6081 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
6082 | ret = cpulist_parse(str, cpu_isolated_map); | ||
6083 | if (ret) { | ||
6084 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
6085 | return 0; | ||
6086 | } | ||
6087 | return 1; | ||
6088 | } | ||
6089 | __setup("isolcpus=", isolated_cpu_setup); | ||
6090 | |||
/* Scratch state carried through a build_sched_domains() invocation */
struct s_data {
	struct sched_domain ** __percpu sd;	/* per-cpu domain under construction */
	struct root_domain	*rd;		/* root domain for this build */
};

/* Allocation stages, listed in reverse order so unwinding falls through */
enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};
6102 | |||
6103 | /* | ||
6104 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6105 | * domain traversal. | ||
6106 | * | ||
6107 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6108 | * unequal depth, make sure to skip domains that already cover the entire | ||
6109 | * range. | ||
6110 | * | ||
6111 | * In that case build_sched_domains() will have terminated the iteration early | ||
6112 | * and our sibling sd spans will be empty. Domains should always include the | ||
6113 | * cpu they're built on, so check that. | ||
6114 | * | ||
6115 | */ | ||
6116 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6117 | { | ||
6118 | const struct cpumask *span = sched_domain_span(sd); | ||
6119 | struct sd_data *sdd = sd->private; | ||
6120 | struct sched_domain *sibling; | ||
6121 | int i; | ||
6122 | |||
6123 | for_each_cpu(i, span) { | ||
6124 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6125 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6126 | continue; | ||
6127 | |||
6128 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6129 | } | ||
6130 | } | ||
6131 | |||
/*
 * Return the canonical balance cpu for this group, this is the first cpu
 * of this group that's also in the iteration mask.
 *
 * For non-overlapping domains build_sched_groups() sets the whole mask,
 * so this degenerates to the group's first cpu.
 */
int group_balance_cpu(struct sched_group *sg)
{
	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
}
6140 | |||
/*
 * Build the circular group list for an SD_OVERLAP domain rooted at @cpu,
 * one group per covered sibling-child span.  Returns 0 on success or
 * -ENOMEM, in which case everything built so far is freed again.
 */
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *sibling;
	int i;

	cpumask_clear(covered);

	for_each_cpu(i, span) {
		struct cpumask *sg_span;

		/* Already part of an earlier group's span */
		if (cpumask_test_cpu(i, covered))
			continue;

		sibling = *per_cpu_ptr(sdd->sd, i);

		/* See the comment near build_group_mask(). */
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;

		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				GFP_KERNEL, cpu_to_node(cpu));

		if (!sg)
			goto fail;

		/* The group spans the sibling's child domain, or just cpu i */
		sg_span = sched_group_cpus(sg);
		if (sibling->child)
			cpumask_copy(sg_span, sched_domain_span(sibling->child));
		else
			cpumask_set_cpu(i, sg_span);

		cpumask_or(covered, covered, sg_span);

		/* First taker of the shared sgc builds its iteration mask */
		sg->sgc = *per_cpu_ptr(sdd->sgc, i);
		if (atomic_inc_return(&sg->sgc->ref) == 1)
			build_group_mask(sd, sg);

		/*
		 * Initialize sgc->capacity such that even if we mess up the
		 * domains and no possible iteration will get us here, we won't
		 * die on a /0 trap.
		 */
		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
		sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;

		/*
		 * Make sure the first group of this domain contains the
		 * canonical balance cpu. Otherwise the sched_domain iteration
		 * breaks. See update_sg_lb_stats().
		 */
		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
		    group_balance_cpu(sg) == cpu)
			groups = sg;

		/* Append to the circular list, keeping it closed at all times */
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = groups;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}
6216 | |||
6217 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
6218 | { | ||
6219 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6220 | struct sched_domain *child = sd->child; | ||
6221 | |||
6222 | if (child) | ||
6223 | cpu = cpumask_first(sched_domain_span(child)); | ||
6224 | |||
6225 | if (sg) { | ||
6226 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
6227 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
6228 | atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ | ||
6229 | } | ||
6230 | |||
6231 | return cpu; | ||
6232 | } | ||
6233 | |||
/*
 * build_sched_groups will build a circular linked list of the groups
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_capacity to 0.
 *
 * Assumes the sched_domain tree is fully constructed
 */
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL;
	struct sd_data *sdd = sd->private;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered;
	int i;

	get_group(cpu, sdd, &sd->groups);
	atomic_inc(&sd->groups->ref);

	/* Only the first cpu of the span links the full group list */
	if (cpu != cpumask_first(span))
		return 0;

	lockdep_assert_held(&sched_domains_mutex);
	covered = sched_domains_tmpmask;

	cpumask_clear(covered);

	for_each_cpu(i, span) {
		struct sched_group *sg;
		int group, j;

		if (cpumask_test_cpu(i, covered))
			continue;

		group = get_group(i, sdd, &sg);
		/* Non-overlapping groups iterate everything, see group_balance_cpu() */
		cpumask_setall(sched_group_mask(sg));

		/* Collect all cpus belonging to this group into its mask */
		for_each_cpu(j, span) {
			if (get_group(j, sdd, NULL) != group)
				continue;

			cpumask_set_cpu(j, covered);
			cpumask_set_cpu(j, sched_group_cpus(sg));
		}

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;

	return 0;
}
6289 | |||
/*
 * Initialize sched groups cpu_capacity.
 *
 * cpu_capacity indicates the capacity of sched group, which is used while
 * distributing the load between different sched groups in a sched domain.
 * Typically cpu_capacity for all the groups in a sched domain will be same
 * unless there are asymmetries in the topology. If there are asymmetries,
 * group having more cpu_capacity will pickup more load compared to the
 * group having less cpu_capacity.
 */
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
	struct sched_group *sg = sd->groups;

	WARN_ON(!sg);

	do {
		int cpu, max_cpu = -1;

		sg->group_weight = cpumask_weight(sched_group_cpus(sg));

		if (!(sd->flags & SD_ASYM_PACKING))
			goto next;

		/* Pick the most-preferred cpu of the group for ASYM_PACKING */
		for_each_cpu(cpu, sched_group_cpus(sg)) {
			if (max_cpu < 0)
				max_cpu = cpu;
			else if (sched_asym_prefer(cpu, max_cpu))
				max_cpu = cpu;
		}
		sg->asym_prefer_cpu = max_cpu;

next:
		sg = sg->next;
	} while (sg != sd->groups);

	/* sg == sd->groups here; only the group's balance cpu updates capacity */
	if (cpu != group_balance_cpu(sg))
		return;

	update_group_capacity(sd, cpu);
}
6331 | |||
/*
 * Initializers for schedule domains
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */

/* -1: use per-domain attributes; >= 0: boot-time "relax_domain_level=" override */
static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);
6348 | |||
6349 | static void set_domain_attribute(struct sched_domain *sd, | ||
6350 | struct sched_domain_attr *attr) | ||
6351 | { | ||
6352 | int request; | ||
6353 | |||
6354 | if (!attr || attr->relax_domain_level < 0) { | ||
6355 | if (default_relax_domain_level < 0) | ||
6356 | return; | ||
6357 | else | ||
6358 | request = default_relax_domain_level; | ||
6359 | } else | ||
6360 | request = attr->relax_domain_level; | ||
6361 | if (request < sd->level) { | ||
6362 | /* turn off idle balance on this domain */ | ||
6363 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6364 | } else { | ||
6365 | /* turn on idle balance on this domain */ | ||
6366 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6367 | } | ||
6368 | } | ||
6369 | |||
6370 | static void __sdt_free(const struct cpumask *cpu_map); | ||
6371 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
6372 | |||
/*
 * Unwind the allocations made by __visit_domain_allocation_hell(),
 * starting at stage @what and deliberately falling through all
 * earlier stages.
 */
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		/* Only free the root domain if nothing took a reference */
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
	case sa_none:
		break;
	}
}
6388 | |||
/*
 * Allocate everything build_sched_domains() needs: the per-cpu sd_data
 * storage, the per-cpu domain pointer array and a root domain.  Returns
 * the highest stage successfully reached; on partial failure the caller
 * passes that stage to __free_domain_allocs() to unwind.
 */
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
						   const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;
	return sa_rootdomain;
}
6404 | |||
/*
 * NULL the sd_data elements we've used to build the sched_domain and
 * sched_group structure so that the subsequent __free_domain_allocs()
 * will not free the data we're using.
 */
static void claim_allocations(int cpu, struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;

	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
	*per_cpu_ptr(sdd->sd, cpu) = NULL;

	/* A non-zero refcount means sd_init()/get_group() took ownership */
	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
		*per_cpu_ptr(sdd->sds, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
		*per_cpu_ptr(sdd->sg, cpu) = NULL;

	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
6426 | |||
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;			/* nr of unique NUMA distances */
enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;		/* ascending unique distances */
int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;	/* [level][node] -> cpu mask */
static int sched_domains_curr_level;			/* state for sd_numa_mask() */
#endif
6435 | |||
/*
 * SD_flags allowed in topology descriptions.
 *
 * These flags are purely descriptive of the topology and do not prescribe
 * behaviour. Behaviour is artificial and mapped in the below sd_init()
 * function:
 *
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *   SD_SHARE_POWERDOMAIN   - describes shared power domain
 *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
 *
 * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 *
 * sd_init() warns about and strips any other flag a topology table sets.
 */
#define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY |		\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA |			\
	 SD_ASYM_PACKING |		\
	 SD_ASYM_CPUCAPACITY |		\
	 SD_SHARE_POWERDOMAIN)
6461 | |||
/*
 * Build and initialize the sched_domain of @cpu for topology level @tl,
 * linking it to @child, and translate the level's descriptive topology
 * flags into load-balancing behaviour.
 */
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
	const struct cpumask *cpu_map,
	struct sched_domain *child, int cpu)
{
	struct sd_data *sdd = &tl->data;
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	int sd_id, sd_weight, sd_flags = 0;

#ifdef CONFIG_NUMA
	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;
#endif

	sd_weight = cpumask_weight(tl->mask(cpu));

	if (tl->sd_flags)
		sd_flags = (*tl->sd_flags)();
	/* Topology tables may only set TOPOLOGY_SD_FLAGS; strip the rest */
	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
			"wrong sd_flags in topology description\n"))
		sd_flags &= ~TOPOLOGY_SD_FLAGS;

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 32,
		.imbalance_pct		= 125,

		.cache_nice_tries	= 0,
		.busy_idx		= 0,
		.idle_idx		= 0,
		.newidle_idx		= 0,
		.wake_idx		= 0,
		.forkexec_idx		= 0,

		.flags			= 1*SD_LOAD_BALANCE
					| 1*SD_BALANCE_NEWIDLE
					| 1*SD_BALANCE_EXEC
					| 1*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 1*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUCAPACITY
					| 0*SD_SHARE_PKG_RESOURCES
					| 0*SD_SERIALIZE
					| 0*SD_PREFER_SIBLING
					| 0*SD_NUMA
					| sd_flags
					,

		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
		.smt_gain		= 0,
		.max_newidle_lb_cost	= 0,
		.next_decay_max_lb_cost	= jiffies,
		.child			= child,
#ifdef CONFIG_SCHED_DEBUG
		.name			= tl->name,
#endif
	};

	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
	sd_id = cpumask_first(sched_domain_span(sd));

	/*
	 * Convert topological properties into behaviour.
	 */

	if (sd->flags & SD_ASYM_CPUCAPACITY) {
		struct sched_domain *t = sd;

		/* Mixed capacities: balance on wakeup throughout the hierarchy */
		for_each_lower_domain(t)
			t->flags |= SD_BALANCE_WAKE;
	}

	if (sd->flags & SD_SHARE_CPUCAPACITY) {
		sd->flags |= SD_PREFER_SIBLING;
		sd->imbalance_pct = 110;
		sd->smt_gain = 1178; /* ~15% */

	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->imbalance_pct = 117;
		sd->cache_nice_tries = 1;
		sd->busy_idx = 2;

#ifdef CONFIG_NUMA
	} else if (sd->flags & SD_NUMA) {
		sd->cache_nice_tries = 2;
		sd->busy_idx = 3;
		sd->idle_idx = 2;

		sd->flags |= SD_SERIALIZE;
		/* Beyond reclaim distance, exec/fork/wake balancing is too costly */
		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
			sd->flags &= ~(SD_BALANCE_EXEC |
				       SD_BALANCE_FORK |
				       SD_WAKE_AFFINE);
		}

#endif
	} else {
		sd->flags |= SD_PREFER_SIBLING;
		sd->cache_nice_tries = 1;
		sd->busy_idx = 2;
		sd->idle_idx = 1;
	}

	/*
	 * For all levels sharing cache; connect a sched_domain_shared
	 * instance.
	 */
	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
		atomic_inc(&sd->shared->ref);
		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
	}

	sd->private = sdd;

	return sd;
}
6583 | |||
/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },	/* sentinel; see for_each_sd_topology() */
};
6597 | |||
/* Active topology table; architectures may override via set_sched_topology() */
static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;

/* Walk the active topology table up to (not including) the NULL sentinel */
#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)

/*
 * Install an architecture-specific topology table; refused (with a
 * warning) once SMP scheduler initialization has completed.
 */
void set_sched_topology(struct sched_domain_topology_level *tl)
{
	if (WARN_ON_ONCE(sched_smp_initialized))
		return;

	sched_domain_topology = tl;
}
6611 | |||
6612 | #ifdef CONFIG_NUMA | ||
6613 | |||
/*
 * Mask of cpus reachable from @cpu's node within the distance of the
 * NUMA level currently being built (sched_domains_curr_level).
 */
static const struct cpumask *sd_numa_mask(int cpu)
{
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}
6618 | |||
6619 | static void sched_numa_warn(const char *str) | ||
6620 | { | ||
6621 | static int done = false; | ||
6622 | int i,j; | ||
6623 | |||
6624 | if (done) | ||
6625 | return; | ||
6626 | |||
6627 | done = true; | ||
6628 | |||
6629 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6630 | |||
6631 | for (i = 0; i < nr_node_ids; i++) { | ||
6632 | printk(KERN_WARNING " "); | ||
6633 | for (j = 0; j < nr_node_ids; j++) | ||
6634 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6635 | printk(KERN_CONT "\n"); | ||
6636 | } | ||
6637 | printk(KERN_WARNING "\n"); | ||
6638 | } | ||
6639 | |||
6640 | bool find_numa_distance(int distance) | ||
6641 | { | ||
6642 | int i; | ||
6643 | |||
6644 | if (distance == node_distance(0, 0)) | ||
6645 | return true; | ||
6646 | |||
6647 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6648 | if (sched_domains_numa_distance[i] == distance) | ||
6649 | return true; | ||
6650 | } | ||
6651 | |||
6652 | return false; | ||
6653 | } | ||
6654 | |||
/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
 *
 * The difference between a glueless mesh topology and a backplane
 * topology lies in whether communication between not directly
 * connected nodes goes through intermediary nodes (where programs
 * could run), or through backplane controllers. This affects
 * placement of programs.
 *
 * The type of topology can be discerned with the following tests:
 * - If the maximum distance between any nodes is 1 hop, the system
 *   is directly connected.
 * - If for two nodes A and B, located N > 1 hops away from each other,
 *   there is an intermediary node C, which is < N hops away from both
 *   nodes A and B, the system is a glueless mesh.
 */
static void init_numa_topology_type(void)
{
	int a, b, c, n;

	n = sched_max_numa_distance;

	/* A single distance level means everything is directly connected */
	if (sched_domains_numa_levels <= 1) {
		sched_numa_topology_type = NUMA_DIRECT;
		return;
	}

	for_each_online_node(a) {
		for_each_online_node(b) {
			/* Find two nodes furthest removed from each other. */
			if (node_distance(a, b) < n)
				continue;

			/* Is there an intermediary node between a and b? */
			for_each_online_node(c) {
				if (node_distance(a, c) < n &&
				    node_distance(b, c) < n) {
					sched_numa_topology_type =
							NUMA_GLUELESS_MESH;
					return;
				}
			}

			sched_numa_topology_type = NUMA_BACKPLANE;
			return;
		}
	}
}
6706 | |||
6707 | static void sched_init_numa(void) | ||
6708 | { | ||
6709 | int next_distance, curr_distance = node_distance(0, 0); | ||
6710 | struct sched_domain_topology_level *tl; | ||
6711 | int level = 0; | ||
6712 | int i, j, k; | ||
6713 | |||
6714 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6715 | if (!sched_domains_numa_distance) | ||
6716 | return; | ||
6717 | |||
6718 | /* | ||
6719 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6720 | * unique distances in the node_distance() table. | ||
6721 | * | ||
6722 | * Assumes node_distance(0,j) includes all distances in | ||
6723 | * node_distance(i,j) in order to avoid cubic time. | ||
6724 | */ | ||
6725 | next_distance = curr_distance; | ||
6726 | for (i = 0; i < nr_node_ids; i++) { | ||
6727 | for (j = 0; j < nr_node_ids; j++) { | ||
6728 | for (k = 0; k < nr_node_ids; k++) { | ||
6729 | int distance = node_distance(i, k); | ||
6730 | |||
6731 | if (distance > curr_distance && | ||
6732 | (distance < next_distance || | ||
6733 | next_distance == curr_distance)) | ||
6734 | next_distance = distance; | ||
6735 | |||
6736 | /* | ||
6737 | * While not a strong assumption it would be nice to know | ||
6738 | * about cases where if node A is connected to B, B is not | ||
6739 | * equally connected to A. | ||
6740 | */ | ||
6741 | if (sched_debug() && node_distance(k, i) != distance) | ||
6742 | sched_numa_warn("Node-distance not symmetric"); | ||
6743 | |||
6744 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6745 | sched_numa_warn("Node-0 not representative"); | ||
6746 | } | ||
6747 | if (next_distance != curr_distance) { | ||
6748 | sched_domains_numa_distance[level++] = next_distance; | ||
6749 | sched_domains_numa_levels = level; | ||
6750 | curr_distance = next_distance; | ||
6751 | } else break; | ||
6752 | } | ||
6753 | |||
6754 | /* | ||
6755 | * In case of sched_debug() we verify the above assumption. | ||
6756 | */ | ||
6757 | if (!sched_debug()) | ||
6758 | break; | ||
6759 | } | ||
6760 | |||
6761 | if (!level) | ||
6762 | return; | ||
6763 | |||
6764 | /* | ||
6765 | * 'level' contains the number of unique distances, excluding the | ||
6766 | * identity distance node_distance(i,i). | ||
6767 | * | ||
6768 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6769 | * numbers. | ||
6770 | */ | ||
6771 | |||
6772 | /* | ||
6773 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
6774 | * If it fails to allocate memory for array sched_domains_numa_masks[][], | ||
6775 | * the array will contain less then 'level' members. This could be | ||
6776 | * dangerous when we use it to iterate array sched_domains_numa_masks[][] | ||
6777 | * in other functions. | ||
6778 | * | ||
6779 | * We reset it to 'level' at the end of this function. | ||
6780 | */ | ||
6781 | sched_domains_numa_levels = 0; | ||
6782 | |||
6783 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6784 | if (!sched_domains_numa_masks) | ||
6785 | return; | ||
6786 | |||
6787 | /* | ||
6788 | * Now for each level, construct a mask per node which contains all | ||
6789 | * cpus of nodes that are that many hops away from us. | ||
6790 | */ | ||
6791 | for (i = 0; i < level; i++) { | ||
6792 | sched_domains_numa_masks[i] = | ||
6793 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6794 | if (!sched_domains_numa_masks[i]) | ||
6795 | return; | ||
6796 | |||
6797 | for (j = 0; j < nr_node_ids; j++) { | ||
6798 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6799 | if (!mask) | ||
6800 | return; | ||
6801 | |||
6802 | sched_domains_numa_masks[i][j] = mask; | ||
6803 | |||
6804 | for_each_node(k) { | ||
6805 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6806 | continue; | ||
6807 | |||
6808 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6809 | } | ||
6810 | } | ||
6811 | } | ||
6812 | |||
6813 | /* Compute default topology size */ | ||
6814 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
6815 | |||
6816 | tl = kzalloc((i + level + 1) * | ||
6817 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6818 | if (!tl) | ||
6819 | return; | ||
6820 | |||
6821 | /* | ||
6822 | * Copy the default topology bits.. | ||
6823 | */ | ||
6824 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
6825 | tl[i] = sched_domain_topology[i]; | ||
6826 | |||
6827 | /* | ||
6828 | * .. and append 'j' levels of NUMA goodness. | ||
6829 | */ | ||
6830 | for (j = 0; j < level; i++, j++) { | ||
6831 | tl[i] = (struct sched_domain_topology_level){ | ||
6832 | .mask = sd_numa_mask, | ||
6833 | .sd_flags = cpu_numa_flags, | ||
6834 | .flags = SDTL_OVERLAP, | ||
6835 | .numa_level = j, | ||
6836 | SD_INIT_NAME(NUMA) | ||
6837 | }; | ||
6838 | } | ||
6839 | |||
6840 | sched_domain_topology = tl; | ||
6841 | |||
6842 | sched_domains_numa_levels = level; | ||
6843 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
6844 | |||
6845 | init_numa_topology_type(); | ||
6846 | } | ||
6847 | |||
6848 | static void sched_domains_numa_masks_set(unsigned int cpu) | ||
6849 | { | ||
6850 | int node = cpu_to_node(cpu); | ||
6851 | int i, j; | ||
6852 | |||
6853 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6854 | for (j = 0; j < nr_node_ids; j++) { | ||
6855 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
6856 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6857 | } | ||
6858 | } | ||
6859 | } | ||
6860 | |||
6861 | static void sched_domains_numa_masks_clear(unsigned int cpu) | ||
6862 | { | ||
6863 | int i, j; | ||
6864 | |||
6865 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6866 | for (j = 0; j < nr_node_ids; j++) | ||
6867 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6868 | } | ||
6869 | } | ||
6870 | |||
6871 | #else | ||
6872 | static inline void sched_init_numa(void) { } | ||
6873 | static void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
6874 | static void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
6875 | #endif /* CONFIG_NUMA */ | ||
6876 | |||
6877 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
6878 | { | ||
6879 | struct sched_domain_topology_level *tl; | ||
6880 | int j; | ||
6881 | |||
6882 | for_each_sd_topology(tl) { | ||
6883 | struct sd_data *sdd = &tl->data; | ||
6884 | |||
6885 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
6886 | if (!sdd->sd) | ||
6887 | return -ENOMEM; | ||
6888 | |||
6889 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
6890 | if (!sdd->sds) | ||
6891 | return -ENOMEM; | ||
6892 | |||
6893 | sdd->sg = alloc_percpu(struct sched_group *); | ||
6894 | if (!sdd->sg) | ||
6895 | return -ENOMEM; | ||
6896 | |||
6897 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
6898 | if (!sdd->sgc) | ||
6899 | return -ENOMEM; | ||
6900 | |||
6901 | for_each_cpu(j, cpu_map) { | ||
6902 | struct sched_domain *sd; | ||
6903 | struct sched_domain_shared *sds; | ||
6904 | struct sched_group *sg; | ||
6905 | struct sched_group_capacity *sgc; | ||
6906 | |||
6907 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
6908 | GFP_KERNEL, cpu_to_node(j)); | ||
6909 | if (!sd) | ||
6910 | return -ENOMEM; | ||
6911 | |||
6912 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
6913 | |||
6914 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
6915 | GFP_KERNEL, cpu_to_node(j)); | ||
6916 | if (!sds) | ||
6917 | return -ENOMEM; | ||
6918 | |||
6919 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
6920 | |||
6921 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6922 | GFP_KERNEL, cpu_to_node(j)); | ||
6923 | if (!sg) | ||
6924 | return -ENOMEM; | ||
6925 | |||
6926 | sg->next = sg; | ||
6927 | |||
6928 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
6929 | |||
6930 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
6931 | GFP_KERNEL, cpu_to_node(j)); | ||
6932 | if (!sgc) | ||
6933 | return -ENOMEM; | ||
6934 | |||
6935 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
6936 | } | ||
6937 | } | ||
6938 | |||
6939 | return 0; | ||
6940 | } | ||
6941 | |||
6942 | static void __sdt_free(const struct cpumask *cpu_map) | ||
6943 | { | ||
6944 | struct sched_domain_topology_level *tl; | ||
6945 | int j; | ||
6946 | |||
6947 | for_each_sd_topology(tl) { | ||
6948 | struct sd_data *sdd = &tl->data; | ||
6949 | |||
6950 | for_each_cpu(j, cpu_map) { | ||
6951 | struct sched_domain *sd; | ||
6952 | |||
6953 | if (sdd->sd) { | ||
6954 | sd = *per_cpu_ptr(sdd->sd, j); | ||
6955 | if (sd && (sd->flags & SD_OVERLAP)) | ||
6956 | free_sched_groups(sd->groups, 0); | ||
6957 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
6958 | } | ||
6959 | |||
6960 | if (sdd->sds) | ||
6961 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
6962 | if (sdd->sg) | ||
6963 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
6964 | if (sdd->sgc) | ||
6965 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
6966 | } | ||
6967 | free_percpu(sdd->sd); | ||
6968 | sdd->sd = NULL; | ||
6969 | free_percpu(sdd->sds); | ||
6970 | sdd->sds = NULL; | ||
6971 | free_percpu(sdd->sg); | ||
6972 | sdd->sg = NULL; | ||
6973 | free_percpu(sdd->sgc); | ||
6974 | sdd->sgc = NULL; | ||
6975 | } | ||
6976 | } | ||
6977 | |||
6978 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
6979 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
6980 | struct sched_domain *child, int cpu) | ||
6981 | { | ||
6982 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
6983 | |||
6984 | if (child) { | ||
6985 | sd->level = child->level + 1; | ||
6986 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
6987 | child->parent = sd; | ||
6988 | |||
6989 | if (!cpumask_subset(sched_domain_span(child), | ||
6990 | sched_domain_span(sd))) { | ||
6991 | pr_err("BUG: arch topology borken\n"); | ||
6992 | #ifdef CONFIG_SCHED_DEBUG | ||
6993 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
6994 | child->name, sd->name); | ||
6995 | #endif | ||
6996 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
6997 | cpumask_or(sched_domain_span(sd), | ||
6998 | sched_domain_span(sd), | ||
6999 | sched_domain_span(child)); | ||
7000 | } | ||
7001 | |||
7002 | } | ||
7003 | set_domain_attribute(sd, attr); | ||
7004 | |||
7005 | return sd; | ||
7006 | } | ||
7007 | |||
7008 | /* | 5682 | /* |
7009 | * Build sched domains for a given set of cpus and attach the sched domains | 5683 | * used to mark begin/end of suspend/resume: |
7010 | * to the individual cpus | ||
7011 | */ | 5684 | */ |
7012 | static int build_sched_domains(const struct cpumask *cpu_map, | 5685 | static int num_cpus_frozen; |
7013 | struct sched_domain_attr *attr) | ||
7014 | { | ||
7015 | enum s_alloc alloc_state; | ||
7016 | struct sched_domain *sd; | ||
7017 | struct s_data d; | ||
7018 | struct rq *rq = NULL; | ||
7019 | int i, ret = -ENOMEM; | ||
7020 | |||
7021 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
7022 | if (alloc_state != sa_rootdomain) | ||
7023 | goto error; | ||
7024 | |||
7025 | /* Set up domains for cpus specified by the cpu_map. */ | ||
7026 | for_each_cpu(i, cpu_map) { | ||
7027 | struct sched_domain_topology_level *tl; | ||
7028 | |||
7029 | sd = NULL; | ||
7030 | for_each_sd_topology(tl) { | ||
7031 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
7032 | if (tl == sched_domain_topology) | ||
7033 | *per_cpu_ptr(d.sd, i) = sd; | ||
7034 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
7035 | sd->flags |= SD_OVERLAP; | ||
7036 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
7037 | break; | ||
7038 | } | ||
7039 | } | ||
7040 | |||
7041 | /* Build the groups for the domains */ | ||
7042 | for_each_cpu(i, cpu_map) { | ||
7043 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7044 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
7045 | if (sd->flags & SD_OVERLAP) { | ||
7046 | if (build_overlap_sched_groups(sd, i)) | ||
7047 | goto error; | ||
7048 | } else { | ||
7049 | if (build_sched_groups(sd, i)) | ||
7050 | goto error; | ||
7051 | } | ||
7052 | } | ||
7053 | } | ||
7054 | |||
7055 | /* Calculate CPU capacity for physical packages and nodes */ | ||
7056 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
7057 | if (!cpumask_test_cpu(i, cpu_map)) | ||
7058 | continue; | ||
7059 | |||
7060 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7061 | claim_allocations(i, sd); | ||
7062 | init_sched_groups_capacity(i, sd); | ||
7063 | } | ||
7064 | } | ||
7065 | |||
7066 | /* Attach the domains */ | ||
7067 | rcu_read_lock(); | ||
7068 | for_each_cpu(i, cpu_map) { | ||
7069 | rq = cpu_rq(i); | ||
7070 | sd = *per_cpu_ptr(d.sd, i); | ||
7071 | |||
7072 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
7073 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
7074 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
7075 | |||
7076 | cpu_attach_domain(sd, d.rd, i); | ||
7077 | } | ||
7078 | rcu_read_unlock(); | ||
7079 | |||
7080 | if (rq && sched_debug_enabled) { | ||
7081 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
7082 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
7083 | } | ||
7084 | |||
7085 | ret = 0; | ||
7086 | error: | ||
7087 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
7088 | return ret; | ||
7089 | } | ||
7090 | |||
7091 | static cpumask_var_t *doms_cur; /* current sched domains */ | ||
7092 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | ||
7093 | static struct sched_domain_attr *dattr_cur; | ||
7094 | /* attribues of custom domains in 'doms_cur' */ | ||
7095 | |||
7096 | /* | ||
7097 | * Special case: If a kmalloc of a doms_cur partition (array of | ||
7098 | * cpumask) fails, then fallback to a single sched domain, | ||
7099 | * as determined by the single cpumask fallback_doms. | ||
7100 | */ | ||
7101 | static cpumask_var_t fallback_doms; | ||
7102 | |||
7103 | /* | ||
7104 | * arch_update_cpu_topology lets virtualized architectures update the | ||
7105 | * cpu core maps. It is supposed to return 1 if the topology changed | ||
7106 | * or 0 if it stayed the same. | ||
7107 | */ | ||
7108 | int __weak arch_update_cpu_topology(void) | ||
7109 | { | ||
7110 | return 0; | ||
7111 | } | ||
7112 | |||
7113 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
7114 | { | ||
7115 | int i; | ||
7116 | cpumask_var_t *doms; | ||
7117 | |||
7118 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
7119 | if (!doms) | ||
7120 | return NULL; | ||
7121 | for (i = 0; i < ndoms; i++) { | ||
7122 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
7123 | free_sched_domains(doms, i); | ||
7124 | return NULL; | ||
7125 | } | ||
7126 | } | ||
7127 | return doms; | ||
7128 | } | ||
7129 | |||
7130 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
7131 | { | ||
7132 | unsigned int i; | ||
7133 | for (i = 0; i < ndoms; i++) | ||
7134 | free_cpumask_var(doms[i]); | ||
7135 | kfree(doms); | ||
7136 | } | ||
7137 | |||
7138 | /* | ||
7139 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
7140 | * For now this just excludes isolated cpus, but could be used to | ||
7141 | * exclude other special cases in the future. | ||
7142 | */ | ||
7143 | static int init_sched_domains(const struct cpumask *cpu_map) | ||
7144 | { | ||
7145 | int err; | ||
7146 | |||
7147 | arch_update_cpu_topology(); | ||
7148 | ndoms_cur = 1; | ||
7149 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
7150 | if (!doms_cur) | ||
7151 | doms_cur = &fallback_doms; | ||
7152 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
7153 | err = build_sched_domains(doms_cur[0], NULL); | ||
7154 | register_sched_domain_sysctl(); | ||
7155 | |||
7156 | return err; | ||
7157 | } | ||
7158 | |||
7159 | /* | ||
7160 | * Detach sched domains from a group of cpus specified in cpu_map | ||
7161 | * These cpus will now be attached to the NULL domain | ||
7162 | */ | ||
7163 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
7164 | { | ||
7165 | int i; | ||
7166 | |||
7167 | rcu_read_lock(); | ||
7168 | for_each_cpu(i, cpu_map) | ||
7169 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
7170 | rcu_read_unlock(); | ||
7171 | } | ||
7172 | |||
7173 | /* handle null as "default" */ | ||
7174 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
7175 | struct sched_domain_attr *new, int idx_new) | ||
7176 | { | ||
7177 | struct sched_domain_attr tmp; | ||
7178 | |||
7179 | /* fast path */ | ||
7180 | if (!new && !cur) | ||
7181 | return 1; | ||
7182 | |||
7183 | tmp = SD_ATTR_INIT; | ||
7184 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
7185 | new ? (new + idx_new) : &tmp, | ||
7186 | sizeof(struct sched_domain_attr)); | ||
7187 | } | ||
7188 | |||
7189 | /* | ||
7190 | * Partition sched domains as specified by the 'ndoms_new' | ||
7191 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
7192 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
7193 | * It destroys each deleted domain and builds each new domain. | ||
7194 | * | ||
7195 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
7196 | * The masks don't intersect (don't overlap.) We should setup one | ||
7197 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
7198 | * not be load balanced. If the same cpumask appears both in the | ||
7199 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
7200 | * it as it is. | ||
7201 | * | ||
7202 | * The passed in 'doms_new' should be allocated using | ||
7203 | * alloc_sched_domains. This routine takes ownership of it and will | ||
7204 | * free_sched_domains it when done with it. If the caller failed the | ||
7205 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
7206 | * and partition_sched_domains() will fallback to the single partition | ||
7207 | * 'fallback_doms', it also forces the domains to be rebuilt. | ||
7208 | * | ||
7209 | * If doms_new == NULL it will be replaced with cpu_online_mask. | ||
7210 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
7211 | * and it will not create the default domain. | ||
7212 | * | ||
7213 | * Call with hotplug lock held | ||
7214 | */ | ||
7215 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
7216 | struct sched_domain_attr *dattr_new) | ||
7217 | { | ||
7218 | int i, j, n; | ||
7219 | int new_topology; | ||
7220 | |||
7221 | mutex_lock(&sched_domains_mutex); | ||
7222 | |||
7223 | /* always unregister in case we don't destroy any domains */ | ||
7224 | unregister_sched_domain_sysctl(); | ||
7225 | |||
7226 | /* Let architecture update cpu core mappings. */ | ||
7227 | new_topology = arch_update_cpu_topology(); | ||
7228 | |||
7229 | n = doms_new ? ndoms_new : 0; | ||
7230 | |||
7231 | /* Destroy deleted domains */ | ||
7232 | for (i = 0; i < ndoms_cur; i++) { | ||
7233 | for (j = 0; j < n && !new_topology; j++) { | ||
7234 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
7235 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
7236 | goto match1; | ||
7237 | } | ||
7238 | /* no match - a current sched domain not in new doms_new[] */ | ||
7239 | detach_destroy_domains(doms_cur[i]); | ||
7240 | match1: | ||
7241 | ; | ||
7242 | } | ||
7243 | |||
7244 | n = ndoms_cur; | ||
7245 | if (doms_new == NULL) { | ||
7246 | n = 0; | ||
7247 | doms_new = &fallback_doms; | ||
7248 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
7249 | WARN_ON_ONCE(dattr_new); | ||
7250 | } | ||
7251 | |||
7252 | /* Build new domains */ | ||
7253 | for (i = 0; i < ndoms_new; i++) { | ||
7254 | for (j = 0; j < n && !new_topology; j++) { | ||
7255 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
7256 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
7257 | goto match2; | ||
7258 | } | ||
7259 | /* no match - add a new doms_new */ | ||
7260 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
7261 | match2: | ||
7262 | ; | ||
7263 | } | ||
7264 | |||
7265 | /* Remember the new sched domains */ | ||
7266 | if (doms_cur != &fallback_doms) | ||
7267 | free_sched_domains(doms_cur, ndoms_cur); | ||
7268 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
7269 | doms_cur = doms_new; | ||
7270 | dattr_cur = dattr_new; | ||
7271 | ndoms_cur = ndoms_new; | ||
7272 | |||
7273 | register_sched_domain_sysctl(); | ||
7274 | |||
7275 | mutex_unlock(&sched_domains_mutex); | ||
7276 | } | ||
7277 | |||
7278 | static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ | ||
7279 | 5686 | ||
7280 | /* | 5687 | /* |
7281 | * Update cpusets according to cpu_active mask. If cpusets are | 5688 | * Update cpusets according to cpu_active mask. If cpusets are |
@@ -7352,7 +5759,7 @@ int sched_cpu_activate(unsigned int cpu) | |||
7352 | * Put the rq online, if not already. This happens: | 5759 | * Put the rq online, if not already. This happens: |
7353 | * | 5760 | * |
7354 | * 1) In the early boot process, because we build the real domains | 5761 | * 1) In the early boot process, because we build the real domains |
7355 | * after all cpus have been brought up. | 5762 | * after all CPUs have been brought up. |
7356 | * | 5763 | * |
7357 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the | 5764 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the |
7358 | * domains. | 5765 | * domains. |
@@ -7467,7 +5874,7 @@ void __init sched_init_smp(void) | |||
7467 | 5874 | ||
7468 | /* | 5875 | /* |
7469 | * There's no userspace yet to cause hotplug operations; hence all the | 5876 | * There's no userspace yet to cause hotplug operations; hence all the |
7470 | * cpu masks are stable and all blatant races in the below code cannot | 5877 | * CPU masks are stable and all blatant races in the below code cannot |
7471 | * happen. | 5878 | * happen. |
7472 | */ | 5879 | */ |
7473 | mutex_lock(&sched_domains_mutex); | 5880 | mutex_lock(&sched_domains_mutex); |
@@ -7487,6 +5894,7 @@ void __init sched_init_smp(void) | |||
7487 | init_sched_dl_class(); | 5894 | init_sched_dl_class(); |
7488 | 5895 | ||
7489 | sched_init_smt(); | 5896 | sched_init_smt(); |
5897 | sched_clock_init_late(); | ||
7490 | 5898 | ||
7491 | sched_smp_initialized = true; | 5899 | sched_smp_initialized = true; |
7492 | } | 5900 | } |
@@ -7502,6 +5910,7 @@ early_initcall(migration_init); | |||
7502 | void __init sched_init_smp(void) | 5910 | void __init sched_init_smp(void) |
7503 | { | 5911 | { |
7504 | sched_init_granularity(); | 5912 | sched_init_granularity(); |
5913 | sched_clock_init_late(); | ||
7505 | } | 5914 | } |
7506 | #endif /* CONFIG_SMP */ | 5915 | #endif /* CONFIG_SMP */ |
7507 | 5916 | ||
@@ -7545,6 +5954,8 @@ void __init sched_init(void) | |||
7545 | int i, j; | 5954 | int i, j; |
7546 | unsigned long alloc_size = 0, ptr; | 5955 | unsigned long alloc_size = 0, ptr; |
7547 | 5956 | ||
5957 | sched_clock_init(); | ||
5958 | |||
7548 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | 5959 | for (i = 0; i < WAIT_TABLE_SIZE; i++) |
7549 | init_waitqueue_head(bit_wait_table + i); | 5960 | init_waitqueue_head(bit_wait_table + i); |
7550 | 5961 | ||
@@ -7583,10 +5994,8 @@ void __init sched_init(void) | |||
7583 | } | 5994 | } |
7584 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 5995 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
7585 | 5996 | ||
7586 | init_rt_bandwidth(&def_rt_bandwidth, | 5997 | init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); |
7587 | global_rt_period(), global_rt_runtime()); | 5998 | init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); |
7588 | init_dl_bandwidth(&def_dl_bandwidth, | ||
7589 | global_rt_period(), global_rt_runtime()); | ||
7590 | 5999 | ||
7591 | #ifdef CONFIG_SMP | 6000 | #ifdef CONFIG_SMP |
7592 | init_defrootdomain(); | 6001 | init_defrootdomain(); |
@@ -7622,18 +6031,18 @@ void __init sched_init(void) | |||
7622 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6031 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7623 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | 6032 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; |
7624 | /* | 6033 | /* |
7625 | * How much cpu bandwidth does root_task_group get? | 6034 | * How much CPU bandwidth does root_task_group get? |
7626 | * | 6035 | * |
7627 | * In case of task-groups formed thr' the cgroup filesystem, it | 6036 | * In case of task-groups formed thr' the cgroup filesystem, it |
7628 | * gets 100% of the cpu resources in the system. This overall | 6037 | * gets 100% of the CPU resources in the system. This overall |
7629 | * system cpu resource is divided among the tasks of | 6038 | * system CPU resource is divided among the tasks of |
7630 | * root_task_group and its child task-groups in a fair manner, | 6039 | * root_task_group and its child task-groups in a fair manner, |
7631 | * based on each entity's (task or task-group's) weight | 6040 | * based on each entity's (task or task-group's) weight |
7632 | * (se->load.weight). | 6041 | * (se->load.weight). |
7633 | * | 6042 | * |
7634 | * In other words, if root_task_group has 10 tasks of weight | 6043 | * In other words, if root_task_group has 10 tasks of weight |
7635 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 6044 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7636 | * then A0's share of the cpu resource is: | 6045 | * then A0's share of the CPU resource is: |
7637 | * | 6046 | * |
7638 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 6047 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7639 | * | 6048 | * |
@@ -7742,10 +6151,14 @@ EXPORT_SYMBOL(__might_sleep); | |||
7742 | 6151 | ||
7743 | void ___might_sleep(const char *file, int line, int preempt_offset) | 6152 | void ___might_sleep(const char *file, int line, int preempt_offset) |
7744 | { | 6153 | { |
7745 | static unsigned long prev_jiffy; /* ratelimiting */ | 6154 | /* Ratelimiting timestamp: */ |
6155 | static unsigned long prev_jiffy; | ||
6156 | |||
7746 | unsigned long preempt_disable_ip; | 6157 | unsigned long preempt_disable_ip; |
7747 | 6158 | ||
7748 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 6159 | /* WARN_ON_ONCE() by default, no rate limit required: */ |
6160 | rcu_sleep_check(); | ||
6161 | |||
7749 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 6162 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
7750 | !is_idle_task(current)) || | 6163 | !is_idle_task(current)) || |
7751 | system_state != SYSTEM_RUNNING || oops_in_progress) | 6164 | system_state != SYSTEM_RUNNING || oops_in_progress) |
@@ -7754,7 +6167,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7754 | return; | 6167 | return; |
7755 | prev_jiffy = jiffies; | 6168 | prev_jiffy = jiffies; |
7756 | 6169 | ||
7757 | /* Save this before calling printk(), since that will clobber it */ | 6170 | /* Save this before calling printk(), since that will clobber it: */ |
7758 | preempt_disable_ip = get_preempt_disable_ip(current); | 6171 | preempt_disable_ip = get_preempt_disable_ip(current); |
7759 | 6172 | ||
7760 | printk(KERN_ERR | 6173 | printk(KERN_ERR |
@@ -7833,7 +6246,7 @@ void normalize_rt_tasks(void) | |||
7833 | */ | 6246 | */ |
7834 | 6247 | ||
7835 | /** | 6248 | /** |
7836 | * curr_task - return the current task for a given cpu. | 6249 | * curr_task - return the current task for a given CPU. |
7837 | * @cpu: the processor in question. | 6250 | * @cpu: the processor in question. |
7838 | * | 6251 | * |
7839 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6252 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
@@ -7849,13 +6262,13 @@ struct task_struct *curr_task(int cpu) | |||
7849 | 6262 | ||
7850 | #ifdef CONFIG_IA64 | 6263 | #ifdef CONFIG_IA64 |
7851 | /** | 6264 | /** |
7852 | * set_curr_task - set the current task for a given cpu. | 6265 | * set_curr_task - set the current task for a given CPU. |
7853 | * @cpu: the processor in question. | 6266 | * @cpu: the processor in question. |
7854 | * @p: the task pointer to set. | 6267 | * @p: the task pointer to set. |
7855 | * | 6268 | * |
7856 | * Description: This function must only be used when non-maskable interrupts | 6269 | * Description: This function must only be used when non-maskable interrupts |
7857 | * are serviced on a separate stack. It allows the architecture to switch the | 6270 | * are serviced on a separate stack. It allows the architecture to switch the |
7858 | * notion of the current task on a cpu in a non-blocking manner. This function | 6271 | * notion of the current task on a CPU in a non-blocking manner. This function |
7859 | * must be called with all CPU's synchronized, and interrupts disabled, the | 6272 | * must be called with all CPU's synchronized, and interrupts disabled, the |
7860 | * and caller must save the original value of the current task (see | 6273 | * and caller must save the original value of the current task (see |
7861 | * curr_task() above) and restore that value before reenabling interrupts and | 6274 | * curr_task() above) and restore that value before reenabling interrupts and |
@@ -7911,7 +6324,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7911 | spin_lock_irqsave(&task_group_lock, flags); | 6324 | spin_lock_irqsave(&task_group_lock, flags); |
7912 | list_add_rcu(&tg->list, &task_groups); | 6325 | list_add_rcu(&tg->list, &task_groups); |
7913 | 6326 | ||
7914 | WARN_ON(!parent); /* root should already exist */ | 6327 | /* Root should already exist: */ |
6328 | WARN_ON(!parent); | ||
7915 | 6329 | ||
7916 | tg->parent = parent; | 6330 | tg->parent = parent; |
7917 | INIT_LIST_HEAD(&tg->children); | 6331 | INIT_LIST_HEAD(&tg->children); |
@@ -7924,13 +6338,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7924 | /* rcu callback to free various structures associated with a task group */ | 6338 | /* rcu callback to free various structures associated with a task group */ |
7925 | static void sched_free_group_rcu(struct rcu_head *rhp) | 6339 | static void sched_free_group_rcu(struct rcu_head *rhp) |
7926 | { | 6340 | { |
7927 | /* now it should be safe to free those cfs_rqs */ | 6341 | /* Now it should be safe to free those cfs_rqs: */ |
7928 | sched_free_group(container_of(rhp, struct task_group, rcu)); | 6342 | sched_free_group(container_of(rhp, struct task_group, rcu)); |
7929 | } | 6343 | } |
7930 | 6344 | ||
7931 | void sched_destroy_group(struct task_group *tg) | 6345 | void sched_destroy_group(struct task_group *tg) |
7932 | { | 6346 | { |
7933 | /* wait for possible concurrent references to cfs_rqs complete */ | 6347 | /* Wait for possible concurrent references to cfs_rqs complete: */ |
7934 | call_rcu(&tg->rcu, sched_free_group_rcu); | 6348 | call_rcu(&tg->rcu, sched_free_group_rcu); |
7935 | } | 6349 | } |
7936 | 6350 | ||
@@ -7938,7 +6352,7 @@ void sched_offline_group(struct task_group *tg) | |||
7938 | { | 6352 | { |
7939 | unsigned long flags; | 6353 | unsigned long flags; |
7940 | 6354 | ||
7941 | /* end participation in shares distribution */ | 6355 | /* End participation in shares distribution: */ |
7942 | unregister_fair_sched_group(tg); | 6356 | unregister_fair_sched_group(tg); |
7943 | 6357 | ||
7944 | spin_lock_irqsave(&task_group_lock, flags); | 6358 | spin_lock_irqsave(&task_group_lock, flags); |
@@ -7983,20 +6397,21 @@ void sched_move_task(struct task_struct *tsk) | |||
7983 | struct rq *rq; | 6397 | struct rq *rq; |
7984 | 6398 | ||
7985 | rq = task_rq_lock(tsk, &rf); | 6399 | rq = task_rq_lock(tsk, &rf); |
6400 | update_rq_clock(rq); | ||
7986 | 6401 | ||
7987 | running = task_current(rq, tsk); | 6402 | running = task_current(rq, tsk); |
7988 | queued = task_on_rq_queued(tsk); | 6403 | queued = task_on_rq_queued(tsk); |
7989 | 6404 | ||
7990 | if (queued) | 6405 | if (queued) |
7991 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | 6406 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); |
7992 | if (unlikely(running)) | 6407 | if (running) |
7993 | put_prev_task(rq, tsk); | 6408 | put_prev_task(rq, tsk); |
7994 | 6409 | ||
7995 | sched_change_group(tsk, TASK_MOVE_GROUP); | 6410 | sched_change_group(tsk, TASK_MOVE_GROUP); |
7996 | 6411 | ||
7997 | if (queued) | 6412 | if (queued) |
7998 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 6413 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
7999 | if (unlikely(running)) | 6414 | if (running) |
8000 | set_curr_task(rq, tsk); | 6415 | set_curr_task(rq, tsk); |
8001 | 6416 | ||
8002 | task_rq_unlock(rq, tsk, &rf); | 6417 | task_rq_unlock(rq, tsk, &rf); |
@@ -8366,11 +6781,14 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
8366 | 6781 | ||
8367 | mutex_lock(&mutex); | 6782 | mutex_lock(&mutex); |
8368 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 6783 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
8369 | /* make sure that internally we keep jiffies */ | 6784 | /* |
8370 | /* also, writing zero resets timeslice to default */ | 6785 | * Make sure that internally we keep jiffies. |
6786 | * Also, writing zero resets the timeslice to default: | ||
6787 | */ | ||
8371 | if (!ret && write) { | 6788 | if (!ret && write) { |
8372 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 6789 | sched_rr_timeslice = |
8373 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 6790 | sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : |
6791 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | ||
8374 | } | 6792 | } |
8375 | mutex_unlock(&mutex); | 6793 | mutex_unlock(&mutex); |
8376 | return ret; | 6794 | return ret; |
@@ -8431,6 +6849,7 @@ static void cpu_cgroup_fork(struct task_struct *task) | |||
8431 | 6849 | ||
8432 | rq = task_rq_lock(task, &rf); | 6850 | rq = task_rq_lock(task, &rf); |
8433 | 6851 | ||
6852 | update_rq_clock(rq); | ||
8434 | sched_change_group(task, TASK_SET_GROUP); | 6853 | sched_change_group(task, TASK_SET_GROUP); |
8435 | 6854 | ||
8436 | task_rq_unlock(rq, task, &rf); | 6855 | task_rq_unlock(rq, task, &rf); |
@@ -8550,9 +6969,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
8550 | cfs_b->quota = quota; | 6969 | cfs_b->quota = quota; |
8551 | 6970 | ||
8552 | __refill_cfs_bandwidth_runtime(cfs_b); | 6971 | __refill_cfs_bandwidth_runtime(cfs_b); |
8553 | /* restart the period timer (if active) to handle new period expiry */ | 6972 | |
6973 | /* Restart the period timer (if active) to handle new period expiry: */ | ||
8554 | if (runtime_enabled) | 6974 | if (runtime_enabled) |
8555 | start_cfs_bandwidth(cfs_b); | 6975 | start_cfs_bandwidth(cfs_b); |
6976 | |||
8556 | raw_spin_unlock_irq(&cfs_b->lock); | 6977 | raw_spin_unlock_irq(&cfs_b->lock); |
8557 | 6978 | ||
8558 | for_each_online_cpu(i) { | 6979 | for_each_online_cpu(i) { |
@@ -8690,8 +7111,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
8690 | parent_quota = parent_b->hierarchical_quota; | 7111 | parent_quota = parent_b->hierarchical_quota; |
8691 | 7112 | ||
8692 | /* | 7113 | /* |
8693 | * ensure max(child_quota) <= parent_quota, inherit when no | 7114 | * Ensure max(child_quota) <= parent_quota, inherit when no |
8694 | * limit is set | 7115 | * limit is set: |
8695 | */ | 7116 | */ |
8696 | if (quota == RUNTIME_INF) | 7117 | if (quota == RUNTIME_INF) |
8697 | quota = parent_quota; | 7118 | quota = parent_quota; |
@@ -8800,7 +7221,7 @@ static struct cftype cpu_files[] = { | |||
8800 | .write_u64 = cpu_rt_period_write_uint, | 7221 | .write_u64 = cpu_rt_period_write_uint, |
8801 | }, | 7222 | }, |
8802 | #endif | 7223 | #endif |
8803 | { } /* terminate */ | 7224 | { } /* Terminate */ |
8804 | }; | 7225 | }; |
8805 | 7226 | ||
8806 | struct cgroup_subsys cpu_cgrp_subsys = { | 7227 | struct cgroup_subsys cpu_cgrp_subsys = { |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9add206b5608..f95ab29a45d0 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) | |||
297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { | 297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { |
298 | seq_printf(sf, "%s %lld\n", | 298 | seq_printf(sf, "%s %lld\n", |
299 | cpuacct_stat_desc[stat], | 299 | cpuacct_stat_desc[stat], |
300 | (long long)cputime64_to_clock_t(val[stat])); | 300 | (long long)nsec_to_clock_t(val[stat])); |
301 | } | 301 | } |
302 | 302 | ||
303 | return 0; | 303 | return 0; |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7700a9cba335..2ecec3a4f1ee 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/kernel_stat.h> | 4 | #include <linux/kernel_stat.h> |
5 | #include <linux/static_key.h> | 5 | #include <linux/static_key.h> |
6 | #include <linux/context_tracking.h> | 6 | #include <linux/context_tracking.h> |
7 | #include <linux/cputime.h> | ||
7 | #include "sched.h" | 8 | #include "sched.h" |
8 | #ifdef CONFIG_PARAVIRT | 9 | #ifdef CONFIG_PARAVIRT |
9 | #include <asm/paravirt.h> | 10 | #include <asm/paravirt.h> |
@@ -44,6 +45,7 @@ void disable_sched_clock_irqtime(void) | |||
44 | void irqtime_account_irq(struct task_struct *curr) | 45 | void irqtime_account_irq(struct task_struct *curr) |
45 | { | 46 | { |
46 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | 47 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
48 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
47 | s64 delta; | 49 | s64 delta; |
48 | int cpu; | 50 | int cpu; |
49 | 51 | ||
@@ -61,49 +63,34 @@ void irqtime_account_irq(struct task_struct *curr) | |||
61 | * in that case, so as not to confuse scheduler with a special task | 63 | * in that case, so as not to confuse scheduler with a special task |
62 | * that do not consume any time, but still wants to run. | 64 | * that do not consume any time, but still wants to run. |
63 | */ | 65 | */ |
64 | if (hardirq_count()) | 66 | if (hardirq_count()) { |
65 | irqtime->hardirq_time += delta; | 67 | cpustat[CPUTIME_IRQ] += delta; |
66 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 68 | irqtime->tick_delta += delta; |
67 | irqtime->softirq_time += delta; | 69 | } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { |
70 | cpustat[CPUTIME_SOFTIRQ] += delta; | ||
71 | irqtime->tick_delta += delta; | ||
72 | } | ||
68 | 73 | ||
69 | u64_stats_update_end(&irqtime->sync); | 74 | u64_stats_update_end(&irqtime->sync); |
70 | } | 75 | } |
71 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
72 | 77 | ||
73 | static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) | 78 | static u64 irqtime_tick_accounted(u64 maxtime) |
74 | { | 79 | { |
75 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 80 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
76 | cputime_t irq_cputime; | 81 | u64 delta; |
77 | |||
78 | irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; | ||
79 | irq_cputime = min(irq_cputime, maxtime); | ||
80 | cpustat[idx] += irq_cputime; | ||
81 | 82 | ||
82 | return irq_cputime; | 83 | delta = min(irqtime->tick_delta, maxtime); |
83 | } | 84 | irqtime->tick_delta -= delta; |
84 | 85 | ||
85 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) | 86 | return delta; |
86 | { | ||
87 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), | ||
88 | CPUTIME_IRQ, maxtime); | ||
89 | } | ||
90 | |||
91 | static cputime_t irqtime_account_si_update(cputime_t maxtime) | ||
92 | { | ||
93 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), | ||
94 | CPUTIME_SOFTIRQ, maxtime); | ||
95 | } | 87 | } |
96 | 88 | ||
97 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 89 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
98 | 90 | ||
99 | #define sched_clock_irqtime (0) | 91 | #define sched_clock_irqtime (0) |
100 | 92 | ||
101 | static cputime_t irqtime_account_hi_update(cputime_t dummy) | 93 | static u64 irqtime_tick_accounted(u64 dummy) |
102 | { | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static cputime_t irqtime_account_si_update(cputime_t dummy) | ||
107 | { | 94 | { |
108 | return 0; | 95 | return 0; |
109 | } | 96 | } |
@@ -129,7 +116,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
129 | * @p: the process that the cpu time gets accounted to | 116 | * @p: the process that the cpu time gets accounted to |
130 | * @cputime: the cpu time spent in user space since the last update | 117 | * @cputime: the cpu time spent in user space since the last update |
131 | */ | 118 | */ |
132 | void account_user_time(struct task_struct *p, cputime_t cputime) | 119 | void account_user_time(struct task_struct *p, u64 cputime) |
133 | { | 120 | { |
134 | int index; | 121 | int index; |
135 | 122 | ||
@@ -140,7 +127,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
140 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | 127 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; |
141 | 128 | ||
142 | /* Add user time to cpustat. */ | 129 | /* Add user time to cpustat. */ |
143 | task_group_account_field(p, index, (__force u64) cputime); | 130 | task_group_account_field(p, index, cputime); |
144 | 131 | ||
145 | /* Account for user time used */ | 132 | /* Account for user time used */ |
146 | acct_account_cputime(p); | 133 | acct_account_cputime(p); |
@@ -151,7 +138,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
151 | * @p: the process that the cpu time gets accounted to | 138 | * @p: the process that the cpu time gets accounted to |
152 | * @cputime: the cpu time spent in virtual machine since the last update | 139 | * @cputime: the cpu time spent in virtual machine since the last update |
153 | */ | 140 | */ |
154 | static void account_guest_time(struct task_struct *p, cputime_t cputime) | 141 | void account_guest_time(struct task_struct *p, u64 cputime) |
155 | { | 142 | { |
156 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 143 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
157 | 144 | ||
@@ -162,11 +149,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) | |||
162 | 149 | ||
163 | /* Add guest time to cpustat. */ | 150 | /* Add guest time to cpustat. */ |
164 | if (task_nice(p) > 0) { | 151 | if (task_nice(p) > 0) { |
165 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | 152 | cpustat[CPUTIME_NICE] += cputime; |
166 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | 153 | cpustat[CPUTIME_GUEST_NICE] += cputime; |
167 | } else { | 154 | } else { |
168 | cpustat[CPUTIME_USER] += (__force u64) cputime; | 155 | cpustat[CPUTIME_USER] += cputime; |
169 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | 156 | cpustat[CPUTIME_GUEST] += cputime; |
170 | } | 157 | } |
171 | } | 158 | } |
172 | 159 | ||
@@ -176,15 +163,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) | |||
176 | * @cputime: the cpu time spent in kernel space since the last update | 163 | * @cputime: the cpu time spent in kernel space since the last update |
177 | * @index: pointer to cpustat field that has to be updated | 164 | * @index: pointer to cpustat field that has to be updated |
178 | */ | 165 | */ |
179 | static inline | 166 | void account_system_index_time(struct task_struct *p, |
180 | void __account_system_time(struct task_struct *p, cputime_t cputime, int index) | 167 | u64 cputime, enum cpu_usage_stat index) |
181 | { | 168 | { |
182 | /* Add system time to process. */ | 169 | /* Add system time to process. */ |
183 | p->stime += cputime; | 170 | p->stime += cputime; |
184 | account_group_system_time(p, cputime); | 171 | account_group_system_time(p, cputime); |
185 | 172 | ||
186 | /* Add system time to cpustat. */ | 173 | /* Add system time to cpustat. */ |
187 | task_group_account_field(p, index, (__force u64) cputime); | 174 | task_group_account_field(p, index, cputime); |
188 | 175 | ||
189 | /* Account for system time used */ | 176 | /* Account for system time used */ |
190 | acct_account_cputime(p); | 177 | acct_account_cputime(p); |
@@ -196,8 +183,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, int index) | |||
196 | * @hardirq_offset: the offset to subtract from hardirq_count() | 183 | * @hardirq_offset: the offset to subtract from hardirq_count() |
197 | * @cputime: the cpu time spent in kernel space since the last update | 184 | * @cputime: the cpu time spent in kernel space since the last update |
198 | */ | 185 | */ |
199 | void account_system_time(struct task_struct *p, int hardirq_offset, | 186 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
200 | cputime_t cputime) | ||
201 | { | 187 | { |
202 | int index; | 188 | int index; |
203 | 189 | ||
@@ -213,33 +199,33 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
213 | else | 199 | else |
214 | index = CPUTIME_SYSTEM; | 200 | index = CPUTIME_SYSTEM; |
215 | 201 | ||
216 | __account_system_time(p, cputime, index); | 202 | account_system_index_time(p, cputime, index); |
217 | } | 203 | } |
218 | 204 | ||
219 | /* | 205 | /* |
220 | * Account for involuntary wait time. | 206 | * Account for involuntary wait time. |
221 | * @cputime: the cpu time spent in involuntary wait | 207 | * @cputime: the cpu time spent in involuntary wait |
222 | */ | 208 | */ |
223 | void account_steal_time(cputime_t cputime) | 209 | void account_steal_time(u64 cputime) |
224 | { | 210 | { |
225 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 211 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
226 | 212 | ||
227 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | 213 | cpustat[CPUTIME_STEAL] += cputime; |
228 | } | 214 | } |
229 | 215 | ||
230 | /* | 216 | /* |
231 | * Account for idle time. | 217 | * Account for idle time. |
232 | * @cputime: the cpu time spent in idle wait | 218 | * @cputime: the cpu time spent in idle wait |
233 | */ | 219 | */ |
234 | void account_idle_time(cputime_t cputime) | 220 | void account_idle_time(u64 cputime) |
235 | { | 221 | { |
236 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 222 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
237 | struct rq *rq = this_rq(); | 223 | struct rq *rq = this_rq(); |
238 | 224 | ||
239 | if (atomic_read(&rq->nr_iowait) > 0) | 225 | if (atomic_read(&rq->nr_iowait) > 0) |
240 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | 226 | cpustat[CPUTIME_IOWAIT] += cputime; |
241 | else | 227 | else |
242 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 228 | cpustat[CPUTIME_IDLE] += cputime; |
243 | } | 229 | } |
244 | 230 | ||
245 | /* | 231 | /* |
@@ -247,21 +233,19 @@ void account_idle_time(cputime_t cputime) | |||
247 | * ticks are not redelivered later. Due to that, this function may on | 233 | * ticks are not redelivered later. Due to that, this function may on |
248 | * occasion account more time than the calling functions think elapsed. | 234 | * occasion account more time than the calling functions think elapsed. |
249 | */ | 235 | */ |
250 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) | 236 | static __always_inline u64 steal_account_process_time(u64 maxtime) |
251 | { | 237 | { |
252 | #ifdef CONFIG_PARAVIRT | 238 | #ifdef CONFIG_PARAVIRT |
253 | if (static_key_false(¶virt_steal_enabled)) { | 239 | if (static_key_false(¶virt_steal_enabled)) { |
254 | cputime_t steal_cputime; | ||
255 | u64 steal; | 240 | u64 steal; |
256 | 241 | ||
257 | steal = paravirt_steal_clock(smp_processor_id()); | 242 | steal = paravirt_steal_clock(smp_processor_id()); |
258 | steal -= this_rq()->prev_steal_time; | 243 | steal -= this_rq()->prev_steal_time; |
244 | steal = min(steal, maxtime); | ||
245 | account_steal_time(steal); | ||
246 | this_rq()->prev_steal_time += steal; | ||
259 | 247 | ||
260 | steal_cputime = min(nsecs_to_cputime(steal), maxtime); | 248 | return steal; |
261 | account_steal_time(steal_cputime); | ||
262 | this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); | ||
263 | |||
264 | return steal_cputime; | ||
265 | } | 249 | } |
266 | #endif | 250 | #endif |
267 | return 0; | 251 | return 0; |
@@ -270,9 +254,9 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) | |||
270 | /* | 254 | /* |
271 | * Account how much elapsed time was spent in steal, irq, or softirq time. | 255 | * Account how much elapsed time was spent in steal, irq, or softirq time. |
272 | */ | 256 | */ |
273 | static inline cputime_t account_other_time(cputime_t max) | 257 | static inline u64 account_other_time(u64 max) |
274 | { | 258 | { |
275 | cputime_t accounted; | 259 | u64 accounted; |
276 | 260 | ||
277 | /* Shall be converted to a lockdep-enabled lightweight check */ | 261 | /* Shall be converted to a lockdep-enabled lightweight check */ |
278 | WARN_ON_ONCE(!irqs_disabled()); | 262 | WARN_ON_ONCE(!irqs_disabled()); |
@@ -280,10 +264,7 @@ static inline cputime_t account_other_time(cputime_t max) | |||
280 | accounted = steal_account_process_time(max); | 264 | accounted = steal_account_process_time(max); |
281 | 265 | ||
282 | if (accounted < max) | 266 | if (accounted < max) |
283 | accounted += irqtime_account_hi_update(max - accounted); | 267 | accounted += irqtime_tick_accounted(max - accounted); |
284 | |||
285 | if (accounted < max) | ||
286 | accounted += irqtime_account_si_update(max - accounted); | ||
287 | 268 | ||
288 | return accounted; | 269 | return accounted; |
289 | } | 270 | } |
@@ -315,7 +296,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) | |||
315 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 296 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
316 | { | 297 | { |
317 | struct signal_struct *sig = tsk->signal; | 298 | struct signal_struct *sig = tsk->signal; |
318 | cputime_t utime, stime; | 299 | u64 utime, stime; |
319 | struct task_struct *t; | 300 | struct task_struct *t; |
320 | unsigned int seq, nextseq; | 301 | unsigned int seq, nextseq; |
321 | unsigned long flags; | 302 | unsigned long flags; |
@@ -379,8 +360,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
379 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 360 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
380 | struct rq *rq, int ticks) | 361 | struct rq *rq, int ticks) |
381 | { | 362 | { |
382 | u64 cputime = (__force u64) cputime_one_jiffy * ticks; | 363 | u64 other, cputime = TICK_NSEC * ticks; |
383 | cputime_t other; | ||
384 | 364 | ||
385 | /* | 365 | /* |
386 | * When returning from idle, many ticks can get accounted at | 366 | * When returning from idle, many ticks can get accounted at |
@@ -392,6 +372,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
392 | other = account_other_time(ULONG_MAX); | 372 | other = account_other_time(ULONG_MAX); |
393 | if (other >= cputime) | 373 | if (other >= cputime) |
394 | return; | 374 | return; |
375 | |||
395 | cputime -= other; | 376 | cputime -= other; |
396 | 377 | ||
397 | if (this_cpu_ksoftirqd() == p) { | 378 | if (this_cpu_ksoftirqd() == p) { |
@@ -400,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
400 | * So, we have to handle it separately here. | 381 | * So, we have to handle it separately here. |
401 | * Also, p->stime needs to be updated for ksoftirqd. | 382 | * Also, p->stime needs to be updated for ksoftirqd. |
402 | */ | 383 | */ |
403 | __account_system_time(p, cputime, CPUTIME_SOFTIRQ); | 384 | account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); |
404 | } else if (user_tick) { | 385 | } else if (user_tick) { |
405 | account_user_time(p, cputime); | 386 | account_user_time(p, cputime); |
406 | } else if (p == rq->idle) { | 387 | } else if (p == rq->idle) { |
@@ -408,7 +389,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
408 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | 389 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
409 | account_guest_time(p, cputime); | 390 | account_guest_time(p, cputime); |
410 | } else { | 391 | } else { |
411 | __account_system_time(p, cputime, CPUTIME_SYSTEM); | 392 | account_system_index_time(p, cputime, CPUTIME_SYSTEM); |
412 | } | 393 | } |
413 | } | 394 | } |
414 | 395 | ||
@@ -437,9 +418,7 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
437 | else | 418 | else |
438 | vtime_account_system(prev); | 419 | vtime_account_system(prev); |
439 | 420 | ||
440 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 421 | vtime_flush(prev); |
441 | vtime_account_user(prev); | ||
442 | #endif | ||
443 | arch_vtime_task_switch(prev); | 422 | arch_vtime_task_switch(prev); |
444 | } | 423 | } |
445 | #endif | 424 | #endif |
@@ -467,14 +446,14 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
467 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 446 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
468 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 447 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
469 | 448 | ||
470 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 449 | void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
471 | { | 450 | { |
472 | *ut = p->utime; | 451 | *ut = p->utime; |
473 | *st = p->stime; | 452 | *st = p->stime; |
474 | } | 453 | } |
475 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); | 454 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); |
476 | 455 | ||
477 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 456 | void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
478 | { | 457 | { |
479 | struct task_cputime cputime; | 458 | struct task_cputime cputime; |
480 | 459 | ||
@@ -491,7 +470,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
491 | */ | 470 | */ |
492 | void account_process_tick(struct task_struct *p, int user_tick) | 471 | void account_process_tick(struct task_struct *p, int user_tick) |
493 | { | 472 | { |
494 | cputime_t cputime, steal; | 473 | u64 cputime, steal; |
495 | struct rq *rq = this_rq(); | 474 | struct rq *rq = this_rq(); |
496 | 475 | ||
497 | if (vtime_accounting_cpu_enabled()) | 476 | if (vtime_accounting_cpu_enabled()) |
@@ -502,7 +481,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
502 | return; | 481 | return; |
503 | } | 482 | } |
504 | 483 | ||
505 | cputime = cputime_one_jiffy; | 484 | cputime = TICK_NSEC; |
506 | steal = steal_account_process_time(ULONG_MAX); | 485 | steal = steal_account_process_time(ULONG_MAX); |
507 | 486 | ||
508 | if (steal >= cputime) | 487 | if (steal >= cputime) |
@@ -524,14 +503,14 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
524 | */ | 503 | */ |
525 | void account_idle_ticks(unsigned long ticks) | 504 | void account_idle_ticks(unsigned long ticks) |
526 | { | 505 | { |
527 | cputime_t cputime, steal; | 506 | u64 cputime, steal; |
528 | 507 | ||
529 | if (sched_clock_irqtime) { | 508 | if (sched_clock_irqtime) { |
530 | irqtime_account_idle_ticks(ticks); | 509 | irqtime_account_idle_ticks(ticks); |
531 | return; | 510 | return; |
532 | } | 511 | } |
533 | 512 | ||
534 | cputime = jiffies_to_cputime(ticks); | 513 | cputime = ticks * TICK_NSEC; |
535 | steal = steal_account_process_time(ULONG_MAX); | 514 | steal = steal_account_process_time(ULONG_MAX); |
536 | 515 | ||
537 | if (steal >= cputime) | 516 | if (steal >= cputime) |
@@ -545,7 +524,7 @@ void account_idle_ticks(unsigned long ticks) | |||
545 | * Perform (stime * rtime) / total, but avoid multiplication overflow by | 524 | * Perform (stime * rtime) / total, but avoid multiplication overflow by |
546 | * loosing precision when the numbers are big. | 525 | * loosing precision when the numbers are big. |
547 | */ | 526 | */ |
548 | static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) | 527 | static u64 scale_stime(u64 stime, u64 rtime, u64 total) |
549 | { | 528 | { |
550 | u64 scaled; | 529 | u64 scaled; |
551 | 530 | ||
@@ -582,7 +561,7 @@ drop_precision: | |||
582 | * followed by a 64/32->64 divide. | 561 | * followed by a 64/32->64 divide. |
583 | */ | 562 | */ |
584 | scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); | 563 | scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); |
585 | return (__force cputime_t) scaled; | 564 | return scaled; |
586 | } | 565 | } |
587 | 566 | ||
588 | /* | 567 | /* |
@@ -607,14 +586,14 @@ drop_precision: | |||
607 | */ | 586 | */ |
608 | static void cputime_adjust(struct task_cputime *curr, | 587 | static void cputime_adjust(struct task_cputime *curr, |
609 | struct prev_cputime *prev, | 588 | struct prev_cputime *prev, |
610 | cputime_t *ut, cputime_t *st) | 589 | u64 *ut, u64 *st) |
611 | { | 590 | { |
612 | cputime_t rtime, stime, utime; | 591 | u64 rtime, stime, utime; |
613 | unsigned long flags; | 592 | unsigned long flags; |
614 | 593 | ||
615 | /* Serialize concurrent callers such that we can honour our guarantees */ | 594 | /* Serialize concurrent callers such that we can honour our guarantees */ |
616 | raw_spin_lock_irqsave(&prev->lock, flags); | 595 | raw_spin_lock_irqsave(&prev->lock, flags); |
617 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | 596 | rtime = curr->sum_exec_runtime; |
618 | 597 | ||
619 | /* | 598 | /* |
620 | * This is possible under two circumstances: | 599 | * This is possible under two circumstances: |
@@ -645,8 +624,7 @@ static void cputime_adjust(struct task_cputime *curr, | |||
645 | goto update; | 624 | goto update; |
646 | } | 625 | } |
647 | 626 | ||
648 | stime = scale_stime((__force u64)stime, (__force u64)rtime, | 627 | stime = scale_stime(stime, rtime, stime + utime); |
649 | (__force u64)(stime + utime)); | ||
650 | 628 | ||
651 | update: | 629 | update: |
652 | /* | 630 | /* |
@@ -679,7 +657,7 @@ out: | |||
679 | raw_spin_unlock_irqrestore(&prev->lock, flags); | 657 | raw_spin_unlock_irqrestore(&prev->lock, flags); |
680 | } | 658 | } |
681 | 659 | ||
682 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 660 | void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
683 | { | 661 | { |
684 | struct task_cputime cputime = { | 662 | struct task_cputime cputime = { |
685 | .sum_exec_runtime = p->se.sum_exec_runtime, | 663 | .sum_exec_runtime = p->se.sum_exec_runtime, |
@@ -690,7 +668,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
690 | } | 668 | } |
691 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); | 669 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); |
692 | 670 | ||
693 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 671 | void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
694 | { | 672 | { |
695 | struct task_cputime cputime; | 673 | struct task_cputime cputime; |
696 | 674 | ||
@@ -700,20 +678,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
700 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 678 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
701 | 679 | ||
702 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 680 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
703 | static cputime_t vtime_delta(struct task_struct *tsk) | 681 | static u64 vtime_delta(struct task_struct *tsk) |
704 | { | 682 | { |
705 | unsigned long now = READ_ONCE(jiffies); | 683 | unsigned long now = READ_ONCE(jiffies); |
706 | 684 | ||
707 | if (time_before(now, (unsigned long)tsk->vtime_snap)) | 685 | if (time_before(now, (unsigned long)tsk->vtime_snap)) |
708 | return 0; | 686 | return 0; |
709 | 687 | ||
710 | return jiffies_to_cputime(now - tsk->vtime_snap); | 688 | return jiffies_to_nsecs(now - tsk->vtime_snap); |
711 | } | 689 | } |
712 | 690 | ||
713 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 691 | static u64 get_vtime_delta(struct task_struct *tsk) |
714 | { | 692 | { |
715 | unsigned long now = READ_ONCE(jiffies); | 693 | unsigned long now = READ_ONCE(jiffies); |
716 | cputime_t delta, other; | 694 | u64 delta, other; |
717 | 695 | ||
718 | /* | 696 | /* |
719 | * Unlike tick based timing, vtime based timing never has lost | 697 | * Unlike tick based timing, vtime based timing never has lost |
@@ -722,7 +700,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
722 | * elapsed time. Limit account_other_time to prevent rounding | 700 | * elapsed time. Limit account_other_time to prevent rounding |
723 | * errors from causing elapsed vtime to go negative. | 701 | * errors from causing elapsed vtime to go negative. |
724 | */ | 702 | */ |
725 | delta = jiffies_to_cputime(now - tsk->vtime_snap); | 703 | delta = jiffies_to_nsecs(now - tsk->vtime_snap); |
726 | other = account_other_time(delta); | 704 | other = account_other_time(delta); |
727 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 705 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
728 | tsk->vtime_snap = now; | 706 | tsk->vtime_snap = now; |
@@ -732,9 +710,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
732 | 710 | ||
733 | static void __vtime_account_system(struct task_struct *tsk) | 711 | static void __vtime_account_system(struct task_struct *tsk) |
734 | { | 712 | { |
735 | cputime_t delta_cpu = get_vtime_delta(tsk); | 713 | account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); |
736 | |||
737 | account_system_time(tsk, irq_count(), delta_cpu); | ||
738 | } | 714 | } |
739 | 715 | ||
740 | void vtime_account_system(struct task_struct *tsk) | 716 | void vtime_account_system(struct task_struct *tsk) |
@@ -749,14 +725,10 @@ void vtime_account_system(struct task_struct *tsk) | |||
749 | 725 | ||
750 | void vtime_account_user(struct task_struct *tsk) | 726 | void vtime_account_user(struct task_struct *tsk) |
751 | { | 727 | { |
752 | cputime_t delta_cpu; | ||
753 | |||
754 | write_seqcount_begin(&tsk->vtime_seqcount); | 728 | write_seqcount_begin(&tsk->vtime_seqcount); |
755 | tsk->vtime_snap_whence = VTIME_SYS; | 729 | tsk->vtime_snap_whence = VTIME_SYS; |
756 | if (vtime_delta(tsk)) { | 730 | if (vtime_delta(tsk)) |
757 | delta_cpu = get_vtime_delta(tsk); | 731 | account_user_time(tsk, get_vtime_delta(tsk)); |
758 | account_user_time(tsk, delta_cpu); | ||
759 | } | ||
760 | write_seqcount_end(&tsk->vtime_seqcount); | 732 | write_seqcount_end(&tsk->vtime_seqcount); |
761 | } | 733 | } |
762 | 734 | ||
@@ -797,9 +769,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit); | |||
797 | 769 | ||
798 | void vtime_account_idle(struct task_struct *tsk) | 770 | void vtime_account_idle(struct task_struct *tsk) |
799 | { | 771 | { |
800 | cputime_t delta_cpu = get_vtime_delta(tsk); | 772 | account_idle_time(get_vtime_delta(tsk)); |
801 | |||
802 | account_idle_time(delta_cpu); | ||
803 | } | 773 | } |
804 | 774 | ||
805 | void arch_vtime_task_switch(struct task_struct *prev) | 775 | void arch_vtime_task_switch(struct task_struct *prev) |
@@ -826,10 +796,10 @@ void vtime_init_idle(struct task_struct *t, int cpu) | |||
826 | local_irq_restore(flags); | 796 | local_irq_restore(flags); |
827 | } | 797 | } |
828 | 798 | ||
829 | cputime_t task_gtime(struct task_struct *t) | 799 | u64 task_gtime(struct task_struct *t) |
830 | { | 800 | { |
831 | unsigned int seq; | 801 | unsigned int seq; |
832 | cputime_t gtime; | 802 | u64 gtime; |
833 | 803 | ||
834 | if (!vtime_accounting_enabled()) | 804 | if (!vtime_accounting_enabled()) |
835 | return t->gtime; | 805 | return t->gtime; |
@@ -851,9 +821,9 @@ cputime_t task_gtime(struct task_struct *t) | |||
851 | * add up the pending nohz execution time since the last | 821 | * add up the pending nohz execution time since the last |
852 | * cputime snapshot. | 822 | * cputime snapshot. |
853 | */ | 823 | */ |
854 | void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) | 824 | void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) |
855 | { | 825 | { |
856 | cputime_t delta; | 826 | u64 delta; |
857 | unsigned int seq; | 827 | unsigned int seq; |
858 | 828 | ||
859 | if (!vtime_accounting_enabled()) { | 829 | if (!vtime_accounting_enabled()) { |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 70ef2b1901e4..27737f34757d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -663,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
663 | * Nothing relies on rq->lock after this, so its safe to drop | 663 | * Nothing relies on rq->lock after this, so its safe to drop |
664 | * rq->lock. | 664 | * rq->lock. |
665 | */ | 665 | */ |
666 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 666 | rq_unpin_lock(rq, &rf); |
667 | push_dl_task(rq); | 667 | push_dl_task(rq); |
668 | lockdep_repin_lock(&rq->lock, rf.cookie); | 668 | rq_repin_lock(rq, &rf); |
669 | } | 669 | } |
670 | #endif | 670 | #endif |
671 | 671 | ||
@@ -1118,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | |||
1118 | } | 1118 | } |
1119 | 1119 | ||
1120 | struct task_struct * | 1120 | struct task_struct * |
1121 | pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 1121 | pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
1122 | { | 1122 | { |
1123 | struct sched_dl_entity *dl_se; | 1123 | struct sched_dl_entity *dl_se; |
1124 | struct task_struct *p; | 1124 | struct task_struct *p; |
@@ -1133,9 +1133,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo | |||
1133 | * disabled avoiding further scheduler activity on it and we're | 1133 | * disabled avoiding further scheduler activity on it and we're |
1134 | * being very careful to re-start the picking loop. | 1134 | * being very careful to re-start the picking loop. |
1135 | */ | 1135 | */ |
1136 | lockdep_unpin_lock(&rq->lock, cookie); | 1136 | rq_unpin_lock(rq, rf); |
1137 | pull_dl_task(rq); | 1137 | pull_dl_task(rq); |
1138 | lockdep_repin_lock(&rq->lock, cookie); | 1138 | rq_repin_lock(rq, rf); |
1139 | /* | 1139 | /* |
1140 | * pull_dl_task() can drop (and re-acquire) rq->lock; this | 1140 | * pull_dl_task() can drop (and re-acquire) rq->lock; this |
1141 | * means a stop task can slip in, in which case we need to | 1141 | * means a stop task can slip in, in which case we need to |
@@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1729 | #ifdef CONFIG_SMP | 1729 | #ifdef CONFIG_SMP |
1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) | 1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) |
1731 | queue_push_tasks(rq); | 1731 | queue_push_tasks(rq); |
1732 | #else | 1732 | #endif |
1733 | if (dl_task(rq->curr)) | 1733 | if (dl_task(rq->curr)) |
1734 | check_preempt_curr_dl(rq, p, 0); | 1734 | check_preempt_curr_dl(rq, p, 0); |
1735 | else | 1735 | else |
1736 | resched_curr(rq); | 1736 | resched_curr(rq); |
1737 | #endif | ||
1738 | } | 1737 | } |
1739 | } | 1738 | } |
1740 | 1739 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fa178b62ea79..109adc0e9cb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
953 | #endif | 953 | #endif |
954 | P(policy); | 954 | P(policy); |
955 | P(prio); | 955 | P(prio); |
956 | if (p->policy == SCHED_DEADLINE) { | ||
957 | P(dl.runtime); | ||
958 | P(dl.deadline); | ||
959 | } | ||
956 | #undef PN_SCHEDSTAT | 960 | #undef PN_SCHEDSTAT |
957 | #undef PN | 961 | #undef PN |
958 | #undef __PN | 962 | #undef __PN |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..274c747a01ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2657 | if (tg_weight) | 2657 | if (tg_weight) |
2658 | shares /= tg_weight; | 2658 | shares /= tg_weight; |
2659 | 2659 | ||
2660 | /* | ||
2661 | * MIN_SHARES has to be unscaled here to support per-CPU partitioning | ||
2662 | * of a group with small tg->shares value. It is a floor value which is | ||
2663 | * assigned as a minimum load.weight to the sched_entity representing | ||
2664 | * the group on a CPU. | ||
2665 | * | ||
2666 | * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 | ||
2667 | * on an 8-core system with 8 tasks each runnable on one CPU shares has | ||
2668 | * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In | ||
2669 | * case no task is runnable on a CPU MIN_SHARES=2 should be returned | ||
2670 | * instead of 0. | ||
2671 | */ | ||
2660 | if (shares < MIN_SHARES) | 2672 | if (shares < MIN_SHARES) |
2661 | shares = MIN_SHARES; | 2673 | shares = MIN_SHARES; |
2662 | if (shares > tg->shares) | 2674 | if (shares > tg->shares) |
@@ -2689,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
2689 | 2701 | ||
2690 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2702 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
2691 | 2703 | ||
2692 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | 2704 | static void update_cfs_shares(struct sched_entity *se) |
2693 | { | 2705 | { |
2706 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
2694 | struct task_group *tg; | 2707 | struct task_group *tg; |
2695 | struct sched_entity *se; | ||
2696 | long shares; | 2708 | long shares; |
2697 | 2709 | ||
2698 | tg = cfs_rq->tg; | 2710 | if (!cfs_rq) |
2699 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 2711 | return; |
2700 | if (!se || throttled_hierarchy(cfs_rq)) | 2712 | |
2713 | if (throttled_hierarchy(cfs_rq)) | ||
2701 | return; | 2714 | return; |
2715 | |||
2716 | tg = cfs_rq->tg; | ||
2717 | |||
2702 | #ifndef CONFIG_SMP | 2718 | #ifndef CONFIG_SMP |
2703 | if (likely(se->load.weight == tg->shares)) | 2719 | if (likely(se->load.weight == tg->shares)) |
2704 | return; | 2720 | return; |
@@ -2707,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
2707 | 2723 | ||
2708 | reweight_entity(cfs_rq_of(se), se, shares); | 2724 | reweight_entity(cfs_rq_of(se), se, shares); |
2709 | } | 2725 | } |
2726 | |||
2710 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2727 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
2711 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | 2728 | static inline void update_cfs_shares(struct sched_entity *se) |
2712 | { | 2729 | { |
2713 | } | 2730 | } |
2714 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2731 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -3424,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | |||
3424 | return cfs_rq->avg.load_avg; | 3441 | return cfs_rq->avg.load_avg; |
3425 | } | 3442 | } |
3426 | 3443 | ||
3427 | static int idle_balance(struct rq *this_rq); | 3444 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); |
3428 | 3445 | ||
3429 | #else /* CONFIG_SMP */ | 3446 | #else /* CONFIG_SMP */ |
3430 | 3447 | ||
@@ -3453,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | |||
3453 | static inline void | 3470 | static inline void |
3454 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 3471 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
3455 | 3472 | ||
3456 | static inline int idle_balance(struct rq *rq) | 3473 | static inline int idle_balance(struct rq *rq, struct rq_flags *rf) |
3457 | { | 3474 | { |
3458 | return 0; | 3475 | return 0; |
3459 | } | 3476 | } |
@@ -3582,10 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3582 | if (renorm && !curr) | 3599 | if (renorm && !curr) |
3583 | se->vruntime += cfs_rq->min_vruntime; | 3600 | se->vruntime += cfs_rq->min_vruntime; |
3584 | 3601 | ||
3602 | /* | ||
3603 | * When enqueuing a sched_entity, we must: | ||
3604 | * - Update loads to have both entity and cfs_rq synced with now. | ||
3605 | * - Add its load to cfs_rq->runnable_avg | ||
3606 | * - For group_entity, update its weight to reflect the new share of | ||
3607 | * its group cfs_rq | ||
3608 | * - Add its new weight to cfs_rq->load.weight | ||
3609 | */ | ||
3585 | update_load_avg(se, UPDATE_TG); | 3610 | update_load_avg(se, UPDATE_TG); |
3586 | enqueue_entity_load_avg(cfs_rq, se); | 3611 | enqueue_entity_load_avg(cfs_rq, se); |
3612 | update_cfs_shares(se); | ||
3587 | account_entity_enqueue(cfs_rq, se); | 3613 | account_entity_enqueue(cfs_rq, se); |
3588 | update_cfs_shares(cfs_rq); | ||
3589 | 3614 | ||
3590 | if (flags & ENQUEUE_WAKEUP) | 3615 | if (flags & ENQUEUE_WAKEUP) |
3591 | place_entity(cfs_rq, se, 0); | 3616 | place_entity(cfs_rq, se, 0); |
@@ -3657,6 +3682,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3657 | * Update run-time statistics of the 'current'. | 3682 | * Update run-time statistics of the 'current'. |
3658 | */ | 3683 | */ |
3659 | update_curr(cfs_rq); | 3684 | update_curr(cfs_rq); |
3685 | |||
3686 | /* | ||
3687 | * When dequeuing a sched_entity, we must: | ||
3688 | * - Update loads to have both entity and cfs_rq synced with now. | ||
3689 | * - Substract its load from the cfs_rq->runnable_avg. | ||
3690 | * - Substract its previous weight from cfs_rq->load.weight. | ||
3691 | * - For group entity, update its weight to reflect the new share | ||
3692 | * of its group cfs_rq. | ||
3693 | */ | ||
3660 | update_load_avg(se, UPDATE_TG); | 3694 | update_load_avg(se, UPDATE_TG); |
3661 | dequeue_entity_load_avg(cfs_rq, se); | 3695 | dequeue_entity_load_avg(cfs_rq, se); |
3662 | 3696 | ||
@@ -3681,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3681 | /* return excess runtime on last dequeue */ | 3715 | /* return excess runtime on last dequeue */ |
3682 | return_cfs_rq_runtime(cfs_rq); | 3716 | return_cfs_rq_runtime(cfs_rq); |
3683 | 3717 | ||
3684 | update_cfs_shares(cfs_rq); | 3718 | update_cfs_shares(se); |
3685 | 3719 | ||
3686 | /* | 3720 | /* |
3687 | * Now advance min_vruntime if @se was the entity holding it back, | 3721 | * Now advance min_vruntime if @se was the entity holding it back, |
@@ -3864,7 +3898,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3864 | * Ensure that runnable average is periodically updated. | 3898 | * Ensure that runnable average is periodically updated. |
3865 | */ | 3899 | */ |
3866 | update_load_avg(curr, UPDATE_TG); | 3900 | update_load_avg(curr, UPDATE_TG); |
3867 | update_cfs_shares(cfs_rq); | 3901 | update_cfs_shares(curr); |
3868 | 3902 | ||
3869 | #ifdef CONFIG_SCHED_HRTICK | 3903 | #ifdef CONFIG_SCHED_HRTICK |
3870 | /* | 3904 | /* |
@@ -4761,7 +4795,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4761 | break; | 4795 | break; |
4762 | 4796 | ||
4763 | update_load_avg(se, UPDATE_TG); | 4797 | update_load_avg(se, UPDATE_TG); |
4764 | update_cfs_shares(cfs_rq); | 4798 | update_cfs_shares(se); |
4765 | } | 4799 | } |
4766 | 4800 | ||
4767 | if (!se) | 4801 | if (!se) |
@@ -4820,7 +4854,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4820 | break; | 4854 | break; |
4821 | 4855 | ||
4822 | update_load_avg(se, UPDATE_TG); | 4856 | update_load_avg(se, UPDATE_TG); |
4823 | update_cfs_shares(cfs_rq); | 4857 | update_cfs_shares(se); |
4824 | } | 4858 | } |
4825 | 4859 | ||
4826 | if (!se) | 4860 | if (!se) |
@@ -6213,7 +6247,7 @@ preempt: | |||
6213 | } | 6247 | } |
6214 | 6248 | ||
6215 | static struct task_struct * | 6249 | static struct task_struct * |
6216 | pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 6250 | pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
6217 | { | 6251 | { |
6218 | struct cfs_rq *cfs_rq = &rq->cfs; | 6252 | struct cfs_rq *cfs_rq = &rq->cfs; |
6219 | struct sched_entity *se; | 6253 | struct sched_entity *se; |
@@ -6320,15 +6354,8 @@ simple: | |||
6320 | return p; | 6354 | return p; |
6321 | 6355 | ||
6322 | idle: | 6356 | idle: |
6323 | /* | 6357 | new_tasks = idle_balance(rq, rf); |
6324 | * This is OK, because current is on_cpu, which avoids it being picked | 6358 | |
6325 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
6326 | * further scheduler activity on it and we're being very careful to | ||
6327 | * re-start the picking loop. | ||
6328 | */ | ||
6329 | lockdep_unpin_lock(&rq->lock, cookie); | ||
6330 | new_tasks = idle_balance(rq); | ||
6331 | lockdep_repin_lock(&rq->lock, cookie); | ||
6332 | /* | 6359 | /* |
6333 | * Because idle_balance() releases (and re-acquires) rq->lock, it is | 6360 | * Because idle_balance() releases (and re-acquires) rq->lock, it is |
6334 | * possible for any higher priority task to appear. In that case we | 6361 | * possible for any higher priority task to appear. In that case we |
@@ -8077,6 +8104,7 @@ redo: | |||
8077 | 8104 | ||
8078 | more_balance: | 8105 | more_balance: |
8079 | raw_spin_lock_irqsave(&busiest->lock, flags); | 8106 | raw_spin_lock_irqsave(&busiest->lock, flags); |
8107 | update_rq_clock(busiest); | ||
8080 | 8108 | ||
8081 | /* | 8109 | /* |
8082 | * cur_ld_moved - load moved in current iteration | 8110 | * cur_ld_moved - load moved in current iteration |
@@ -8297,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) | |||
8297 | * idle_balance is called by schedule() if this_cpu is about to become | 8325 | * idle_balance is called by schedule() if this_cpu is about to become |
8298 | * idle. Attempts to pull tasks from other CPUs. | 8326 | * idle. Attempts to pull tasks from other CPUs. |
8299 | */ | 8327 | */ |
8300 | static int idle_balance(struct rq *this_rq) | 8328 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) |
8301 | { | 8329 | { |
8302 | unsigned long next_balance = jiffies + HZ; | 8330 | unsigned long next_balance = jiffies + HZ; |
8303 | int this_cpu = this_rq->cpu; | 8331 | int this_cpu = this_rq->cpu; |
@@ -8311,6 +8339,14 @@ static int idle_balance(struct rq *this_rq) | |||
8311 | */ | 8339 | */ |
8312 | this_rq->idle_stamp = rq_clock(this_rq); | 8340 | this_rq->idle_stamp = rq_clock(this_rq); |
8313 | 8341 | ||
8342 | /* | ||
8343 | * This is OK, because current is on_cpu, which avoids it being picked | ||
8344 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
8345 | * further scheduler activity on it and we're being very careful to | ||
8346 | * re-start the picking loop. | ||
8347 | */ | ||
8348 | rq_unpin_lock(this_rq, rf); | ||
8349 | |||
8314 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | 8350 | if (this_rq->avg_idle < sysctl_sched_migration_cost || |
8315 | !this_rq->rd->overload) { | 8351 | !this_rq->rd->overload) { |
8316 | rcu_read_lock(); | 8352 | rcu_read_lock(); |
@@ -8388,6 +8424,8 @@ out: | |||
8388 | if (pulled_task) | 8424 | if (pulled_task) |
8389 | this_rq->idle_stamp = 0; | 8425 | this_rq->idle_stamp = 0; |
8390 | 8426 | ||
8427 | rq_repin_lock(this_rq, rf); | ||
8428 | |||
8391 | return pulled_task; | 8429 | return pulled_task; |
8392 | } | 8430 | } |
8393 | 8431 | ||
@@ -8443,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8443 | }; | 8481 | }; |
8444 | 8482 | ||
8445 | schedstat_inc(sd->alb_count); | 8483 | schedstat_inc(sd->alb_count); |
8484 | update_rq_clock(busiest_rq); | ||
8446 | 8485 | ||
8447 | p = detach_one_task(&env); | 8486 | p = detach_one_task(&env); |
8448 | if (p) { | 8487 | if (p) { |
@@ -9264,6 +9303,7 @@ void online_fair_sched_group(struct task_group *tg) | |||
9264 | se = tg->se[i]; | 9303 | se = tg->se[i]; |
9265 | 9304 | ||
9266 | raw_spin_lock_irq(&rq->lock); | 9305 | raw_spin_lock_irq(&rq->lock); |
9306 | update_rq_clock(rq); | ||
9267 | attach_entity_cfs_rq(se); | 9307 | attach_entity_cfs_rq(se); |
9268 | sync_throttle(tg, i); | 9308 | sync_throttle(tg, i); |
9269 | raw_spin_unlock_irq(&rq->lock); | 9309 | raw_spin_unlock_irq(&rq->lock); |
@@ -9356,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
9356 | 9396 | ||
9357 | /* Possible calls to update_curr() need rq clock */ | 9397 | /* Possible calls to update_curr() need rq clock */ |
9358 | update_rq_clock(rq); | 9398 | update_rq_clock(rq); |
9359 | for_each_sched_entity(se) | 9399 | for_each_sched_entity(se) { |
9360 | update_cfs_shares(group_cfs_rq(se)); | 9400 | update_load_avg(se, UPDATE_TG); |
9401 | update_cfs_shares(se); | ||
9402 | } | ||
9361 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 9403 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
9362 | } | 9404 | } |
9363 | 9405 | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 5405d3feb112..0c00172db63e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct * | 26 | static struct task_struct * |
27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
28 | { | 28 | { |
29 | put_prev_task(rq, prev); | 29 | put_prev_task(rq, prev); |
30 | update_idle_core(rq); | 30 | update_idle_core(rq); |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a688a8206727..e8836cfc4cdb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/irq_work.h> | 9 | #include <linux/irq_work.h> |
10 | 10 | ||
11 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
12 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | ||
12 | 13 | ||
13 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | 14 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); |
14 | 15 | ||
@@ -1523,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1523 | } | 1524 | } |
1524 | 1525 | ||
1525 | static struct task_struct * | 1526 | static struct task_struct * |
1526 | pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 1527 | pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
1527 | { | 1528 | { |
1528 | struct task_struct *p; | 1529 | struct task_struct *p; |
1529 | struct rt_rq *rt_rq = &rq->rt; | 1530 | struct rt_rq *rt_rq = &rq->rt; |
@@ -1535,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo | |||
1535 | * disabled avoiding further scheduler activity on it and we're | 1536 | * disabled avoiding further scheduler activity on it and we're |
1536 | * being very careful to re-start the picking loop. | 1537 | * being very careful to re-start the picking loop. |
1537 | */ | 1538 | */ |
1538 | lockdep_unpin_lock(&rq->lock, cookie); | 1539 | rq_unpin_lock(rq, rf); |
1539 | pull_rt_task(rq); | 1540 | pull_rt_task(rq); |
1540 | lockdep_repin_lock(&rq->lock, cookie); | 1541 | rq_repin_lock(rq, rf); |
1541 | /* | 1542 | /* |
1542 | * pull_rt_task() can drop (and re-acquire) rq->lock; this | 1543 | * pull_rt_task() can drop (and re-acquire) rq->lock; this |
1543 | * means a dl or stop task can slip in, in which case we need | 1544 | * means a dl or stop task can slip in, in which case we need |
@@ -2198,10 +2199,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
2198 | #ifdef CONFIG_SMP | 2199 | #ifdef CONFIG_SMP |
2199 | if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) | 2200 | if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) |
2200 | queue_push_tasks(rq); | 2201 | queue_push_tasks(rq); |
2201 | #else | 2202 | #endif /* CONFIG_SMP */ |
2202 | if (p->prio < rq->curr->prio) | 2203 | if (p->prio < rq->curr->prio) |
2203 | resched_curr(rq); | 2204 | resched_curr(rq); |
2204 | #endif /* CONFIG_SMP */ | ||
2205 | } | 2205 | } |
2206 | } | 2206 | } |
2207 | 2207 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..71b10a9b73cf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/u64_stats_sync.h> | 5 | #include <linux/u64_stats_sync.h> |
6 | #include <linux/sched/deadline.h> | 6 | #include <linux/sched/deadline.h> |
7 | #include <linux/kernel_stat.h> | ||
7 | #include <linux/binfmts.h> | 8 | #include <linux/binfmts.h> |
8 | #include <linux/mutex.h> | 9 | #include <linux/mutex.h> |
9 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
@@ -222,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
222 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 223 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
223 | } | 224 | } |
224 | 225 | ||
225 | extern struct mutex sched_domains_mutex; | 226 | extern void init_dl_bw(struct dl_bw *dl_b); |
226 | 227 | ||
227 | #ifdef CONFIG_CGROUP_SCHED | 228 | #ifdef CONFIG_CGROUP_SCHED |
228 | 229 | ||
@@ -583,6 +584,13 @@ struct root_domain { | |||
583 | }; | 584 | }; |
584 | 585 | ||
585 | extern struct root_domain def_root_domain; | 586 | extern struct root_domain def_root_domain; |
587 | extern struct mutex sched_domains_mutex; | ||
588 | extern cpumask_var_t fallback_doms; | ||
589 | extern cpumask_var_t sched_domains_tmpmask; | ||
590 | |||
591 | extern void init_defrootdomain(void); | ||
592 | extern int init_sched_domains(const struct cpumask *cpu_map); | ||
593 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); | ||
586 | 594 | ||
587 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
588 | 596 | ||
@@ -644,7 +652,7 @@ struct rq { | |||
644 | unsigned long next_balance; | 652 | unsigned long next_balance; |
645 | struct mm_struct *prev_mm; | 653 | struct mm_struct *prev_mm; |
646 | 654 | ||
647 | unsigned int clock_skip_update; | 655 | unsigned int clock_update_flags; |
648 | u64 clock; | 656 | u64 clock; |
649 | u64 clock_task; | 657 | u64 clock_task; |
650 | 658 | ||
@@ -768,28 +776,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) | |||
768 | return READ_ONCE(rq->clock); | 776 | return READ_ONCE(rq->clock); |
769 | } | 777 | } |
770 | 778 | ||
779 | /* | ||
780 | * rq::clock_update_flags bits | ||
781 | * | ||
782 | * %RQCF_REQ_SKIP - will request skipping of clock update on the next | ||
783 | * call to __schedule(). This is an optimisation to avoid | ||
784 | * neighbouring rq clock updates. | ||
785 | * | ||
786 | * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is | ||
787 | * in effect and calls to update_rq_clock() are being ignored. | ||
788 | * | ||
789 | * %RQCF_UPDATED - is a debug flag that indicates whether a call has been | ||
790 | * made to update_rq_clock() since the last time rq::lock was pinned. | ||
791 | * | ||
792 | * If inside of __schedule(), clock_update_flags will have been | ||
793 | * shifted left (a left shift is a cheap operation for the fast path | ||
794 | * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, | ||
795 | * | ||
796 | * if (rq-clock_update_flags >= RQCF_UPDATED) | ||
797 | * | ||
798 | * to check if %RQCF_UPADTED is set. It'll never be shifted more than | ||
799 | * one position though, because the next rq_unpin_lock() will shift it | ||
800 | * back. | ||
801 | */ | ||
802 | #define RQCF_REQ_SKIP 0x01 | ||
803 | #define RQCF_ACT_SKIP 0x02 | ||
804 | #define RQCF_UPDATED 0x04 | ||
805 | |||
806 | static inline void assert_clock_updated(struct rq *rq) | ||
807 | { | ||
808 | /* | ||
809 | * The only reason for not seeing a clock update since the | ||
810 | * last rq_pin_lock() is if we're currently skipping updates. | ||
811 | */ | ||
812 | SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); | ||
813 | } | ||
814 | |||
771 | static inline u64 rq_clock(struct rq *rq) | 815 | static inline u64 rq_clock(struct rq *rq) |
772 | { | 816 | { |
773 | lockdep_assert_held(&rq->lock); | 817 | lockdep_assert_held(&rq->lock); |
818 | assert_clock_updated(rq); | ||
819 | |||
774 | return rq->clock; | 820 | return rq->clock; |
775 | } | 821 | } |
776 | 822 | ||
777 | static inline u64 rq_clock_task(struct rq *rq) | 823 | static inline u64 rq_clock_task(struct rq *rq) |
778 | { | 824 | { |
779 | lockdep_assert_held(&rq->lock); | 825 | lockdep_assert_held(&rq->lock); |
826 | assert_clock_updated(rq); | ||
827 | |||
780 | return rq->clock_task; | 828 | return rq->clock_task; |
781 | } | 829 | } |
782 | 830 | ||
783 | #define RQCF_REQ_SKIP 0x01 | ||
784 | #define RQCF_ACT_SKIP 0x02 | ||
785 | |||
786 | static inline void rq_clock_skip_update(struct rq *rq, bool skip) | 831 | static inline void rq_clock_skip_update(struct rq *rq, bool skip) |
787 | { | 832 | { |
788 | lockdep_assert_held(&rq->lock); | 833 | lockdep_assert_held(&rq->lock); |
789 | if (skip) | 834 | if (skip) |
790 | rq->clock_skip_update |= RQCF_REQ_SKIP; | 835 | rq->clock_update_flags |= RQCF_REQ_SKIP; |
791 | else | 836 | else |
792 | rq->clock_skip_update &= ~RQCF_REQ_SKIP; | 837 | rq->clock_update_flags &= ~RQCF_REQ_SKIP; |
838 | } | ||
839 | |||
840 | struct rq_flags { | ||
841 | unsigned long flags; | ||
842 | struct pin_cookie cookie; | ||
843 | #ifdef CONFIG_SCHED_DEBUG | ||
844 | /* | ||
845 | * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the | ||
846 | * current pin context is stashed here in case it needs to be | ||
847 | * restored in rq_repin_lock(). | ||
848 | */ | ||
849 | unsigned int clock_update_flags; | ||
850 | #endif | ||
851 | }; | ||
852 | |||
853 | static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) | ||
854 | { | ||
855 | rf->cookie = lockdep_pin_lock(&rq->lock); | ||
856 | |||
857 | #ifdef CONFIG_SCHED_DEBUG | ||
858 | rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); | ||
859 | rf->clock_update_flags = 0; | ||
860 | #endif | ||
861 | } | ||
862 | |||
863 | static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) | ||
864 | { | ||
865 | #ifdef CONFIG_SCHED_DEBUG | ||
866 | if (rq->clock_update_flags > RQCF_ACT_SKIP) | ||
867 | rf->clock_update_flags = RQCF_UPDATED; | ||
868 | #endif | ||
869 | |||
870 | lockdep_unpin_lock(&rq->lock, rf->cookie); | ||
871 | } | ||
872 | |||
873 | static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) | ||
874 | { | ||
875 | lockdep_repin_lock(&rq->lock, rf->cookie); | ||
876 | |||
877 | #ifdef CONFIG_SCHED_DEBUG | ||
878 | /* | ||
879 | * Restore the value we stashed in @rf for this pin context. | ||
880 | */ | ||
881 | rq->clock_update_flags |= rf->clock_update_flags; | ||
882 | #endif | ||
793 | } | 883 | } |
794 | 884 | ||
795 | #ifdef CONFIG_NUMA | 885 | #ifdef CONFIG_NUMA |
@@ -803,6 +893,16 @@ extern int sched_max_numa_distance; | |||
803 | extern bool find_numa_distance(int distance); | 893 | extern bool find_numa_distance(int distance); |
804 | #endif | 894 | #endif |
805 | 895 | ||
896 | #ifdef CONFIG_NUMA | ||
897 | extern void sched_init_numa(void); | ||
898 | extern void sched_domains_numa_masks_set(unsigned int cpu); | ||
899 | extern void sched_domains_numa_masks_clear(unsigned int cpu); | ||
900 | #else | ||
901 | static inline void sched_init_numa(void) { } | ||
902 | static inline void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
903 | static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
904 | #endif | ||
905 | |||
806 | #ifdef CONFIG_NUMA_BALANCING | 906 | #ifdef CONFIG_NUMA_BALANCING |
807 | /* The regions in numa_faults array from task_struct */ | 907 | /* The regions in numa_faults array from task_struct */ |
808 | enum numa_faults_stats { | 908 | enum numa_faults_stats { |
@@ -969,7 +1069,7 @@ static inline void sched_ttwu_pending(void) { } | |||
969 | #endif /* CONFIG_SMP */ | 1069 | #endif /* CONFIG_SMP */ |
970 | 1070 | ||
971 | #include "stats.h" | 1071 | #include "stats.h" |
972 | #include "auto_group.h" | 1072 | #include "autogroup.h" |
973 | 1073 | ||
974 | #ifdef CONFIG_CGROUP_SCHED | 1074 | #ifdef CONFIG_CGROUP_SCHED |
975 | 1075 | ||
@@ -1245,7 +1345,7 @@ struct sched_class { | |||
1245 | */ | 1345 | */ |
1246 | struct task_struct * (*pick_next_task) (struct rq *rq, | 1346 | struct task_struct * (*pick_next_task) (struct rq *rq, |
1247 | struct task_struct *prev, | 1347 | struct task_struct *prev, |
1248 | struct pin_cookie cookie); | 1348 | struct rq_flags *rf); |
1249 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1349 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
1250 | 1350 | ||
1251 | #ifdef CONFIG_SMP | 1351 | #ifdef CONFIG_SMP |
@@ -1501,11 +1601,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } | |||
1501 | static inline void sched_avg_update(struct rq *rq) { } | 1601 | static inline void sched_avg_update(struct rq *rq) { } |
1502 | #endif | 1602 | #endif |
1503 | 1603 | ||
1504 | struct rq_flags { | ||
1505 | unsigned long flags; | ||
1506 | struct pin_cookie cookie; | ||
1507 | }; | ||
1508 | |||
1509 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1604 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
1510 | __acquires(rq->lock); | 1605 | __acquires(rq->lock); |
1511 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1606 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
@@ -1515,7 +1610,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
1515 | static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) | 1610 | static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) |
1516 | __releases(rq->lock) | 1611 | __releases(rq->lock) |
1517 | { | 1612 | { |
1518 | lockdep_unpin_lock(&rq->lock, rf->cookie); | 1613 | rq_unpin_lock(rq, rf); |
1519 | raw_spin_unlock(&rq->lock); | 1614 | raw_spin_unlock(&rq->lock); |
1520 | } | 1615 | } |
1521 | 1616 | ||
@@ -1524,7 +1619,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) | |||
1524 | __releases(rq->lock) | 1619 | __releases(rq->lock) |
1525 | __releases(p->pi_lock) | 1620 | __releases(p->pi_lock) |
1526 | { | 1621 | { |
1527 | lockdep_unpin_lock(&rq->lock, rf->cookie); | 1622 | rq_unpin_lock(rq, rf); |
1528 | raw_spin_unlock(&rq->lock); | 1623 | raw_spin_unlock(&rq->lock); |
1529 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); | 1624 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); |
1530 | } | 1625 | } |
@@ -1674,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1674 | __release(rq2->lock); | 1769 | __release(rq2->lock); |
1675 | } | 1770 | } |
1676 | 1771 | ||
1772 | extern void set_rq_online (struct rq *rq); | ||
1773 | extern void set_rq_offline(struct rq *rq); | ||
1774 | extern bool sched_smp_initialized; | ||
1775 | |||
1677 | #else /* CONFIG_SMP */ | 1776 | #else /* CONFIG_SMP */ |
1678 | 1777 | ||
1679 | /* | 1778 | /* |
@@ -1750,8 +1849,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
1750 | 1849 | ||
1751 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1850 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1752 | struct irqtime { | 1851 | struct irqtime { |
1753 | u64 hardirq_time; | 1852 | u64 tick_delta; |
1754 | u64 softirq_time; | ||
1755 | u64 irq_start_time; | 1853 | u64 irq_start_time; |
1756 | struct u64_stats_sync sync; | 1854 | struct u64_stats_sync sync; |
1757 | }; | 1855 | }; |
@@ -1761,12 +1859,13 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime); | |||
1761 | static inline u64 irq_time_read(int cpu) | 1859 | static inline u64 irq_time_read(int cpu) |
1762 | { | 1860 | { |
1763 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); | 1861 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
1862 | u64 *cpustat = kcpustat_cpu(cpu).cpustat; | ||
1764 | unsigned int seq; | 1863 | unsigned int seq; |
1765 | u64 total; | 1864 | u64 total; |
1766 | 1865 | ||
1767 | do { | 1866 | do { |
1768 | seq = __u64_stats_fetch_begin(&irqtime->sync); | 1867 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
1769 | total = irqtime->softirq_time + irqtime->hardirq_time; | 1868 | total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; |
1770 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); | 1869 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
1771 | 1870 | ||
1772 | return total; | 1871 | return total; |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c69a9870ab79..bf0da0aa0a14 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -224,7 +224,7 @@ struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) | |||
224 | * running CPU and update the utime field there. | 224 | * running CPU and update the utime field there. |
225 | */ | 225 | */ |
226 | static inline void account_group_user_time(struct task_struct *tsk, | 226 | static inline void account_group_user_time(struct task_struct *tsk, |
227 | cputime_t cputime) | 227 | u64 cputime) |
228 | { | 228 | { |
229 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); | 229 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); |
230 | 230 | ||
@@ -245,7 +245,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
245 | * running CPU and update the stime field there. | 245 | * running CPU and update the stime field there. |
246 | */ | 246 | */ |
247 | static inline void account_group_system_time(struct task_struct *tsk, | 247 | static inline void account_group_system_time(struct task_struct *tsk, |
248 | cputime_t cputime) | 248 | u64 cputime) |
249 | { | 249 | { |
250 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); | 250 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); |
251 | 251 | ||
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 604297a08b3a..9f69fb630853 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | |||
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct * | 26 | static struct task_struct * |
27 | pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
28 | { | 28 | { |
29 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
30 | 30 | ||
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c new file mode 100644 index 000000000000..1b0b4fb12837 --- /dev/null +++ b/kernel/sched/topology.c | |||
@@ -0,0 +1,1658 @@ | |||
1 | /* | ||
2 | * Scheduler topology setup/handling methods | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mutex.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | DEFINE_MUTEX(sched_domains_mutex); | ||
10 | |||
11 | /* Protected by sched_domains_mutex: */ | ||
12 | cpumask_var_t sched_domains_tmpmask; | ||
13 | |||
14 | #ifdef CONFIG_SCHED_DEBUG | ||
15 | |||
16 | static __read_mostly int sched_debug_enabled; | ||
17 | |||
18 | static int __init sched_debug_setup(char *str) | ||
19 | { | ||
20 | sched_debug_enabled = 1; | ||
21 | |||
22 | return 0; | ||
23 | } | ||
24 | early_param("sched_debug", sched_debug_setup); | ||
25 | |||
26 | static inline bool sched_debug(void) | ||
27 | { | ||
28 | return sched_debug_enabled; | ||
29 | } | ||
30 | |||
31 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
32 | struct cpumask *groupmask) | ||
33 | { | ||
34 | struct sched_group *group = sd->groups; | ||
35 | |||
36 | cpumask_clear(groupmask); | ||
37 | |||
38 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
39 | |||
40 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
41 | printk("does not load-balance\n"); | ||
42 | if (sd->parent) | ||
43 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
44 | " has parent"); | ||
45 | return -1; | ||
46 | } | ||
47 | |||
48 | printk(KERN_CONT "span %*pbl level %s\n", | ||
49 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
50 | |||
51 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
52 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
53 | "CPU%d\n", cpu); | ||
54 | } | ||
55 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
56 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
57 | " CPU%d\n", cpu); | ||
58 | } | ||
59 | |||
60 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
61 | do { | ||
62 | if (!group) { | ||
63 | printk("\n"); | ||
64 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
65 | break; | ||
66 | } | ||
67 | |||
68 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
69 | printk(KERN_CONT "\n"); | ||
70 | printk(KERN_ERR "ERROR: empty group\n"); | ||
71 | break; | ||
72 | } | ||
73 | |||
74 | if (!(sd->flags & SD_OVERLAP) && | ||
75 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
76 | printk(KERN_CONT "\n"); | ||
77 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
78 | break; | ||
79 | } | ||
80 | |||
81 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
82 | |||
83 | printk(KERN_CONT " %*pbl", | ||
84 | cpumask_pr_args(sched_group_cpus(group))); | ||
85 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
86 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
87 | group->sgc->capacity); | ||
88 | } | ||
89 | |||
90 | group = group->next; | ||
91 | } while (group != sd->groups); | ||
92 | printk(KERN_CONT "\n"); | ||
93 | |||
94 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
95 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
96 | |||
97 | if (sd->parent && | ||
98 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
99 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
100 | "of domain->span\n"); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
105 | { | ||
106 | int level = 0; | ||
107 | |||
108 | if (!sched_debug_enabled) | ||
109 | return; | ||
110 | |||
111 | if (!sd) { | ||
112 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
113 | return; | ||
114 | } | ||
115 | |||
116 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
117 | |||
118 | for (;;) { | ||
119 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
120 | break; | ||
121 | level++; | ||
122 | sd = sd->parent; | ||
123 | if (!sd) | ||
124 | break; | ||
125 | } | ||
126 | } | ||
127 | #else /* !CONFIG_SCHED_DEBUG */ | ||
128 | |||
129 | # define sched_debug_enabled 0 | ||
130 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
131 | static inline bool sched_debug(void) | ||
132 | { | ||
133 | return false; | ||
134 | } | ||
135 | #endif /* CONFIG_SCHED_DEBUG */ | ||
136 | |||
137 | static int sd_degenerate(struct sched_domain *sd) | ||
138 | { | ||
139 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
140 | return 1; | ||
141 | |||
142 | /* Following flags need at least 2 groups */ | ||
143 | if (sd->flags & (SD_LOAD_BALANCE | | ||
144 | SD_BALANCE_NEWIDLE | | ||
145 | SD_BALANCE_FORK | | ||
146 | SD_BALANCE_EXEC | | ||
147 | SD_SHARE_CPUCAPACITY | | ||
148 | SD_ASYM_CPUCAPACITY | | ||
149 | SD_SHARE_PKG_RESOURCES | | ||
150 | SD_SHARE_POWERDOMAIN)) { | ||
151 | if (sd->groups != sd->groups->next) | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /* Following flags don't use groups */ | ||
156 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
157 | return 0; | ||
158 | |||
159 | return 1; | ||
160 | } | ||
161 | |||
162 | static int | ||
163 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
164 | { | ||
165 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
166 | |||
167 | if (sd_degenerate(parent)) | ||
168 | return 1; | ||
169 | |||
170 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
171 | return 0; | ||
172 | |||
173 | /* Flags needing groups don't count if only 1 group in parent */ | ||
174 | if (parent->groups == parent->groups->next) { | ||
175 | pflags &= ~(SD_LOAD_BALANCE | | ||
176 | SD_BALANCE_NEWIDLE | | ||
177 | SD_BALANCE_FORK | | ||
178 | SD_BALANCE_EXEC | | ||
179 | SD_ASYM_CPUCAPACITY | | ||
180 | SD_SHARE_CPUCAPACITY | | ||
181 | SD_SHARE_PKG_RESOURCES | | ||
182 | SD_PREFER_SIBLING | | ||
183 | SD_SHARE_POWERDOMAIN); | ||
184 | if (nr_node_ids == 1) | ||
185 | pflags &= ~SD_SERIALIZE; | ||
186 | } | ||
187 | if (~cflags & pflags) | ||
188 | return 0; | ||
189 | |||
190 | return 1; | ||
191 | } | ||
192 | |||
193 | static void free_rootdomain(struct rcu_head *rcu) | ||
194 | { | ||
195 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
196 | |||
197 | cpupri_cleanup(&rd->cpupri); | ||
198 | cpudl_cleanup(&rd->cpudl); | ||
199 | free_cpumask_var(rd->dlo_mask); | ||
200 | free_cpumask_var(rd->rto_mask); | ||
201 | free_cpumask_var(rd->online); | ||
202 | free_cpumask_var(rd->span); | ||
203 | kfree(rd); | ||
204 | } | ||
205 | |||
206 | void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
207 | { | ||
208 | struct root_domain *old_rd = NULL; | ||
209 | unsigned long flags; | ||
210 | |||
211 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
212 | |||
213 | if (rq->rd) { | ||
214 | old_rd = rq->rd; | ||
215 | |||
216 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
217 | set_rq_offline(rq); | ||
218 | |||
219 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
220 | |||
221 | /* | ||
222 | * If we dont want to free the old_rd yet then | ||
223 | * set old_rd to NULL to skip the freeing later | ||
224 | * in this function: | ||
225 | */ | ||
226 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
227 | old_rd = NULL; | ||
228 | } | ||
229 | |||
230 | atomic_inc(&rd->refcount); | ||
231 | rq->rd = rd; | ||
232 | |||
233 | cpumask_set_cpu(rq->cpu, rd->span); | ||
234 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
235 | set_rq_online(rq); | ||
236 | |||
237 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
238 | |||
239 | if (old_rd) | ||
240 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
241 | } | ||
242 | |||
243 | static int init_rootdomain(struct root_domain *rd) | ||
244 | { | ||
245 | memset(rd, 0, sizeof(*rd)); | ||
246 | |||
247 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
248 | goto out; | ||
249 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
250 | goto free_span; | ||
251 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
252 | goto free_online; | ||
253 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
254 | goto free_dlo_mask; | ||
255 | |||
256 | init_dl_bw(&rd->dl_bw); | ||
257 | if (cpudl_init(&rd->cpudl) != 0) | ||
258 | goto free_rto_mask; | ||
259 | |||
260 | if (cpupri_init(&rd->cpupri) != 0) | ||
261 | goto free_cpudl; | ||
262 | return 0; | ||
263 | |||
264 | free_cpudl: | ||
265 | cpudl_cleanup(&rd->cpudl); | ||
266 | free_rto_mask: | ||
267 | free_cpumask_var(rd->rto_mask); | ||
268 | free_dlo_mask: | ||
269 | free_cpumask_var(rd->dlo_mask); | ||
270 | free_online: | ||
271 | free_cpumask_var(rd->online); | ||
272 | free_span: | ||
273 | free_cpumask_var(rd->span); | ||
274 | out: | ||
275 | return -ENOMEM; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * By default the system creates a single root-domain with all CPUs as | ||
280 | * members (mimicking the global state we have today). | ||
281 | */ | ||
282 | struct root_domain def_root_domain; | ||
283 | |||
284 | void init_defrootdomain(void) | ||
285 | { | ||
286 | init_rootdomain(&def_root_domain); | ||
287 | |||
288 | atomic_set(&def_root_domain.refcount, 1); | ||
289 | } | ||
290 | |||
291 | static struct root_domain *alloc_rootdomain(void) | ||
292 | { | ||
293 | struct root_domain *rd; | ||
294 | |||
295 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
296 | if (!rd) | ||
297 | return NULL; | ||
298 | |||
299 | if (init_rootdomain(rd) != 0) { | ||
300 | kfree(rd); | ||
301 | return NULL; | ||
302 | } | ||
303 | |||
304 | return rd; | ||
305 | } | ||
306 | |||
307 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
308 | { | ||
309 | struct sched_group *tmp, *first; | ||
310 | |||
311 | if (!sg) | ||
312 | return; | ||
313 | |||
314 | first = sg; | ||
315 | do { | ||
316 | tmp = sg->next; | ||
317 | |||
318 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
319 | kfree(sg->sgc); | ||
320 | |||
321 | kfree(sg); | ||
322 | sg = tmp; | ||
323 | } while (sg != first); | ||
324 | } | ||
325 | |||
326 | static void destroy_sched_domain(struct sched_domain *sd) | ||
327 | { | ||
328 | /* | ||
329 | * If its an overlapping domain it has private groups, iterate and | ||
330 | * nuke them all. | ||
331 | */ | ||
332 | if (sd->flags & SD_OVERLAP) { | ||
333 | free_sched_groups(sd->groups, 1); | ||
334 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
335 | kfree(sd->groups->sgc); | ||
336 | kfree(sd->groups); | ||
337 | } | ||
338 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
339 | kfree(sd->shared); | ||
340 | kfree(sd); | ||
341 | } | ||
342 | |||
343 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
344 | { | ||
345 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
346 | |||
347 | while (sd) { | ||
348 | struct sched_domain *parent = sd->parent; | ||
349 | destroy_sched_domain(sd); | ||
350 | sd = parent; | ||
351 | } | ||
352 | } | ||
353 | |||
354 | static void destroy_sched_domains(struct sched_domain *sd) | ||
355 | { | ||
356 | if (sd) | ||
357 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Keep a special pointer to the highest sched_domain that has | ||
362 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | ||
363 | * allows us to avoid some pointer chasing select_idle_sibling(). | ||
364 | * | ||
365 | * Also keep a unique ID per domain (we use the first CPU number in | ||
366 | * the cpumask of the domain), this allows us to quickly tell if | ||
367 | * two CPUs are in the same cache domain, see cpus_share_cache(). | ||
368 | */ | ||
369 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
370 | DEFINE_PER_CPU(int, sd_llc_size); | ||
371 | DEFINE_PER_CPU(int, sd_llc_id); | ||
372 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
373 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
374 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
375 | |||
376 | static void update_top_cache_domain(int cpu) | ||
377 | { | ||
378 | struct sched_domain_shared *sds = NULL; | ||
379 | struct sched_domain *sd; | ||
380 | int id = cpu; | ||
381 | int size = 1; | ||
382 | |||
383 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
384 | if (sd) { | ||
385 | id = cpumask_first(sched_domain_span(sd)); | ||
386 | size = cpumask_weight(sched_domain_span(sd)); | ||
387 | sds = sd->shared; | ||
388 | } | ||
389 | |||
390 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
391 | per_cpu(sd_llc_size, cpu) = size; | ||
392 | per_cpu(sd_llc_id, cpu) = id; | ||
393 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
394 | |||
395 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
396 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
397 | |||
398 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
399 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
404 | * hold the hotplug lock. | ||
405 | */ | ||
406 | static void | ||
407 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
408 | { | ||
409 | struct rq *rq = cpu_rq(cpu); | ||
410 | struct sched_domain *tmp; | ||
411 | |||
412 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
413 | for (tmp = sd; tmp; ) { | ||
414 | struct sched_domain *parent = tmp->parent; | ||
415 | if (!parent) | ||
416 | break; | ||
417 | |||
418 | if (sd_parent_degenerate(tmp, parent)) { | ||
419 | tmp->parent = parent->parent; | ||
420 | if (parent->parent) | ||
421 | parent->parent->child = tmp; | ||
422 | /* | ||
423 | * Transfer SD_PREFER_SIBLING down in case of a | ||
424 | * degenerate parent; the spans match for this | ||
425 | * so the property transfers. | ||
426 | */ | ||
427 | if (parent->flags & SD_PREFER_SIBLING) | ||
428 | tmp->flags |= SD_PREFER_SIBLING; | ||
429 | destroy_sched_domain(parent); | ||
430 | } else | ||
431 | tmp = tmp->parent; | ||
432 | } | ||
433 | |||
434 | if (sd && sd_degenerate(sd)) { | ||
435 | tmp = sd; | ||
436 | sd = sd->parent; | ||
437 | destroy_sched_domain(tmp); | ||
438 | if (sd) | ||
439 | sd->child = NULL; | ||
440 | } | ||
441 | |||
442 | sched_domain_debug(sd, cpu); | ||
443 | |||
444 | rq_attach_root(rq, rd); | ||
445 | tmp = rq->sd; | ||
446 | rcu_assign_pointer(rq->sd, sd); | ||
447 | destroy_sched_domains(tmp); | ||
448 | |||
449 | update_top_cache_domain(cpu); | ||
450 | } | ||
451 | |||
452 | /* Setup the mask of CPUs configured for isolated domains */ | ||
453 | static int __init isolated_cpu_setup(char *str) | ||
454 | { | ||
455 | int ret; | ||
456 | |||
457 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
458 | ret = cpulist_parse(str, cpu_isolated_map); | ||
459 | if (ret) { | ||
460 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
461 | return 0; | ||
462 | } | ||
463 | return 1; | ||
464 | } | ||
465 | __setup("isolcpus=", isolated_cpu_setup); | ||
466 | |||
467 | struct s_data { | ||
468 | struct sched_domain ** __percpu sd; | ||
469 | struct root_domain *rd; | ||
470 | }; | ||
471 | |||
472 | enum s_alloc { | ||
473 | sa_rootdomain, | ||
474 | sa_sd, | ||
475 | sa_sd_storage, | ||
476 | sa_none, | ||
477 | }; | ||
478 | |||
479 | /* | ||
480 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
481 | * domain traversal. | ||
482 | * | ||
483 | * Asymmetric node setups can result in situations where the domain tree is of | ||
484 | * unequal depth, make sure to skip domains that already cover the entire | ||
485 | * range. | ||
486 | * | ||
487 | * In that case build_sched_domains() will have terminated the iteration early | ||
488 | * and our sibling sd spans will be empty. Domains should always include the | ||
489 | * CPU they're built on, so check that. | ||
490 | */ | ||
491 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
492 | { | ||
493 | const struct cpumask *span = sched_domain_span(sd); | ||
494 | struct sd_data *sdd = sd->private; | ||
495 | struct sched_domain *sibling; | ||
496 | int i; | ||
497 | |||
498 | for_each_cpu(i, span) { | ||
499 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
500 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
501 | continue; | ||
502 | |||
503 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
504 | } | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Return the canonical balance CPU for this group, this is the first CPU | ||
509 | * of this group that's also in the iteration mask. | ||
510 | */ | ||
511 | int group_balance_cpu(struct sched_group *sg) | ||
512 | { | ||
513 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
514 | } | ||
515 | |||
516 | static int | ||
517 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
518 | { | ||
519 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
520 | const struct cpumask *span = sched_domain_span(sd); | ||
521 | struct cpumask *covered = sched_domains_tmpmask; | ||
522 | struct sd_data *sdd = sd->private; | ||
523 | struct sched_domain *sibling; | ||
524 | int i; | ||
525 | |||
526 | cpumask_clear(covered); | ||
527 | |||
528 | for_each_cpu(i, span) { | ||
529 | struct cpumask *sg_span; | ||
530 | |||
531 | if (cpumask_test_cpu(i, covered)) | ||
532 | continue; | ||
533 | |||
534 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
535 | |||
536 | /* See the comment near build_group_mask(). */ | ||
537 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
538 | continue; | ||
539 | |||
540 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
541 | GFP_KERNEL, cpu_to_node(cpu)); | ||
542 | |||
543 | if (!sg) | ||
544 | goto fail; | ||
545 | |||
546 | sg_span = sched_group_cpus(sg); | ||
547 | if (sibling->child) | ||
548 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
549 | else | ||
550 | cpumask_set_cpu(i, sg_span); | ||
551 | |||
552 | cpumask_or(covered, covered, sg_span); | ||
553 | |||
554 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
555 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
556 | build_group_mask(sd, sg); | ||
557 | |||
558 | /* | ||
559 | * Initialize sgc->capacity such that even if we mess up the | ||
560 | * domains and no possible iteration will get us here, we won't | ||
561 | * die on a /0 trap. | ||
562 | */ | ||
563 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
564 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
565 | |||
566 | /* | ||
567 | * Make sure the first group of this domain contains the | ||
568 | * canonical balance CPU. Otherwise the sched_domain iteration | ||
569 | * breaks. See update_sg_lb_stats(). | ||
570 | */ | ||
571 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
572 | group_balance_cpu(sg) == cpu) | ||
573 | groups = sg; | ||
574 | |||
575 | if (!first) | ||
576 | first = sg; | ||
577 | if (last) | ||
578 | last->next = sg; | ||
579 | last = sg; | ||
580 | last->next = first; | ||
581 | } | ||
582 | sd->groups = groups; | ||
583 | |||
584 | return 0; | ||
585 | |||
586 | fail: | ||
587 | free_sched_groups(first, 0); | ||
588 | |||
589 | return -ENOMEM; | ||
590 | } | ||
591 | |||
592 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
593 | { | ||
594 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
595 | struct sched_domain *child = sd->child; | ||
596 | |||
597 | if (child) | ||
598 | cpu = cpumask_first(sched_domain_span(child)); | ||
599 | |||
600 | if (sg) { | ||
601 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
602 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
603 | |||
604 | /* For claim_allocations: */ | ||
605 | atomic_set(&(*sg)->sgc->ref, 1); | ||
606 | } | ||
607 | |||
608 | return cpu; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * build_sched_groups will build a circular linked list of the groups | ||
613 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
614 | * and ->cpu_capacity to 0. | ||
615 | * | ||
616 | * Assumes the sched_domain tree is fully constructed | ||
617 | */ | ||
618 | static int | ||
619 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
620 | { | ||
621 | struct sched_group *first = NULL, *last = NULL; | ||
622 | struct sd_data *sdd = sd->private; | ||
623 | const struct cpumask *span = sched_domain_span(sd); | ||
624 | struct cpumask *covered; | ||
625 | int i; | ||
626 | |||
627 | get_group(cpu, sdd, &sd->groups); | ||
628 | atomic_inc(&sd->groups->ref); | ||
629 | |||
630 | if (cpu != cpumask_first(span)) | ||
631 | return 0; | ||
632 | |||
633 | lockdep_assert_held(&sched_domains_mutex); | ||
634 | covered = sched_domains_tmpmask; | ||
635 | |||
636 | cpumask_clear(covered); | ||
637 | |||
638 | for_each_cpu(i, span) { | ||
639 | struct sched_group *sg; | ||
640 | int group, j; | ||
641 | |||
642 | if (cpumask_test_cpu(i, covered)) | ||
643 | continue; | ||
644 | |||
645 | group = get_group(i, sdd, &sg); | ||
646 | cpumask_setall(sched_group_mask(sg)); | ||
647 | |||
648 | for_each_cpu(j, span) { | ||
649 | if (get_group(j, sdd, NULL) != group) | ||
650 | continue; | ||
651 | |||
652 | cpumask_set_cpu(j, covered); | ||
653 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
654 | } | ||
655 | |||
656 | if (!first) | ||
657 | first = sg; | ||
658 | if (last) | ||
659 | last->next = sg; | ||
660 | last = sg; | ||
661 | } | ||
662 | last->next = first; | ||
663 | |||
664 | return 0; | ||
665 | } | ||
666 | |||
667 | /* | ||
668 | * Initialize sched groups cpu_capacity. | ||
669 | * | ||
670 | * cpu_capacity indicates the capacity of sched group, which is used while | ||
671 | * distributing the load between different sched groups in a sched domain. | ||
672 | * Typically cpu_capacity for all the groups in a sched domain will be same | ||
673 | * unless there are asymmetries in the topology. If there are asymmetries, | ||
674 | * group having more cpu_capacity will pickup more load compared to the | ||
675 | * group having less cpu_capacity. | ||
676 | */ | ||
677 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
678 | { | ||
679 | struct sched_group *sg = sd->groups; | ||
680 | |||
681 | WARN_ON(!sg); | ||
682 | |||
683 | do { | ||
684 | int cpu, max_cpu = -1; | ||
685 | |||
686 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
687 | |||
688 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
689 | goto next; | ||
690 | |||
691 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
692 | if (max_cpu < 0) | ||
693 | max_cpu = cpu; | ||
694 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
695 | max_cpu = cpu; | ||
696 | } | ||
697 | sg->asym_prefer_cpu = max_cpu; | ||
698 | |||
699 | next: | ||
700 | sg = sg->next; | ||
701 | } while (sg != sd->groups); | ||
702 | |||
703 | if (cpu != group_balance_cpu(sg)) | ||
704 | return; | ||
705 | |||
706 | update_group_capacity(sd, cpu); | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * Initializers for schedule domains | ||
711 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
712 | */ | ||
713 | |||
714 | static int default_relax_domain_level = -1; | ||
715 | int sched_domain_level_max; | ||
716 | |||
717 | static int __init setup_relax_domain_level(char *str) | ||
718 | { | ||
719 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
720 | pr_warn("Unable to set relax_domain_level\n"); | ||
721 | |||
722 | return 1; | ||
723 | } | ||
724 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
725 | |||
726 | static void set_domain_attribute(struct sched_domain *sd, | ||
727 | struct sched_domain_attr *attr) | ||
728 | { | ||
729 | int request; | ||
730 | |||
731 | if (!attr || attr->relax_domain_level < 0) { | ||
732 | if (default_relax_domain_level < 0) | ||
733 | return; | ||
734 | else | ||
735 | request = default_relax_domain_level; | ||
736 | } else | ||
737 | request = attr->relax_domain_level; | ||
738 | if (request < sd->level) { | ||
739 | /* Turn off idle balance on this domain: */ | ||
740 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
741 | } else { | ||
742 | /* Turn on idle balance on this domain: */ | ||
743 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
744 | } | ||
745 | } | ||
746 | |||
747 | static void __sdt_free(const struct cpumask *cpu_map); | ||
748 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
749 | |||
750 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
751 | const struct cpumask *cpu_map) | ||
752 | { | ||
753 | switch (what) { | ||
754 | case sa_rootdomain: | ||
755 | if (!atomic_read(&d->rd->refcount)) | ||
756 | free_rootdomain(&d->rd->rcu); | ||
757 | /* Fall through */ | ||
758 | case sa_sd: | ||
759 | free_percpu(d->sd); | ||
760 | /* Fall through */ | ||
761 | case sa_sd_storage: | ||
762 | __sdt_free(cpu_map); | ||
763 | /* Fall through */ | ||
764 | case sa_none: | ||
765 | break; | ||
766 | } | ||
767 | } | ||
768 | |||
769 | static enum s_alloc | ||
770 | __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | ||
771 | { | ||
772 | memset(d, 0, sizeof(*d)); | ||
773 | |||
774 | if (__sdt_alloc(cpu_map)) | ||
775 | return sa_sd_storage; | ||
776 | d->sd = alloc_percpu(struct sched_domain *); | ||
777 | if (!d->sd) | ||
778 | return sa_sd_storage; | ||
779 | d->rd = alloc_rootdomain(); | ||
780 | if (!d->rd) | ||
781 | return sa_sd; | ||
782 | return sa_rootdomain; | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * NULL the sd_data elements we've used to build the sched_domain and | ||
787 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
788 | * will not free the data we're using. | ||
789 | */ | ||
790 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
791 | { | ||
792 | struct sd_data *sdd = sd->private; | ||
793 | |||
794 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
795 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
796 | |||
797 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
798 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
799 | |||
800 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
801 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
802 | |||
803 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
804 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
805 | } | ||
806 | |||
807 | #ifdef CONFIG_NUMA | ||
808 | static int sched_domains_numa_levels; | ||
809 | enum numa_topology_type sched_numa_topology_type; | ||
810 | static int *sched_domains_numa_distance; | ||
811 | int sched_max_numa_distance; | ||
812 | static struct cpumask ***sched_domains_numa_masks; | ||
813 | static int sched_domains_curr_level; | ||
814 | #endif | ||
815 | |||
816 | /* | ||
817 | * SD_flags allowed in topology descriptions. | ||
818 | * | ||
819 | * These flags are purely descriptive of the topology and do not prescribe | ||
820 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
821 | * function: | ||
822 | * | ||
823 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
824 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
825 | * SD_NUMA - describes NUMA topologies | ||
826 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
827 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
828 | * | ||
829 | * Odd one out, which beside describing the topology has a quirk also | ||
830 | * prescribes the desired behaviour that goes along with it: | ||
831 | * | ||
832 | * SD_ASYM_PACKING - describes SMT quirks | ||
833 | */ | ||
834 | #define TOPOLOGY_SD_FLAGS \ | ||
835 | (SD_SHARE_CPUCAPACITY | \ | ||
836 | SD_SHARE_PKG_RESOURCES | \ | ||
837 | SD_NUMA | \ | ||
838 | SD_ASYM_PACKING | \ | ||
839 | SD_ASYM_CPUCAPACITY | \ | ||
840 | SD_SHARE_POWERDOMAIN) | ||
841 | |||
842 | static struct sched_domain * | ||
843 | sd_init(struct sched_domain_topology_level *tl, | ||
844 | const struct cpumask *cpu_map, | ||
845 | struct sched_domain *child, int cpu) | ||
846 | { | ||
847 | struct sd_data *sdd = &tl->data; | ||
848 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
849 | int sd_id, sd_weight, sd_flags = 0; | ||
850 | |||
851 | #ifdef CONFIG_NUMA | ||
852 | /* | ||
853 | * Ugly hack to pass state to sd_numa_mask()... | ||
854 | */ | ||
855 | sched_domains_curr_level = tl->numa_level; | ||
856 | #endif | ||
857 | |||
858 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
859 | |||
860 | if (tl->sd_flags) | ||
861 | sd_flags = (*tl->sd_flags)(); | ||
862 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
863 | "wrong sd_flags in topology description\n")) | ||
864 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
865 | |||
866 | *sd = (struct sched_domain){ | ||
867 | .min_interval = sd_weight, | ||
868 | .max_interval = 2*sd_weight, | ||
869 | .busy_factor = 32, | ||
870 | .imbalance_pct = 125, | ||
871 | |||
872 | .cache_nice_tries = 0, | ||
873 | .busy_idx = 0, | ||
874 | .idle_idx = 0, | ||
875 | .newidle_idx = 0, | ||
876 | .wake_idx = 0, | ||
877 | .forkexec_idx = 0, | ||
878 | |||
879 | .flags = 1*SD_LOAD_BALANCE | ||
880 | | 1*SD_BALANCE_NEWIDLE | ||
881 | | 1*SD_BALANCE_EXEC | ||
882 | | 1*SD_BALANCE_FORK | ||
883 | | 0*SD_BALANCE_WAKE | ||
884 | | 1*SD_WAKE_AFFINE | ||
885 | | 0*SD_SHARE_CPUCAPACITY | ||
886 | | 0*SD_SHARE_PKG_RESOURCES | ||
887 | | 0*SD_SERIALIZE | ||
888 | | 0*SD_PREFER_SIBLING | ||
889 | | 0*SD_NUMA | ||
890 | | sd_flags | ||
891 | , | ||
892 | |||
893 | .last_balance = jiffies, | ||
894 | .balance_interval = sd_weight, | ||
895 | .smt_gain = 0, | ||
896 | .max_newidle_lb_cost = 0, | ||
897 | .next_decay_max_lb_cost = jiffies, | ||
898 | .child = child, | ||
899 | #ifdef CONFIG_SCHED_DEBUG | ||
900 | .name = tl->name, | ||
901 | #endif | ||
902 | }; | ||
903 | |||
904 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
905 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
906 | |||
907 | /* | ||
908 | * Convert topological properties into behaviour. | ||
909 | */ | ||
910 | |||
911 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
912 | struct sched_domain *t = sd; | ||
913 | |||
914 | for_each_lower_domain(t) | ||
915 | t->flags |= SD_BALANCE_WAKE; | ||
916 | } | ||
917 | |||
918 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
919 | sd->flags |= SD_PREFER_SIBLING; | ||
920 | sd->imbalance_pct = 110; | ||
921 | sd->smt_gain = 1178; /* ~15% */ | ||
922 | |||
923 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
924 | sd->imbalance_pct = 117; | ||
925 | sd->cache_nice_tries = 1; | ||
926 | sd->busy_idx = 2; | ||
927 | |||
928 | #ifdef CONFIG_NUMA | ||
929 | } else if (sd->flags & SD_NUMA) { | ||
930 | sd->cache_nice_tries = 2; | ||
931 | sd->busy_idx = 3; | ||
932 | sd->idle_idx = 2; | ||
933 | |||
934 | sd->flags |= SD_SERIALIZE; | ||
935 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
936 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
937 | SD_BALANCE_FORK | | ||
938 | SD_WAKE_AFFINE); | ||
939 | } | ||
940 | |||
941 | #endif | ||
942 | } else { | ||
943 | sd->flags |= SD_PREFER_SIBLING; | ||
944 | sd->cache_nice_tries = 1; | ||
945 | sd->busy_idx = 2; | ||
946 | sd->idle_idx = 1; | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * For all levels sharing cache; connect a sched_domain_shared | ||
951 | * instance. | ||
952 | */ | ||
953 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
954 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
955 | atomic_inc(&sd->shared->ref); | ||
956 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
957 | } | ||
958 | |||
959 | sd->private = sdd; | ||
960 | |||
961 | return sd; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Topology list, bottom-up. | ||
966 | */ | ||
967 | static struct sched_domain_topology_level default_topology[] = { | ||
968 | #ifdef CONFIG_SCHED_SMT | ||
969 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
970 | #endif | ||
971 | #ifdef CONFIG_SCHED_MC | ||
972 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
973 | #endif | ||
974 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
975 | { NULL, }, | ||
976 | }; | ||
977 | |||
/* Topology currently in use; arches may override via set_sched_topology(). */
static struct sched_domain_topology_level *sched_domain_topology =
	default_topology;

/* Walk the topology levels bottom-up until the NULL-mask sentinel. */
#define for_each_sd_topology(tl)			\
	for (tl = sched_domain_topology; tl->mask; tl++)
983 | |||
984 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
985 | { | ||
986 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
987 | return; | ||
988 | |||
989 | sched_domain_topology = tl; | ||
990 | } | ||
991 | |||
992 | #ifdef CONFIG_NUMA | ||
993 | |||
994 | static const struct cpumask *sd_numa_mask(int cpu) | ||
995 | { | ||
996 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
997 | } | ||
998 | |||
999 | static void sched_numa_warn(const char *str) | ||
1000 | { | ||
1001 | static int done = false; | ||
1002 | int i,j; | ||
1003 | |||
1004 | if (done) | ||
1005 | return; | ||
1006 | |||
1007 | done = true; | ||
1008 | |||
1009 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
1010 | |||
1011 | for (i = 0; i < nr_node_ids; i++) { | ||
1012 | printk(KERN_WARNING " "); | ||
1013 | for (j = 0; j < nr_node_ids; j++) | ||
1014 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
1015 | printk(KERN_CONT "\n"); | ||
1016 | } | ||
1017 | printk(KERN_WARNING "\n"); | ||
1018 | } | ||
1019 | |||
1020 | bool find_numa_distance(int distance) | ||
1021 | { | ||
1022 | int i; | ||
1023 | |||
1024 | if (distance == node_distance(0, 0)) | ||
1025 | return true; | ||
1026 | |||
1027 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1028 | if (sched_domains_numa_distance[i] == distance) | ||
1029 | return true; | ||
1030 | } | ||
1031 | |||
1032 | return false; | ||
1033 | } | ||
1034 | |||
1035 | /* | ||
1036 | * A system can have three types of NUMA topology: | ||
1037 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
1038 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
1039 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
1040 | * | ||
1041 | * The difference between a glueless mesh topology and a backplane | ||
1042 | * topology lies in whether communication between not directly | ||
1043 | * connected nodes goes through intermediary nodes (where programs | ||
1044 | * could run), or through backplane controllers. This affects | ||
1045 | * placement of programs. | ||
1046 | * | ||
1047 | * The type of topology can be discerned with the following tests: | ||
1048 | * - If the maximum distance between any nodes is 1 hop, the system | ||
1049 | * is directly connected. | ||
1050 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
1051 | * there is an intermediary node C, which is < N hops away from both | ||
1052 | * nodes A and B, the system is a glueless mesh. | ||
1053 | */ | ||
1054 | static void init_numa_topology_type(void) | ||
1055 | { | ||
1056 | int a, b, c, n; | ||
1057 | |||
1058 | n = sched_max_numa_distance; | ||
1059 | |||
1060 | if (sched_domains_numa_levels <= 1) { | ||
1061 | sched_numa_topology_type = NUMA_DIRECT; | ||
1062 | return; | ||
1063 | } | ||
1064 | |||
1065 | for_each_online_node(a) { | ||
1066 | for_each_online_node(b) { | ||
1067 | /* Find two nodes furthest removed from each other. */ | ||
1068 | if (node_distance(a, b) < n) | ||
1069 | continue; | ||
1070 | |||
1071 | /* Is there an intermediary node between a and b? */ | ||
1072 | for_each_online_node(c) { | ||
1073 | if (node_distance(a, c) < n && | ||
1074 | node_distance(b, c) < n) { | ||
1075 | sched_numa_topology_type = | ||
1076 | NUMA_GLUELESS_MESH; | ||
1077 | return; | ||
1078 | } | ||
1079 | } | ||
1080 | |||
1081 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
1082 | return; | ||
1083 | } | ||
1084 | } | ||
1085 | } | ||
1086 | |||
/*
 * Build the NUMA scheduling hierarchy: extract the unique inter-node
 * distances from node_distance(), construct per-level/per-node CPU
 * masks, and append one topology level per remote distance to the
 * topology list used by build_sched_domains().
 *
 * Boot-time one-shot. On allocation failure it simply returns, leaving
 * the topology NUMA-unaware; earlier allocations are not unwound --
 * presumably acceptable for a once-at-boot path, TODO confirm.
 */
void sched_init_numa(void)
{
	int next_distance, curr_distance = node_distance(0, 0);
	struct sched_domain_topology_level *tl;
	int level = 0;
	int i, j, k;

	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
	if (!sched_domains_numa_distance)
		return;

	/*
	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
	 * unique distances in the node_distance() table.
	 *
	 * Assumes node_distance(0,j) includes all distances in
	 * node_distance(i,j) in order to avoid cubic time.
	 */
	next_distance = curr_distance;
	for (i = 0; i < nr_node_ids; i++) {
		for (j = 0; j < nr_node_ids; j++) {
			for (k = 0; k < nr_node_ids; k++) {
				int distance = node_distance(i, k);

				/* Pick the smallest distance strictly above curr_distance: */
				if (distance > curr_distance &&
				    (distance < next_distance ||
				     next_distance == curr_distance))
					next_distance = distance;

				/*
				 * While not a strong assumption it would be nice to know
				 * about cases where if node A is connected to B, B is not
				 * equally connected to A.
				 */
				if (sched_debug() && node_distance(k, i) != distance)
					sched_numa_warn("Node-distance not symmetric");

				if (sched_debug() && i && !find_numa_distance(distance))
					sched_numa_warn("Node-0 not representative");
			}
			if (next_distance != curr_distance) {
				sched_domains_numa_distance[level++] = next_distance;
				sched_domains_numa_levels = level;
				curr_distance = next_distance;
			} else break;
		}

		/*
		 * In case of sched_debug() we verify the above assumption.
		 */
		if (!sched_debug())
			break;
	}

	/* No remote distances found: nothing to build. */
	if (!level)
		return;

	/*
	 * 'level' contains the number of unique distances, excluding the
	 * identity distance node_distance(i,i).
	 *
	 * The sched_domains_numa_distance[] array includes the actual distance
	 * numbers.
	 */

	/*
	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
	 * the array will contain less then 'level' members. This could be
	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
	 * in other functions.
	 *
	 * We reset it to 'level' at the end of this function.
	 */
	sched_domains_numa_levels = 0;

	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
	if (!sched_domains_numa_masks)
		return;

	/*
	 * Now for each level, construct a mask per node which contains all
	 * CPUs of nodes that are that many hops away from us.
	 */
	for (i = 0; i < level; i++) {
		sched_domains_numa_masks[i] =
			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
		if (!sched_domains_numa_masks[i])
			return;

		for (j = 0; j < nr_node_ids; j++) {
			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
			if (!mask)
				return;

			sched_domains_numa_masks[i][j] = mask;

			/* Merge in every node within this level's distance of j: */
			for_each_node(k) {
				if (node_distance(j, k) > sched_domains_numa_distance[i])
					continue;

				cpumask_or(mask, mask, cpumask_of_node(k));
			}
		}
	}

	/* Compute default topology size */
	for (i = 0; sched_domain_topology[i].mask; i++);

	tl = kzalloc((i + level + 1) *
			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
	if (!tl)
		return;

	/*
	 * Copy the default topology bits..
	 */
	for (i = 0; sched_domain_topology[i].mask; i++)
		tl[i] = sched_domain_topology[i];

	/*
	 * .. and append 'j' levels of NUMA goodness.
	 */
	for (j = 0; j < level; i++, j++) {
		tl[i] = (struct sched_domain_topology_level){
			.mask = sd_numa_mask,
			.sd_flags = cpu_numa_flags,
			.flags = SDTL_OVERLAP,
			.numa_level = j,
			SD_INIT_NAME(NUMA)
		};
	}

	/* Publish the extended topology and the fully-built mask arrays: */
	sched_domain_topology = tl;

	sched_domains_numa_levels = level;
	sched_max_numa_distance = sched_domains_numa_distance[level - 1];

	init_numa_topology_type();
}
1227 | |||
1228 | void sched_domains_numa_masks_set(unsigned int cpu) | ||
1229 | { | ||
1230 | int node = cpu_to_node(cpu); | ||
1231 | int i, j; | ||
1232 | |||
1233 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1234 | for (j = 0; j < nr_node_ids; j++) { | ||
1235 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
1236 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1237 | } | ||
1238 | } | ||
1239 | } | ||
1240 | |||
1241 | void sched_domains_numa_masks_clear(unsigned int cpu) | ||
1242 | { | ||
1243 | int i, j; | ||
1244 | |||
1245 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1246 | for (j = 0; j < nr_node_ids; j++) | ||
1247 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1248 | } | ||
1249 | } | ||
1250 | |||
1251 | #endif /* CONFIG_NUMA */ | ||
1252 | |||
/*
 * Allocate, for every topology level and every CPU in @cpu_map, the
 * four per-CPU backing objects: sched_domain, sched_domain_shared,
 * sched_group and sched_group_capacity.
 *
 * Returns 0 on success, -ENOMEM on failure. On failure the partially
 * populated percpu arrays are left in place; presumably the caller
 * unwinds via __sdt_free() -- TODO confirm against the caller.
 */
static int __sdt_alloc(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		/* One pointer per CPU for each of the four object kinds: */
		sdd->sd = alloc_percpu(struct sched_domain *);
		if (!sdd->sd)
			return -ENOMEM;

		sdd->sds = alloc_percpu(struct sched_domain_shared *);
		if (!sdd->sds)
			return -ENOMEM;

		sdd->sg = alloc_percpu(struct sched_group *);
		if (!sdd->sg)
			return -ENOMEM;

		sdd->sgc = alloc_percpu(struct sched_group_capacity *);
		if (!sdd->sgc)
			return -ENOMEM;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;
			struct sched_domain_shared *sds;
			struct sched_group *sg;
			struct sched_group_capacity *sgc;

			/* Trailing cpumask_size() bytes hold the domain's span. */
			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sd)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sd, j) = sd;

			sds = kzalloc_node(sizeof(struct sched_domain_shared),
					GFP_KERNEL, cpu_to_node(j));
			if (!sds)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sds, j) = sds;

			/* Trailing cpumask_size() bytes hold the group's span. */
			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sg)
				return -ENOMEM;

			/* A lone group is a circular list of one: */
			sg->next = sg;

			*per_cpu_ptr(sdd->sg, j) = sg;

			sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
					GFP_KERNEL, cpu_to_node(j));
			if (!sgc)
				return -ENOMEM;

			*per_cpu_ptr(sdd->sgc, j) = sgc;
		}
	}

	return 0;
}
1317 | |||
/*
 * Free everything __sdt_alloc() set up for the CPUs in @cpu_map.
 * Safe on a partially-constructed sd_data: every percpu array is
 * checked before use and kfree()/free_percpu() accept NULL.
 */
static void __sdt_free(const struct cpumask *cpu_map)
{
	struct sched_domain_topology_level *tl;
	int j;

	for_each_sd_topology(tl) {
		struct sd_data *sdd = &tl->data;

		for_each_cpu(j, cpu_map) {
			struct sched_domain *sd;

			if (sdd->sd) {
				sd = *per_cpu_ptr(sdd->sd, j);
				/* Overlapping domains own their group list: */
				if (sd && (sd->flags & SD_OVERLAP))
					free_sched_groups(sd->groups, 0);
				kfree(*per_cpu_ptr(sdd->sd, j));
			}

			if (sdd->sds)
				kfree(*per_cpu_ptr(sdd->sds, j));
			if (sdd->sg)
				kfree(*per_cpu_ptr(sdd->sg, j));
			if (sdd->sgc)
				kfree(*per_cpu_ptr(sdd->sgc, j));
		}
		/* Drop the percpu arrays themselves and NULL them for reuse: */
		free_percpu(sdd->sd);
		sdd->sd = NULL;
		free_percpu(sdd->sds);
		sdd->sds = NULL;
		free_percpu(sdd->sg);
		sdd->sg = NULL;
		free_percpu(sdd->sgc);
		sdd->sgc = NULL;
	}
}
1353 | |||
/*
 * Construct one sched_domain level for @cpu from topology level @tl,
 * link it up as the parent of @child and apply @attr. Returns the new
 * domain so the caller can stack the next level on top of it.
 */
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
		struct sched_domain *child, int cpu)
{
	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);

	if (child) {
		sd->level = child->level + 1;
		sched_domain_level_max = max(sched_domain_level_max, sd->level);
		child->parent = sd;

		/* A parent's span must cover its child's span: */
		if (!cpumask_subset(sched_domain_span(child),
				    sched_domain_span(sd))) {
			pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
			pr_err(" the %s domain not a subset of the %s domain\n",
					child->name, sd->name);
#endif
			/* Fixup, ensure @sd has at least @child cpus. */
			cpumask_or(sched_domain_span(sd),
				   sched_domain_span(sd),
				   sched_domain_span(child));
		}

	}
	set_domain_attribute(sd, attr);

	return sd;
}
1383 | |||
1384 | /* | ||
1385 | * Build sched domains for a given set of CPUs and attach the sched domains | ||
1386 | * to the individual CPUs | ||
1387 | */ | ||
1388 | static int | ||
1389 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) | ||
1390 | { | ||
1391 | enum s_alloc alloc_state; | ||
1392 | struct sched_domain *sd; | ||
1393 | struct s_data d; | ||
1394 | struct rq *rq = NULL; | ||
1395 | int i, ret = -ENOMEM; | ||
1396 | |||
1397 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
1398 | if (alloc_state != sa_rootdomain) | ||
1399 | goto error; | ||
1400 | |||
1401 | /* Set up domains for CPUs specified by the cpu_map: */ | ||
1402 | for_each_cpu(i, cpu_map) { | ||
1403 | struct sched_domain_topology_level *tl; | ||
1404 | |||
1405 | sd = NULL; | ||
1406 | for_each_sd_topology(tl) { | ||
1407 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
1408 | if (tl == sched_domain_topology) | ||
1409 | *per_cpu_ptr(d.sd, i) = sd; | ||
1410 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
1411 | sd->flags |= SD_OVERLAP; | ||
1412 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
1413 | break; | ||
1414 | } | ||
1415 | } | ||
1416 | |||
1417 | /* Build the groups for the domains */ | ||
1418 | for_each_cpu(i, cpu_map) { | ||
1419 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1420 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
1421 | if (sd->flags & SD_OVERLAP) { | ||
1422 | if (build_overlap_sched_groups(sd, i)) | ||
1423 | goto error; | ||
1424 | } else { | ||
1425 | if (build_sched_groups(sd, i)) | ||
1426 | goto error; | ||
1427 | } | ||
1428 | } | ||
1429 | } | ||
1430 | |||
1431 | /* Calculate CPU capacity for physical packages and nodes */ | ||
1432 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
1433 | if (!cpumask_test_cpu(i, cpu_map)) | ||
1434 | continue; | ||
1435 | |||
1436 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1437 | claim_allocations(i, sd); | ||
1438 | init_sched_groups_capacity(i, sd); | ||
1439 | } | ||
1440 | } | ||
1441 | |||
1442 | /* Attach the domains */ | ||
1443 | rcu_read_lock(); | ||
1444 | for_each_cpu(i, cpu_map) { | ||
1445 | rq = cpu_rq(i); | ||
1446 | sd = *per_cpu_ptr(d.sd, i); | ||
1447 | |||
1448 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
1449 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
1450 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
1451 | |||
1452 | cpu_attach_domain(sd, d.rd, i); | ||
1453 | } | ||
1454 | rcu_read_unlock(); | ||
1455 | |||
1456 | if (rq && sched_debug_enabled) { | ||
1457 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
1458 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
1459 | } | ||
1460 | |||
1461 | ret = 0; | ||
1462 | error: | ||
1463 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
1464 | return ret; | ||
1465 | } | ||
1466 | |||
/* Current sched domains: */
static cpumask_var_t *doms_cur;

/* Number of sched domains in 'doms_cur': */
static int ndoms_cur;

/* Attributes of custom domains in 'doms_cur' (NULL means default attributes): */
static struct sched_domain_attr *dattr_cur;

/*
 * Special case: If a kmalloc() of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
cpumask_var_t fallback_doms;
1482 | |||
1483 | /* | ||
1484 | * arch_update_cpu_topology lets virtualized architectures update the | ||
1485 | * CPU core maps. It is supposed to return 1 if the topology changed | ||
1486 | * or 0 if it stayed the same. | ||
1487 | */ | ||
1488 | int __weak arch_update_cpu_topology(void) | ||
1489 | { | ||
1490 | return 0; | ||
1491 | } | ||
1492 | |||
1493 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
1494 | { | ||
1495 | int i; | ||
1496 | cpumask_var_t *doms; | ||
1497 | |||
1498 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
1499 | if (!doms) | ||
1500 | return NULL; | ||
1501 | for (i = 0; i < ndoms; i++) { | ||
1502 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
1503 | free_sched_domains(doms, i); | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | } | ||
1507 | return doms; | ||
1508 | } | ||
1509 | |||
1510 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
1511 | { | ||
1512 | unsigned int i; | ||
1513 | for (i = 0; i < ndoms; i++) | ||
1514 | free_cpumask_var(doms[i]); | ||
1515 | kfree(doms); | ||
1516 | } | ||
1517 | |||
1518 | /* | ||
1519 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
1520 | * For now this just excludes isolated CPUs, but could be used to | ||
1521 | * exclude other special cases in the future. | ||
1522 | */ | ||
1523 | int init_sched_domains(const struct cpumask *cpu_map) | ||
1524 | { | ||
1525 | int err; | ||
1526 | |||
1527 | arch_update_cpu_topology(); | ||
1528 | ndoms_cur = 1; | ||
1529 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
1530 | if (!doms_cur) | ||
1531 | doms_cur = &fallback_doms; | ||
1532 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
1533 | err = build_sched_domains(doms_cur[0], NULL); | ||
1534 | register_sched_domain_sysctl(); | ||
1535 | |||
1536 | return err; | ||
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Detach sched domains from a group of CPUs specified in cpu_map | ||
1541 | * These CPUs will now be attached to the NULL domain | ||
1542 | */ | ||
1543 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
1544 | { | ||
1545 | int i; | ||
1546 | |||
1547 | rcu_read_lock(); | ||
1548 | for_each_cpu(i, cpu_map) | ||
1549 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
1550 | rcu_read_unlock(); | ||
1551 | } | ||
1552 | |||
1553 | /* handle null as "default" */ | ||
1554 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
1555 | struct sched_domain_attr *new, int idx_new) | ||
1556 | { | ||
1557 | struct sched_domain_attr tmp; | ||
1558 | |||
1559 | /* Fast path: */ | ||
1560 | if (!new && !cur) | ||
1561 | return 1; | ||
1562 | |||
1563 | tmp = SD_ATTR_INIT; | ||
1564 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
1565 | new ? (new + idx_new) : &tmp, | ||
1566 | sizeof(struct sched_domain_attr)); | ||
1567 | } | ||
1568 | |||
1569 | /* | ||
1570 | * Partition sched domains as specified by the 'ndoms_new' | ||
1571 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
1572 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
1573 | * It destroys each deleted domain and builds each new domain. | ||
1574 | * | ||
1575 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
1576 | * The masks don't intersect (don't overlap.) We should setup one | ||
1577 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
1578 | * not be load balanced. If the same cpumask appears both in the | ||
1579 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
1580 | * it as it is. | ||
1581 | * | ||
1582 | * The passed in 'doms_new' should be allocated using | ||
1583 | * alloc_sched_domains. This routine takes ownership of it and will | ||
1584 | * free_sched_domains it when done with it. If the caller failed the | ||
1585 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
1586 | * and partition_sched_domains() will fallback to the single partition | ||
1587 | * 'fallback_doms', it also forces the domains to be rebuilt. | ||
1588 | * | ||
1589 | * If doms_new == NULL it will be replaced with cpu_online_mask. | ||
1590 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
1591 | * and it will not create the default domain. | ||
1592 | * | ||
1593 | * Call with hotplug lock held | ||
1594 | */ | ||
1595 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
1596 | struct sched_domain_attr *dattr_new) | ||
1597 | { | ||
1598 | int i, j, n; | ||
1599 | int new_topology; | ||
1600 | |||
1601 | mutex_lock(&sched_domains_mutex); | ||
1602 | |||
1603 | /* Always unregister in case we don't destroy any domains: */ | ||
1604 | unregister_sched_domain_sysctl(); | ||
1605 | |||
1606 | /* Let the architecture update CPU core mappings: */ | ||
1607 | new_topology = arch_update_cpu_topology(); | ||
1608 | |||
1609 | n = doms_new ? ndoms_new : 0; | ||
1610 | |||
1611 | /* Destroy deleted domains: */ | ||
1612 | for (i = 0; i < ndoms_cur; i++) { | ||
1613 | for (j = 0; j < n && !new_topology; j++) { | ||
1614 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
1615 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
1616 | goto match1; | ||
1617 | } | ||
1618 | /* No match - a current sched domain not in new doms_new[] */ | ||
1619 | detach_destroy_domains(doms_cur[i]); | ||
1620 | match1: | ||
1621 | ; | ||
1622 | } | ||
1623 | |||
1624 | n = ndoms_cur; | ||
1625 | if (doms_new == NULL) { | ||
1626 | n = 0; | ||
1627 | doms_new = &fallback_doms; | ||
1628 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
1629 | WARN_ON_ONCE(dattr_new); | ||
1630 | } | ||
1631 | |||
1632 | /* Build new domains: */ | ||
1633 | for (i = 0; i < ndoms_new; i++) { | ||
1634 | for (j = 0; j < n && !new_topology; j++) { | ||
1635 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
1636 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
1637 | goto match2; | ||
1638 | } | ||
1639 | /* No match - add a new doms_new */ | ||
1640 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
1641 | match2: | ||
1642 | ; | ||
1643 | } | ||
1644 | |||
1645 | /* Remember the new sched domains: */ | ||
1646 | if (doms_cur != &fallback_doms) | ||
1647 | free_sched_domains(doms_cur, ndoms_cur); | ||
1648 | |||
1649 | kfree(dattr_cur); | ||
1650 | doms_cur = doms_new; | ||
1651 | dattr_cur = dattr_new; | ||
1652 | ndoms_cur = ndoms_new; | ||
1653 | |||
1654 | register_sched_domain_sysctl(); | ||
1655 | |||
1656 | mutex_unlock(&sched_domains_mutex); | ||
1657 | } | ||
1658 | |||