aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
committerJonathan Herman <hermanjl@cs.unc.edu>2013-01-17 16:15:55 -0500
commit8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
treea8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /kernel/sched
parent406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile18
-rw-r--r--kernel/sched/auto_group.c258
-rw-r--r--kernel/sched/auto_group.h64
-rw-r--r--kernel/sched/clock.c350
-rw-r--r--kernel/sched/core.c8162
-rw-r--r--kernel/sched/cpupri.c240
-rw-r--r--kernel/sched/cpupri.h34
-rw-r--r--kernel/sched/cputime.c589
-rw-r--r--kernel/sched/debug.c531
-rw-r--r--kernel/sched/fair.c6174
-rw-r--r--kernel/sched/features.h79
-rw-r--r--kernel/sched/idle_task.c98
-rw-r--r--kernel/sched/rt.c2094
-rw-r--r--kernel/sched/sched.h1241
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h231
-rw-r--r--kernel/sched/stop_task.c128
17 files changed, 0 insertions, 20402 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
deleted file mode 100644
index f06d249e103..00000000000
--- a/kernel/sched/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
# Kbuild rules for the scheduler.  clock.o must not be profiled: the
# function tracer itself calls into sched_clock(), so -pg would recurse.
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
# Core scheduler objects are always built; the rest are config-gated.
14obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
deleted file mode 100644
index 0984a21076a..00000000000
--- a/kernel/sched/auto_group.c
+++ /dev/null
@@ -1,258 +0,0 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include "sched.h"
4
5#include <linux/proc_fs.h>
6#include <linux/seq_file.h>
7#include <linux/kallsyms.h>
8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
11
/* Runtime on/off switch for autogrouping; boot default is enabled. */
12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
/* Fallback group handed out when a real autogroup cannot be obtained. */
13static struct autogroup autogroup_default;
/* Monotonic id source for /proc-visible autogroup ids. */
14static atomic_t autogroup_seq_nr;
15
/*
 * Boot-time setup: back the default autogroup with the root task group
 * and attach it to the init task's signal struct.
 */
16void __init autogroup_init(struct task_struct *init_task)
17{
18	autogroup_default.tg = &root_task_group;
19	kref_init(&autogroup_default.kref);
20	init_rwsem(&autogroup_default.lock);
21	init_task->signal->autogroup = &autogroup_default;
22}
23
/* Release the autogroup memory hanging off a task_group being freed. */
24void autogroup_free(struct task_group *tg)
25{
26	kfree(tg->autogroup);
27}
28
/*
 * kref release callback: tear down the backing task_group once the last
 * reference to the autogroup is dropped.
 */
29static inline void autogroup_destroy(struct kref *kref)
30{
31	struct autogroup *ag = container_of(kref, struct autogroup, kref);
32
33#ifdef CONFIG_RT_GROUP_SCHED
34	/* We've redirected RT tasks to the root task group... */
35	ag->tg->rt_se = NULL;
36	ag->tg->rt_rq = NULL;
37#endif
38	sched_destroy_group(ag->tg);
39}
40
/* Drop one reference; destroys the autogroup when it was the last one. */
41static inline void autogroup_kref_put(struct autogroup *ag)
42{
43	kref_put(&ag->kref, autogroup_destroy);
44}
45
/* Take one reference and hand the group back for call-chaining. */
46static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
47{
48	kref_get(&ag->kref);
49	return ag;
50}
51
/*
 * Get a referenced pointer to @p's autogroup.  If the sighand lock cannot
 * be taken (task is exiting), fall back to the default group so the
 * caller always receives something it can safely kref_put().
 */
52static inline struct autogroup *autogroup_task_get(struct task_struct *p)
53{
54	struct autogroup *ag;
55	unsigned long flags;
56
57	if (!lock_task_sighand(p, &flags))
58		return autogroup_kref_get(&autogroup_default);
59
60	ag = autogroup_kref_get(p->signal->autogroup);
61	unlock_task_sighand(p, &flags);
62
63	return ag;
64}
65
66static inline struct autogroup *autogroup_create(void)
67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
69 struct task_group *tg;
70
71 if (!ag)
72 goto out_fail;
73
74 tg = sched_create_group(&root_task_group);
75
76 if (IS_ERR(tg))
77 goto out_free;
78
79 kref_init(&ag->kref);
80 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr);
82 ag->tg = tg;
83#ifdef CONFIG_RT_GROUP_SCHED
84 /*
85 * Autogroup RT tasks are redirected to the root task group
86 * so we don't have to move tasks around upon policy change,
87 * or flail around trying to allocate bandwidth on the fly.
88 * A bandwidth exception in __sched_setscheduler() allows
89 * the policy change to proceed. Thereafter, task_group()
90 * returns &root_task_group, so zero bandwidth is required.
91 */
92 free_rt_sched_group(tg);
93 tg->rt_se = root_task_group.rt_se;
94 tg->rt_rq = root_task_group.rt_rq;
95#endif
96 tg->autogroup = ag;
97
98 return ag;
99
100out_free:
101 kfree(ag);
102out_fail:
103 if (printk_ratelimit()) {
104 printk(KERN_WARNING "autogroup_create: %s failure.\n",
105 ag ? "sched_create_group()" : "kmalloc()");
106 }
107
108 return autogroup_kref_get(&autogroup_default);
109}
110
111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112{
113 if (tg != &root_task_group)
114 return false;
115
116 if (p->sched_class != &fair_sched_class)
117 return false;
118
119 /*
120 * We can only assume the task group can't go away on us if
121 * autogroup_move_group() can see us on ->thread_group list.
122 */
123 if (p->flags & PF_EXITING)
124 return false;
125
126 return true;
127}
128
/*
 * Switch every thread of @p's thread group over to autogroup @ag.
 *
 * Runs under the sighand lock so ->signal->autogroup cannot change
 * underneath us.  Takes a new reference on @ag for the signal struct and
 * drops the reference that was held on the previous group.  When the
 * feature is disabled only the pointer is swapped; tasks are not
 * migrated between cfs runqueues.
 */
129static void
130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
131{
132	struct autogroup *prev;
133	struct task_struct *t;
134	unsigned long flags;
135
136	BUG_ON(!lock_task_sighand(p, &flags));
137
138	prev = p->signal->autogroup;
139	if (prev == ag) {
140		unlock_task_sighand(p, &flags);
141		return;
142	}
143
144	p->signal->autogroup = autogroup_kref_get(ag);
145
146	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
147		goto out;
148
149	t = p;
150	do {
151		sched_move_task(t);
152	} while_each_thread(p, t);
153
154out:
155	unlock_task_sighand(p, &flags);
/* Drop the signal struct's reference on the group we just replaced. */
156	autogroup_kref_put(prev);
157}
158
159/* Allocates GFP_KERNEL, cannot be called under any spinlock */
160void sched_autogroup_create_attach(struct task_struct *p)
161{
162	struct autogroup *ag = autogroup_create();
163
164	autogroup_move_group(p, ag);
165	/* drop extra reference added by autogroup_create() */
166	autogroup_kref_put(ag);
167}
168EXPORT_SYMBOL(sched_autogroup_create_attach);
169
170/* Cannot be called under siglock. Currently has no users */
171void sched_autogroup_detach(struct task_struct *p)
172{
173	autogroup_move_group(p, &autogroup_default);
174}
175EXPORT_SYMBOL(sched_autogroup_detach);
176
/* Fork path: the new signal struct inherits the parent's autogroup. */
177void sched_autogroup_fork(struct signal_struct *sig)
178{
179	sig->autogroup = autogroup_task_get(current);
180}
181
/* Exit path: release the signal struct's autogroup reference. */
182void sched_autogroup_exit(struct signal_struct *sig)
183{
184	autogroup_kref_put(sig->autogroup);
185}
186
/* Handle the "noautogroup" kernel command-line option. */
187static int __init setup_autogroup(char *str)
188{
189	sysctl_sched_autogroup_enabled = 0;
190
191	return 1;
192}
193
194__setup("noautogroup", setup_autogroup);
195
196#ifdef CONFIG_PROC_FS
197
/*
 * /proc interface: set the nice level of @p's autogroup by adjusting the
 * backing task_group's shares.  Returns 0 or a -errno.  Unprivileged
 * callers are rate-limited to one update per HZ/10 jiffies.
 *
 * NOTE(review): the static `next` is read/written without a lock;
 * presumably benign for a rate limiter, but racy updates are possible --
 * confirm this is intentional.
 */
198int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
199{
200	static unsigned long next = INITIAL_JIFFIES;
201	struct autogroup *ag;
202	int err;
203
/* Valid nice range is -20..19. */
204	if (nice < -20 || nice > 19)
205		return -EINVAL;
206
207	err = security_task_setnice(current, nice);
208	if (err)
209		return err;
210
/* Raising priority (nice < 0) requires the usual nice capability. */
211	if (nice < 0 && !can_nice(current, nice))
212		return -EPERM;
213
214	/* this is a heavy operation taking global locks.. */
215	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
216		return -EAGAIN;
217
218	next = HZ / 10 + jiffies;
219	ag = autogroup_task_get(p);
220
/* ag->lock serializes concurrent nice/shares updates on this group. */
221	down_write(&ag->lock);
222	err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
223	if (!err)
224		ag->nice = nice;
225	up_write(&ag->lock);
226
227	autogroup_kref_put(ag);
228
229	return err;
230}
231
/*
 * /proc interface: print "/autogroup-<id> nice <n>" for @p, but only if
 * the task actually sits in a real autogroup (not the default/root one).
 */
232void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
233{
234	struct autogroup *ag = autogroup_task_get(p);
235
236	if (!task_group_is_autogroup(ag->tg))
237		goto out;
238
239	down_read(&ag->lock);
240	seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
241	up_read(&ag->lock);
242
243out:
244	autogroup_kref_put(ag);
245}
246#endif /* CONFIG_PROC_FS */
247
248#ifdef CONFIG_SCHED_DEBUG
/*
 * Format "/autogroup-<id>" into @buf for scheduler debug output.
 * Returns 0 when @tg is not an autogroup, else the snprintf() result.
 */
249int autogroup_path(struct task_group *tg, char *buf, int buflen)
250{
251	if (!task_group_is_autogroup(tg))
252		return 0;
253
254	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
255}
256#endif /* CONFIG_SCHED_DEBUG */
257
258#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
deleted file mode 100644
index 8bd04714281..00000000000
--- a/kernel/sched/auto_group.h
+++ /dev/null
@@ -1,64 +0,0 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
6struct autogroup {
7 /*
8 * reference doesn't mean how many thread attach to this
9 * autogroup now. It just stands for the number of task
10 * could use this autogroup.
11 */
12 struct kref kref;
13 struct task_group *tg;
14 struct rw_semaphore lock;
15 unsigned long id;
16 int nice;
17};
18
19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
29static inline struct task_group *
30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
41
42#else /* !CONFIG_SCHED_AUTOGROUP */
43
44static inline void autogroup_init(struct task_struct *init_task) { }
45static inline void autogroup_free(struct task_group *tg) { }
46static inline bool task_group_is_autogroup(struct task_group *tg)
47{
48 return 0;
49}
50
51static inline struct task_group *
52autogroup_task_group(struct task_struct *p, struct task_group *tg)
53{
54 return tg;
55}
56
57#ifdef CONFIG_SCHED_DEBUG
58static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
59{
60 return 0;
61}
62#endif
63
64#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
deleted file mode 100644
index c685e31492d..00000000000
--- a/kernel/sched/clock.c
+++ /dev/null
@@ -1,350 +0,0 @@
1/*
2 * sched_clock for unstable cpu clocks
3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 * Updates and enhancements:
7 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
8 *
9 * Based on code by:
10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com>
12 *
13 *
14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotonic)
43 * - sched_clock()
44 * - explicit idle events
45 *
46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
47 * deltas are filtered to provide monotonicity and keeping it within an
48 * expected window.
49 *
50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
51 * that is otherwise invisible (TSC gets stopped).
52 *
53 *
54 * Notes:
55 *
56 * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
59 * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
62 */
63#include <linux/spinlock.h>
64#include <linux/hardirq.h>
65#include <linux/export.h>
66#include <linux/percpu.h>
67#include <linux/ktime.h>
68#include <linux/sched.h>
69
70/*
71 * Scheduler clock - returns current time in nanosec units.
72 * This is default implementation.
73 * Architectures and sub-architectures can override this.
74 */
/*
 * Jiffies-based fallback: resolution is limited to one scheduler tick
 * (NSEC_PER_SEC / HZ).  The weak attribute lets arch code replace it.
 */
75unsigned long long __attribute__((weak)) sched_clock(void)
76{
77	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
78					* (NSEC_PER_SEC / HZ);
79}
80EXPORT_SYMBOL_GPL(sched_clock);
81
/* Set once sched_clock_init() has run; gates early-boot callers. */
82__read_mostly int sched_clock_running;
83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/* Nonzero when the raw clock is trusted and no per-cpu filtering is needed. */
85__read_mostly int sched_clock_stable;
86
/* Per-cpu filter state: last tick's raw/GTOD samples and filtered clock. */
87struct sched_clock_data {
88	u64			tick_raw;
89	u64			tick_gtod;
90	u64			clock;
91};
92
93static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
94
/* Clock data of the current cpu. */
95static inline struct sched_clock_data *this_scd(void)
96{
97	return &__get_cpu_var(sched_clock_data);
98}
99
/* Clock data of an arbitrary cpu. */
100static inline struct sched_clock_data *cpu_sdc(int cpu)
101{
102	return &per_cpu(sched_clock_data, cpu);
103}
104
/*
 * Seed every possible cpu's clock data from the current GTOD time, then
 * mark the clock infrastructure as usable.
 */
105void sched_clock_init(void)
106{
107	u64 ktime_now = ktime_to_ns(ktime_get());
108	int cpu;
109
110	for_each_possible_cpu(cpu) {
111		struct sched_clock_data *scd = cpu_sdc(cpu);
112
113		scd->tick_raw = 0;
114		scd->tick_gtod = ktime_now;
115		scd->clock = ktime_now;
116	}
117
118	sched_clock_running = 1;
119}
120
121/*
122 * min, max except they take wrapping into account
123 */
124
125static inline u64 wrap_min(u64 x, u64 y)
126{
127 return (s64)(x - y) < 0 ? x : y;
128}
129
130static inline u64 wrap_max(u64 x, u64 y)
131{
132 return (s64)(x - y) > 0 ? x : y;
133}
134
135/*
136 * update the percpu scd from the raw @now value
137 *
138 * - filter out backward motion
139 * - use the GTOD tick value to create a window to filter crazy TSC values
140 */
141static u64 sched_clock_local(struct sched_clock_data *scd)
142{
143	u64 now, clock, old_clock, min_clock, max_clock;
144	s64 delta;
145
146again:
147	now = sched_clock();
148	delta = now - scd->tick_raw;
/* Raw clock went backwards since the last tick; ignore the delta. */
149	if (unlikely(delta < 0))
150		delta = 0;
151
152	old_clock = scd->clock;
153
154	/*
155	 *	scd->clock = clamp(scd->tick_gtod + delta,
156	 *		      max(scd->tick_gtod, scd->clock),
157	 *		      scd->tick_gtod + TICK_NSEC);
158	 */
159
160	clock = scd->tick_gtod + delta;
161	min_clock = wrap_max(scd->tick_gtod, old_clock);
162	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
163
164	clock = wrap_max(clock, min_clock);
165	clock = wrap_min(clock, max_clock);
166
/* Lock-free publish: retry the whole computation if we raced. */
167	if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
168		goto again;
169
170	return clock;
171}
172
/*
 * Read another cpu's clock while keeping both clocks moving forward:
 * whichever of the two is behind is pulled up to the larger value via a
 * lock-free cmpxchg, retrying on races.
 */
173static u64 sched_clock_remote(struct sched_clock_data *scd)
174{
175	struct sched_clock_data *my_scd = this_scd();
176	u64 this_clock, remote_clock;
177	u64 *ptr, old_val, val;
178
/* Refresh our own clock first so the comparison below is current. */
179	sched_clock_local(my_scd);
180again:
181	this_clock = my_scd->clock;
182	remote_clock = scd->clock;
183
184	/*
185	 * Use the opportunity that we have both locks
186	 * taken to couple the two clocks: we take the
187	 * larger time as the latest time for both
188	 * runqueues. (this creates monotonic movement)
189	 */
190	if (likely((s64)(remote_clock - this_clock) < 0)) {
191		ptr = &scd->clock;
192		old_val = remote_clock;
193		val = this_clock;
194	} else {
195		/*
196		 * Should be rare, but possible:
197		 */
198		ptr = &my_scd->clock;
199		old_val = this_clock;
200		val = remote_clock;
201	}
202
203	if (cmpxchg64(ptr, old_val, val) != old_val)
204		goto again;
205
206	return val;
207}
208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
214u64 sched_clock_cpu(int cpu)
215{
216	struct sched_clock_data *scd;
217	u64 clock;
218
219	WARN_ON_ONCE(!irqs_disabled());
220
/* Stable hardware clock: no per-cpu filtering necessary. */
221	if (sched_clock_stable)
222		return sched_clock();
223
224	if (unlikely(!sched_clock_running))
225		return 0ull;
226
227	scd = cpu_sdc(cpu);
228
/* Remote reads need the clock-coupling path; local reads do not. */
229	if (cpu != smp_processor_id())
230		clock = sched_clock_remote(scd);
231	else
232		clock = sched_clock_local(scd);
233
234	return clock;
235}
236
/*
 * Per-tick hook: resample the raw and GTOD clocks for this cpu and
 * refresh the filtered clock.  No-op when the clock is stable or before
 * sched_clock_init().
 */
237void sched_clock_tick(void)
238{
239	struct sched_clock_data *scd;
240	u64 now, now_gtod;
241
242	if (sched_clock_stable)
243		return;
244
245	if (unlikely(!sched_clock_running))
246		return;
247
248	WARN_ON_ONCE(!irqs_disabled());
249
250	scd = this_scd();
251	now_gtod = ktime_to_ns(ktime_get());
252	now = sched_clock();
253
254	scd->tick_raw = now;
255	scd->tick_gtod = now_gtod;
256	sched_clock_local(scd);
257}
258
259/*
260 * We are going deep-idle (irqs are disabled):
261 */
/* Take one last clock sample before the cpu's TSC may stop. */
262void sched_clock_idle_sleep_event(void)
263{
264	sched_clock_cpu(smp_processor_id());
265}
266EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
267
268/*
269 * We just idled delta nanoseconds (called with irqs disabled):
270 */
/*
 * Resync the clock after idle via a tick resample.  Note @delta_ns is
 * currently unused by this implementation.
 */
271void sched_clock_idle_wakeup_event(u64 delta_ns)
272{
/* During suspend GTOD is frozen; resyncing would produce garbage. */
273	if (timekeeping_suspended)
274		return;
275
276	sched_clock_tick();
277	touch_softlockup_watchdog();
278}
279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
280
281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !!                                                  #
289 * ####################################################################
290 */
/* IRQ-safe wrapper around sched_clock_cpu(). */
291u64 cpu_clock(int cpu)
292{
293	u64 clock;
294	unsigned long flags;
295
296	local_irq_save(flags);
297	clock = sched_clock_cpu(cpu);
298	local_irq_restore(flags);
299
300	return clock;
301}
302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312	u64 clock;
313	unsigned long flags;
314
315	local_irq_save(flags);
316	clock = sched_clock_cpu(smp_processor_id());
317	local_irq_restore(flags);
318
319	return clock;
320}
321
322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
323
/*
 * Trivial variants for architectures whose sched_clock() is already
 * stable and globally synchronized: no per-cpu filtering is kept, all
 * cpus read the same underlying clock.
 */
324void sched_clock_init(void)
325{
326	sched_clock_running = 1;
327}
328
/* @cpu is ignored here -- every cpu sees the same sched_clock(). */
329u64 sched_clock_cpu(int cpu)
330{
331	if (unlikely(!sched_clock_running))
332		return 0;
333
334	return sched_clock();
335}
336
337u64 cpu_clock(int cpu)
338{
339	return sched_clock_cpu(cpu);
340}
341
/* cpu argument is unused above, so any constant works here. */
342u64 local_clock(void)
343{
344	return sched_clock_cpu(0);
345}
346
347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
348
349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
deleted file mode 100644
index 257002c13bb..00000000000
--- a/kernel/sched/core.c
+++ /dev/null
@@ -1,8162 +0,0 @@
1/*
2 * kernel/sched/core.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
27 */
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
33#include <linux/uaccess.h>
34#include <linux/highmem.h>
35#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
37#include <linux/capability.h>
38#include <linux/completion.h>
39#include <linux/kernel_stat.h>
40#include <linux/debug_locks.h>
41#include <linux/perf_event.h>
42#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
45#include <linux/freezer.h>
46#include <linux/vmalloc.h>
47#include <linux/blkdev.h>
48#include <linux/delay.h>
49#include <linux/pid_namespace.h>
50#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#include <linux/sysctl.h>
60#include <linux/syscalls.h>
61#include <linux/times.h>
62#include <linux/tsacct_kern.h>
63#include <linux/kprobes.h>
64#include <linux/delayacct.h>
65#include <linux/unistd.h>
66#include <linux/pagemap.h>
67#include <linux/hrtimer.h>
68#include <linux/tick.h>
69#include <linux/debugfs.h>
70#include <linux/ctype.h>
71#include <linux/ftrace.h>
72#include <linux/slab.h>
73#include <linux/init_task.h>
74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
76
77#include <asm/switch_to.h>
78#include <asm/tlb.h>
79#include <asm/irq_regs.h>
80#include <asm/mutex.h>
81#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h>
83#endif
84
85#include "sched.h"
86#include "../workqueue_sched.h"
87#include "../smpboot.h"
88
89#define CREATE_TRACE_POINTS
90#include <trace/events/sched.h>
91
/*
 * (Re)arm a bandwidth period timer: advance it past now in multiples of
 * @period and start it pinned to this cpu, preserving the timer's
 * soft/hard expiry slack.  Loops until the timer is seen active.
 */
92void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
93{
94	unsigned long delta;
95	ktime_t soft, hard, now;
96
97	for (;;) {
98		if (hrtimer_active(period_timer))
99			break;
100
101		now = hrtimer_cb_get_time(period_timer);
102		hrtimer_forward(period_timer, now, period);
103
104		soft = hrtimer_get_softexpires(period_timer);
105		hard = hrtimer_get_expires(period_timer);
106		delta = ktime_to_ns(ktime_sub(hard, soft));
107		__hrtimer_start_range_ns(period_timer, soft, delta,
108					 HRTIMER_MODE_ABS_PINNED, 0);
109	}
110}
111
112DEFINE_MUTEX(sched_domains_mutex);
113DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
114
115static void update_rq_clock_task(struct rq *rq, s64 delta);
116
/*
 * Advance rq->clock to the current sched_clock_cpu() reading, unless a
 * pending skip was requested, and feed the delta to the task-clock
 * accounting.  Caller holds rq->lock.
 */
117void update_rq_clock(struct rq *rq)
118{
119	s64 delta;
120
121	if (rq->skip_clock_update > 0)
122		return;
123
124	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
125	rq->clock += delta;
126	update_rq_clock_task(rq, delta);
127}
128
129/*
130 * Debugging: various feature bits
131 */
132
/*
 * Each SCHED_FEAT(name, enabled) line in features.h is expanded below:
 * first into the default feature bitmask, then (under SCHED_DEBUG) into
 * the parallel name table used by the debugfs interface.
 */
133#define SCHED_FEAT(name, enabled)	\
134	(1UL << __SCHED_FEAT_##name) * enabled |
135
136const_debug unsigned int sysctl_sched_features =
137#include "features.h"
138	0;
139
140#undef SCHED_FEAT
141
142#ifdef CONFIG_SCHED_DEBUG
143#define SCHED_FEAT(name, enabled)	\
144	#name ,
145
146static const char * const sched_feat_names[] = {
147#include "features.h"
148};
149
150#undef SCHED_FEAT
151
/* debugfs read side: print every feature, disabled ones prefixed NO_. */
152static int sched_feat_show(struct seq_file *m, void *v)
153{
154	int i;
155
156	for (i = 0; i < __SCHED_FEAT_NR; i++) {
157		if (!(sysctl_sched_features & (1UL << i)))
158			seq_puts(m, "NO_");
159		seq_printf(m, "%s ", sched_feat_names[i]);
160	}
161	seq_puts(m, "\n");
162
163	return 0;
164}
165
166#ifdef HAVE_JUMP_LABEL
167
168#define jump_label_key__true  STATIC_KEY_INIT_TRUE
169#define jump_label_key__false STATIC_KEY_INIT_FALSE
170
171#define SCHED_FEAT(name, enabled)	\
172	jump_label_key__##enabled ,
173
174struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
175#include "features.h"
176};
177
178#undef SCHED_FEAT
179
/*
 * Keep the per-feature static keys in sync with the bitmask so feature
 * tests compile down to patched jumps; guard against double inc/dec.
 */
180static void sched_feat_disable(int i)
181{
182	if (static_key_enabled(&sched_feat_keys[i]))
183		static_key_slow_dec(&sched_feat_keys[i]);
184}
185
186static void sched_feat_enable(int i)
187{
188	if (!static_key_enabled(&sched_feat_keys[i]))
189		static_key_slow_inc(&sched_feat_keys[i]);
190}
191#else
192static void sched_feat_disable(int i) { };
193static void sched_feat_enable(int i) { };
194#endif /* HAVE_JUMP_LABEL */
195
196static int sched_feat_set(char *cmp)
197{
198 int i;
199 int neg = 0;
200
201 if (strncmp(cmp, "NO_", 3) == 0) {
202 neg = 1;
203 cmp += 3;
204 }
205
206 for (i = 0; i < __SCHED_FEAT_NR; i++) {
207 if (strcmp(cmp, sched_feat_names[i]) == 0) {
208 if (neg) {
209 sysctl_sched_features &= ~(1UL << i);
210 sched_feat_disable(i);
211 } else {
212 sysctl_sched_features |= (1UL << i);
213 sched_feat_enable(i);
214 }
215 break;
216 }
217 }
218
219 return i;
220}
221
/*
 * debugfs write side: copy at most 63 bytes from userspace, strip
 * whitespace, and hand the token to sched_feat_set().  Returns -EINVAL
 * for unknown feature names.
 */
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224		size_t cnt, loff_t *ppos)
225{
226	char buf[64];
227	char *cmp;
228	int i;
229
230	if (cnt > 63)
231		cnt = 63;
232
233	if (copy_from_user(&buf, ubuf, cnt))
234		return -EFAULT;
235
236	buf[cnt] = 0;
237	cmp = strstrip(buf);
238
239	i = sched_feat_set(cmp);
/* __SCHED_FEAT_NR means the loop in sched_feat_set() found no match. */
240	if (i == __SCHED_FEAT_NR)
241		return -EINVAL;
242
243	*ppos += cnt;
244
245	return cnt;
246}
247
248static int sched_feat_open(struct inode *inode, struct file *filp)
249{
250	return single_open(filp, sched_feat_show, NULL);
251}
252
253static const struct file_operations sched_feat_fops = {
254	.open		= sched_feat_open,
255	.write		= sched_feat_write,
256	.read		= seq_read,
257	.llseek		= seq_lseek,
258	.release	= single_release,
259};
260
/* Expose /sys/kernel/debug/sched_features (mode 0644). */
261static __init int sched_init_debug(void)
262{
263	debugfs_create_file("sched_features", 0644, NULL, NULL,
264			&sched_feat_fops);
265
266	return 0;
267}
268late_initcall(sched_init_debug);
269#endif /* CONFIG_SCHED_DEBUG */
270
271/*
272 * Number of tasks to iterate in a single balance run.
273 * Limited because this is done with IRQs disabled.
274 */
275const_debug unsigned int sysctl_sched_nr_migrate = 32;
276
277/*
278 * period over which we average the RT time consumption, measured
279 * in ms.
280 *
281 * default: 1s
282 */
283const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
284
285/*
286 * period over which we measure -rt task cpu usage in us.
287 * default: 1s
288 */
289unsigned int sysctl_sched_rt_period = 1000000;
290
/* Set once the scheduler is fully up; checked by early-boot paths. */
291__read_mostly int scheduler_running;
292
293/*
294 * part of the period that we allow rt tasks to run in us.
295 * default: 0.95s
296 */
297int sysctl_sched_rt_runtime = 950000;
300
301/*
302 * __task_rq_lock - lock the rq @p resides on.
303 */
/*
 * Caller must already hold p->pi_lock.  The loop re-checks task_rq(p)
 * after locking because the task may have been migrated between reading
 * its rq and acquiring that rq's lock.
 */
304static inline struct rq *__task_rq_lock(struct task_struct *p)
305	__acquires(rq->lock)
306{
307	struct rq *rq;
308
309	lockdep_assert_held(&p->pi_lock);
310
311	for (;;) {
312		rq = task_rq(p);
313		raw_spin_lock(&rq->lock);
314		if (likely(rq == task_rq(p)))
315			return rq;
316		raw_spin_unlock(&rq->lock);
317	}
318}
319
320/*
321 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
322 */
/* Same re-check pattern as above, but also takes pi_lock (irqsave). */
323static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
324	__acquires(p->pi_lock)
325	__acquires(rq->lock)
326{
327	struct rq *rq;
328
329	for (;;) {
330		raw_spin_lock_irqsave(&p->pi_lock, *flags);
331		rq = task_rq(p);
332		raw_spin_lock(&rq->lock);
333		if (likely(rq == task_rq(p)))
334			return rq;
335		raw_spin_unlock(&rq->lock);
336		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
337	}
338}
339
/* Unlock counterpart of __task_rq_lock(). */
340static void __task_rq_unlock(struct rq *rq)
341	__releases(rq->lock)
342{
343	raw_spin_unlock(&rq->lock);
344}
345
/* Unlock counterpart of task_rq_lock(): rq->lock first, then pi_lock. */
346static inline void
347task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
348	__releases(rq->lock)
349	__releases(p->pi_lock)
350{
351	raw_spin_unlock(&rq->lock);
352	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
353}
354
355/*
356 * this_rq_lock - lock this runqueue and disable interrupts.
357 */
358static struct rq *this_rq_lock(void)
359	__acquires(rq->lock)
360{
361	struct rq *rq;
362
363	local_irq_disable();
364	rq = this_rq();
365	raw_spin_lock(&rq->lock);
366
367	return rq;
368}
369
370#ifdef CONFIG_SCHED_HRTICK
371/*
372 * Use HR-timers to deliver accurate preemption points.
373 *
374 * Its all a bit involved since we cannot program an hrt while holding the
375 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */
381
382static void hrtick_clear(struct rq *rq)
383{
384 if (hrtimer_active(&rq->hrtick_timer))
385 hrtimer_cancel(&rq->hrtick_timer);
386}
387
388/*
389 * High-resolution timer tick.
390 * Runs from hardirq context with interrupts disabled.
391 */
392static enum hrtimer_restart hrtick(struct hrtimer *timer)
393{
394 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
395
396 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
397
398 raw_spin_lock(&rq->lock);
399 update_rq_clock(rq);
400 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
401 raw_spin_unlock(&rq->lock);
402
403 return HRTIMER_NORESTART;
404}
405
406#ifdef CONFIG_SMP
407/*
408 * called from hardirq (IPI) context
409 */
410static void __hrtick_start(void *arg)
411{
412 struct rq *rq = arg;
413
414 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer);
416 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock);
418}
419
420/*
421 * Called to set the hrtick timer state.
422 *
423 * called with rq->lock held and irqs disabled
424 */
425void hrtick_start(struct rq *rq, u64 delay)
426{
427 struct hrtimer *timer = &rq->hrtick_timer;
428 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
429
430 hrtimer_set_expires(timer, time);
431
432 if (rq == this_rq()) {
433 hrtimer_restart(timer);
434 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1;
437 }
438}
439
440static int
441hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
442{
443 int cpu = (int)(long)hcpu;
444
445 switch (action) {
446 case CPU_UP_CANCELED:
447 case CPU_UP_CANCELED_FROZEN:
448 case CPU_DOWN_PREPARE:
449 case CPU_DOWN_PREPARE_FROZEN:
450 case CPU_DEAD:
451 case CPU_DEAD_FROZEN:
452 hrtick_clear(cpu_rq(cpu));
453 return NOTIFY_OK;
454 }
455
456 return NOTIFY_DONE;
457}
458
459static __init void init_hrtick(void)
460{
461 hotcpu_notifier(hotplug_hrtick, 0);
462}
463#else
464/*
465 * Called to set the hrtick timer state.
466 *
467 * called with rq->lock held and irqs disabled
468 */
469void hrtick_start(struct rq *rq, u64 delay)
470{
471 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
472 HRTIMER_MODE_REL_PINNED, 0);
473}
474
475static inline void init_hrtick(void)
476{
477}
478#endif /* CONFIG_SMP */
479
480static void init_rq_hrtick(struct rq *rq)
481{
482#ifdef CONFIG_SMP
483 rq->hrtick_csd_pending = 0;
484
485 rq->hrtick_csd.flags = 0;
486 rq->hrtick_csd.func = __hrtick_start;
487 rq->hrtick_csd.info = rq;
488#endif
489
490 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
491 rq->hrtick_timer.function = hrtick;
492}
493#else /* CONFIG_SCHED_HRTICK */
494static inline void hrtick_clear(struct rq *rq)
495{
496}
497
/* CONFIG_SCHED_HRTICK=n: no per-rq hrtick state to initialise. */
static inline void init_rq_hrtick(struct rq *rq)
{
}
501
/* CONFIG_SCHED_HRTICK=n: no hotplug notifier to register. */
static inline void init_hrtick(void)
{
}
505#endif /* CONFIG_SCHED_HRTICK */
506
507/*
508 * resched_task - mark a task 'to be rescheduled now'.
509 *
510 * On UP this means the setting of the need_resched flag, on SMP it
511 * might also involve a cross-CPU call to trigger the scheduler on
512 * the target CPU.
513 */
514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
/*
 * Mark @p as needing a reschedule.  Caller must hold @p's runqueue lock.
 * If @p is running on another CPU that is not polling for
 * TIF_NEED_RESCHED, a reschedule IPI is sent.
 */
void resched_task(struct task_struct *p)
{
	int cpu;

	assert_raw_spin_locked(&task_rq(p)->lock);

	/* Already marked -- nothing more to do. */
	if (test_tsk_need_resched(p))
		return;

	set_tsk_need_resched(p);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}
540
/*
 * Best-effort resched of whatever is currently running on @cpu: if the
 * runqueue lock cannot be taken without spinning, silently give up.
 */
void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
551
552#ifdef CONFIG_NO_HZ
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 *
 * Returns the first non-idle CPU found while walking this CPU's
 * sched domains outward, or this CPU itself if everything is idle.
 */
int get_nohz_timer_target(void)
{
	int cpu = smp_processor_id();
	int i;
	struct sched_domain *sd;

	/* RCU protects the sched-domain hierarchy against concurrent rebuilds. */
	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}
unlock:
	rcu_read_unlock();
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	/* The local CPU is obviously not in its inner idle loop. */
	if (cpu == smp_processor_id())
		return;

	/*
	 * This is safe, as this function is called with the timer
	 * wheel base lock of (cpu) held. When the CPU is on the way
	 * to idle and has not yet set rq->curr to idle then it will
	 * be serialized on the timer wheel base lock and take the new
	 * timer into account automatically.
	 */
	if (rq->curr != rq->idle)
		return;

	/*
	 * We can set TIF_RESCHED on the idle task of the other CPU
	 * lockless. The worst case is that the other CPU runs the
	 * idle task through an additional NOOP schedule()
	 */
	set_tsk_need_resched(rq->idle);

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(rq->idle))
		smp_send_reschedule(cpu);
}
619
620static inline bool got_nohz_idle_kick(void)
621{
622 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624}
625
626#else /* CONFIG_NO_HZ */
627
/* Without CONFIG_NO_HZ there are no idle-balance kicks. */
static inline bool got_nohz_idle_kick(void)
{
	return false;
}
632
633#endif /* CONFIG_NO_HZ */
634
/*
 * Age rq->rt_avg: halve it once for every full sched_avg_period() that
 * has elapsed since rq->age_stamp, advancing the stamp as we go.
 */
void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq->clock - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}
650
651#else /* !CONFIG_SMP */
/* UP variant of resched_task(): no cross-CPU work, just set the flag. */
void resched_task(struct task_struct *p)
{
	assert_raw_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
657#endif /* CONFIG_SMP */
658
659#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
660 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 *
 * The gotos implement an iterative depth-first walk: 'goto down'
 * descends into a child; 'goto up' jumps back to the label *inside*
 * list_for_each_entry_rcu() so the parent's child-list iteration
 * resumes where it left off.  A non-zero return from either visitor
 * aborts the walk and is propagated to the caller.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	/* Finished this subtree: pop to the parent and resume its loop. */
	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
697
/*
 * No-op tg_visitor, for walk_tg_tree_from() callers that only care
 * about one direction of the walk.
 */
int tg_nop(struct task_group *tg, void *data)
{
	/* Nothing to do for this node. */
	return 0;
}
702#endif
703
/*
 * Derive @p's CFS load weight (and cached inverse) from its static
 * priority, via the prio_to_weight/prio_to_wmult tables.
 */
static void set_load_weight(struct task_struct *p)
{
	/* Index into the weight tables: 0 == nice -20 ... 39 == nice +19. */
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(prio_to_weight[prio]);
	load->inv_weight = prio_to_wmult[prio];
}
721
/*
 * Put @p on @rq via its scheduling class.  The rq clock is refreshed
 * first so the class sees an up-to-date timestamp.
 */
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}
728
/*
 * Remove @p from @rq via its scheduling class, after refreshing the
 * rq clock for accurate accounting.
 */
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}
735
/*
 * Make @p runnable on @rq.  Tasks that counted towards the load average
 * while blocked stop doing so now.
 */
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}
743
/*
 * Take @p off @rq (it is blocking).  Uninterruptible sleepers start
 * counting towards the load average.
 */
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}
751
/*
 * Advance rq->clock_task by @delta, minus any time stolen by hard/soft
 * IRQ processing (CONFIG_IRQ_TIME_ACCOUNTING) and by the hypervisor
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING), so per-task runtime accounting
 * only sees time the task could actually have run.
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* Clamp: never account more steal time than wall time. */
		if (unlikely(steal > delta))
			steal = delta;

		/* Round down to whole ticks; the remainder waits for next time. */
		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
811
/*
 * Install (or clear, when @stop is NULL) the per-cpu stop task.  The
 * task is dressed up as a max-priority SCHED_FIFO task for userspace's
 * benefit, then switched to the stop scheduling class.
 */
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}
841
842/*
843 * __normal_prio - return the priority that is based on the static prio
844 */
845static inline int __normal_prio(struct task_struct *p)
846{
847 return p->static_prio;
848}
849
850/*
851 * Calculate the expected normal priority: i.e. priority
852 * without taking RT-inheritance into account. Might be
853 * boosted by interactivity modifiers. Changes upon fork,
854 * setprio syscalls, and whenever the interactivity
855 * estimator recalculates.
856 */
857static inline int normal_prio(struct task_struct *p)
858{
859 int prio;
860
861 if (task_has_rt_policy(p))
862 prio = MAX_RT_PRIO-1 - p->rt_priority;
863 else
864 prio = __normal_prio(p);
865 return prio;
866}
867
868/*
869 * Calculate the current priority, i.e. the priority
870 * taken into account by the scheduler. This value might
871 * be boosted by RT tasks, or might be boosted by
872 * interactivity modifiers. Will be RT if the task got
873 * RT-boosted. If not then it returns p->normal_prio.
874 */
875static int effective_prio(struct task_struct *p)
876{
877 p->normal_prio = normal_prio(p);
878 /*
879 * If we are RT tasks or we were boosted to RT priority,
880 * keep the priority unchanged. Otherwise, update priority
881 * to the normal priority:
882 */
883 if (!rt_prio(p->prio))
884 return p->normal_prio;
885 return p->prio;
886}
887
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	unsigned int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
896
/*
 * Let the scheduling classes react to a change of @p: switched_from/
 * switched_to when the class itself changed, prio_changed when only the
 * priority did.  Called with the runqueue lock held.
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		/* switched_from is optional; switched_to is mandatory here. */
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);
		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio)
		p->sched_class->prio_changed(rq, p, oldprio);
}
908
/*
 * Should the newly runnable @p preempt rq->curr?  Same class: delegate
 * to the class's own test.  Different classes: classes are ordered by
 * for_each_class() from highest to lowest, so whichever of the two is
 * found first wins; a higher-class @p forces a resched.
 */
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}
933
/* Notifier chain fired from set_task_cpu() whenever a task migrates. */
static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);

/* Subscribe @n to task-migration notifications. */
void register_task_migration_notifier(struct notifier_block *n)
{
	atomic_notifier_chain_register(&task_migration_notifier, n);
}
940
941#ifdef CONFIG_SMP
/*
 * Move @p's accounting to @new_cpu: fire tracepoints, class callbacks
 * and the migration notifier chain, then record the new CPU.  Debug
 * builds assert the locking and task-state preconditions.
 */
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		struct task_migration_notifier tmn;

		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

		tmn.task = p;
		tmn.from_cpu = task_cpu(p);
		tmn.to_cpu = new_cpu;

		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
	}

	__set_task_cpu(p, new_cpu);
}
987
/* Argument block passed to migration_cpu_stop() via the stopper thread. */
struct migration_arg {
	struct task_struct *task;	/* task to migrate */
	int dest_cpu;			/* CPU to move it to */
};
992
993static int migration_cpu_stop(void *data);
994
/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count). If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	unsigned long flags;
	int running, on_rq;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			/* State changed under us: caller's premise is gone. */
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &flags);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		on_rq = p->on_rq;
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &flags);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(on_rq)) {
			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}
1102
/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	/* Keep task_cpu() vs. smp_processor_id() coherent across the test. */
	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
1127#endif /* CONFIG_SMP */
1128
1129#ifdef CONFIG_SMP
/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 *
 * Pick a CPU for @p when its chosen one is unusable.  Escalates in
 * three stages: allowed CPUs in @cpu's node, then (after resetting the
 * mask from the cpuset) any allowed CPU, then all possible CPUs; BUG
 * if even that yields nothing.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/* Look for allowed, online CPU in same node. */
	for_each_cpu(dest_cpu, nodemask) {
		if (!cpu_online(dest_cpu))
			continue;
		if (!cpu_active(dest_cpu))
			continue;
		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
			return dest_cpu;
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!cpu_online(dest_cpu))
				continue;
			if (!cpu_active(dest_cpu))
				continue;
			goto out;
		}

		/* Nothing found: widen the allowed mask and retry. */
		switch (state) {
		case cpuset:
			/* No more Mr. Nice Guy. */
			cpuset_cpus_allowed_fallback(p);
			state = possible;
			break;

		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_sched("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}
1192
/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 *
 * Ask @p's scheduling class for a target CPU, then sanitise the answer:
 * a CPU outside the allowed mask or offline is replaced by a fallback.
 */
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * cpu.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}
1217
1218static void update_avg(u64 *avg, u64 sample)
1219{
1220 s64 diff = sample - *avg;
1221 *avg += diff >> 3;
1222}
1223#endif
1224
/*
 * Update schedstats for a wakeup of @p that landed on @cpu: local vs.
 * remote counts, the sched-domain that spans the wakee's CPU, and the
 * migrate/sync wakeup counters.  Compiles away without CONFIG_SCHEDSTATS.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		/* Find the innermost domain that contains both CPUs. */
		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}
1264
/*
 * Enqueue the waking task @p on @rq and mark it on_rq.  Workqueue
 * workers additionally notify the workqueue code so it can adjust its
 * concurrency accounting.
 */
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = 1;

	/* if a worker is waking up, notify workqueue */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}
1274
/*
 * Mark the task runnable and perform wakeup-preemption.
 *
 * Sets TASK_RUNNING, runs the class's task_woken callback, and updates
 * the runqueue's average-idle estimate from the just-ended idle period.
 */
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
	trace_sched_wakeup(p, true);
	check_preempt_curr(rq, p, wake_flags);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);

	if (rq->idle_stamp) {
		u64 delta = rq->clock - rq->idle_stamp;
		/* Cap samples so one long idle spell can't skew avg_idle. */
		u64 max = 2*sysctl_sched_migration_cost;

		if (delta > max)
			rq->avg_idle = max;
		else
			update_avg(&rq->avg_idle, delta);
		rq->idle_stamp = 0;
	}
#endif
}
1301
/*
 * Full wakeup with @rq->lock held: undo the blocked task's load-average
 * contribution, enqueue it, and finish the wakeup.
 */
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
	/* Balances the ++ done when the task went to sleep contributing to load. */
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
	ttwu_do_wakeup(rq, p, wake_flags);
}
1313
/*
 * Called in case the task @p isn't fully descheduled from its runqueue,
 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
 * since all we need to do is flip p->state to TASK_RUNNING, since
 * the task is still ->on_rq.
 *
 * Returns 1 if the light wakeup was done, 0 if the task had already
 * left its runqueue and the full wakeup path is needed.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p);
	/* Re-check under the lock: @p may have been dequeued meanwhile. */
	if (p->on_rq) {
		ttwu_do_wakeup(rq, p, wake_flags);
		ret = 1;
	}
	__task_rq_unlock(rq);

	return ret;
}
1334
1335#ifdef CONFIG_SMP
/*
 * Drain this CPU's wake_list: activate, under our own rq->lock, every
 * task that remote CPUs queued for us via ttwu_queue_remote().
 */
static void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	/* Atomically take the whole list; remote adders start a fresh one. */
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;

	raw_spin_lock(&rq->lock);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

	raw_spin_unlock(&rq->lock);
}
1352
/*
 * Reschedule-IPI entry point: process queued remote wakeups and, if we
 * were kicked for it, trigger the nohz idle load balancer.
 */
void scheduler_ipi(void)
{
	/* Plain resched IPI with no extra work: return without irq_enter(). */
	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}
1383
/*
 * Queue @p on @cpu's lock-free wake_list.  Only the add that finds the
 * list empty sends the IPI (llist_add returns true in that case); the
 * handler drains the entire list in one go.
 */
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);
}
1389
1390bool cpus_share_cache(int this_cpu, int that_cpu)
1391{
1392 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1393}
1394#endif /* CONFIG_SMP */
1395
/*
 * Final wakeup dispatch: activate @p on @cpu directly under that rq's
 * lock, or -- when TTWU_QUEUE is enabled and the CPUs don't share a
 * cache -- hand it to the remote CPU's wake_list instead.
 */
static void ttwu_queue(struct task_struct *p, int cpu)
{
	struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* sync clocks x-cpu */
		ttwu_queue_remote(p, cpu);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	ttwu_do_activate(rq, p, 0);
	raw_spin_unlock(&rq->lock);
}
1412
/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * NOTE(review): bare barrier -- presumably orders the waker's prior
	 * stores against the p->state read below, pairing with the wakee's
	 * state update; confirm against current upstream documentation.
	 */
	smp_wmb();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);

	/* Fast path: still on a runqueue -- just flip the state remotely. */
	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 */
	while (p->on_cpu)
		cpu_relax();
	/*
	 * Pairs with the smp_wmb() in finish_lock_switch().
	 */
	smp_rmb();

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}
1478
/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
static void try_to_wake_up_local(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	BUG_ON(rq != this_rq());
	BUG_ON(p == current);
	lockdep_assert_held(&rq->lock);

	/*
	 * The lock order is pi_lock then rq->lock; since we already hold
	 * rq->lock, trylock first and on failure drop/retake both in the
	 * correct order.
	 */
	if (!raw_spin_trylock(&p->pi_lock)) {
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	if (!p->on_rq)
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

	ttwu_do_wakeup(rq, p, 0);
	ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}
1512
1513/**
1514 * wake_up_process - Wake up a specific process
1515 * @p: The process to be woken up.
1516 *
1517 * Attempt to wake up the nominated process and move it to the set of runnable
1518 * processes. Returns 1 if the process was woken up, 0 if it was already
1519 * running.
1520 *
1521 * It may be assumed that this function implies a write memory barrier before
1522 * changing the task state if and only if any tasks are woken up.
1523 */
1524int wake_up_process(struct task_struct *p)
1525{
1526 return try_to_wake_up(p, TASK_ALL, 0);
1527}
1528EXPORT_SYMBOL(wake_up_process);
1529
/* Wake @p only if its current state is within the @state mask. */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
1534
/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 *
 * __sched_fork() is basic setup used by init_idle() too:
 * zeroes all per-entity scheduling state so nothing is inherited from
 * the parent, and initialises the NUMA-balancing bookkeeping.
 */
static void __sched_fork(struct task_struct *p)
{
	p->on_rq = 0;

	p->se.on_rq = 0;
	p->se.exec_start = 0;
	p->se.sum_exec_runtime = 0;
	p->se.prev_sum_exec_runtime = 0;
	p->se.nr_migrations = 0;
	p->se.vruntime = 0;
	INIT_LIST_HEAD(&p->se.group_node);

/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
	p->se.avg.runnable_avg_period = 0;
	p->se.avg.runnable_avg_sum = 0;
#endif
#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	INIT_LIST_HEAD(&p->rt.run_list);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	/* First thread of a new mm: start its NUMA scan clock fresh. */
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies;
		p->mm->numa_next_reset = jiffies;
		p->mm->numa_scan_seq = 0;
	}

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
#endif /* CONFIG_NUMA_BALANCING */
}
1586
1587#ifdef CONFIG_NUMA_BALANCING
1588#ifdef CONFIG_SCHED_DEBUG
1589void set_numabalancing_state(bool enabled)
1590{
1591 if (enabled)
1592 sched_feat_set("NUMA");
1593 else
1594 sched_feat_set("NO_NUMA");
1595}
1596#else
/* Without SCHED_DEBUG there is no feature bit; use a plain global flag. */
__read_mostly bool numabalancing_enabled;

void set_numabalancing_state(bool enabled)
{
	numabalancing_enabled = enabled;
}
1603#endif /* CONFIG_SCHED_DEBUG */
1604#endif /* CONFIG_NUMA_BALANCING */
1605
/*
 * fork()/clone()-time setup:
 *
 * Initialise the child's scheduling state: priority (no PI leakage),
 * optional reset-on-fork handling, scheduling class, initial CPU and
 * preemption count.  Runs with the child not yet visible in the
 * pid-hash.
 */
void sched_fork(struct task_struct *p)
{
	unsigned long flags;
	int cpu = get_cpu();

	__sched_fork(p);
	/*
	 * We mark the process as running here. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;

	/*
	 * Make sure we do not leak PI boosting priority to the child.
	 */
	p->prio = current->normal_prio;

	/*
	 * Revert to default priority/policy on fork if requested.
	 */
	if (unlikely(p->sched_reset_on_fork)) {
		if (task_has_rt_policy(p)) {
			p->policy = SCHED_NORMAL;
			p->static_prio = NICE_TO_PRIO(0);
			p->rt_priority = 0;
		} else if (PRIO_TO_NICE(p->static_prio) < 0)
			p->static_prio = NICE_TO_PRIO(0);

		p->prio = p->normal_prio = __normal_prio(p);
		set_load_weight(p);

		/*
		 * We don't need the reset flag anymore after the fork. It has
		 * fulfilled its duty:
		 */
		p->sched_reset_on_fork = 0;
	}

	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

	/*
	 * The child is not yet in the pid-hash so no cgroup attach races,
	 * and the cgroup is pinned to this child due to cgroup_fork()
	 * is ran before sched_fork().
	 *
	 * Silence PROVE_RCU.
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	set_task_cpu(p, cpu);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	if (likely(sched_info_on()))
		memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP)
	p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT_COUNT
	/* Want to start with kernel preemption disabled. */
	task_thread_info(p)->preempt_count = 1;
#endif
#ifdef CONFIG_SMP
	plist_node_init(&p->pushable_tasks, MAX_PRIO);
#endif

	put_cpu();
}
1682
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	/* pi_lock first, rq->lock second -- the usual task locking order. */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
	/*
	 * Fork balancing, do it here and not earlier because:
	 *  - cpus_allowed can change in the fork path
	 *  - any previously selected cpu might disappear through hotplug
	 */
	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif

	rq = __task_rq_lock(p);
	activate_task(rq, p, 0);
	p->on_rq = 1;
	trace_sched_wakeup_new(p, true);
	check_preempt_curr(rq, p, WF_FORK);	/* may mark rq->curr for resched */
#ifdef CONFIG_SMP
	/* Let the class react to the wakeup (e.g. push to another cpu). */
	if (p->sched_class->task_woken)
		p->sched_class->task_woken(rq, p);
#endif
	task_rq_unlock(rq, p, &flags);
}
1716
#ifdef CONFIG_PREEMPT_NOTIFIERS

/**
 * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 *
 * Adds @notifier to current's own notifier list; the callbacks fire from
 * the context-switch paths below.
 */
void preempt_notifier_register(struct preempt_notifier *notifier)
{
	hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);

/**
 * preempt_notifier_unregister - no longer interested in preemption notifications
 * @notifier: notifier struct to unregister
 *
 * This is safe to call from within a preemption notifier.
 */
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
	hlist_del(&notifier->link);
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

/* Run @curr's sched_in callbacks; called from finish_task_switch(). */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_in(notifier, raw_smp_processor_id());
}

/* Run @curr's sched_out callbacks; called from prepare_task_switch(). */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	struct preempt_notifier *notifier;
	struct hlist_node *node;

	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
		notifier->ops->sched_out(notifier, next);
}

#else /* !CONFIG_PREEMPT_NOTIFIERS */

/* Notifiers compiled out: the switch paths call empty stubs. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}

static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}

#endif /* CONFIG_PREEMPT_NOTIFIERS */
1774
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Each step here is undone/completed by finish_task_switch(). */
	trace_sched_switch(prev, next);
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
1799
/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;	/* stashed by context_switch() */
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 *		Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_state = prev->state;
	vtime_task_switch(prev);
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);	/* releases rq->lock (see __releases) */
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	/* Drop the lazy mm reference taken in context_switch(), if any. */
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);

		/* Drop prev's "current" reference; may free its task_struct. */
		put_task_struct(prev);
	}
}
1853
#ifdef CONFIG_SMP

/* assumes rq->lock is held */
static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}

/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {
		unsigned long flags;

		/* Re-take the lock only when a class requested post-work. */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->curr->sched_class->post_schedule)
			rq->curr->sched_class->post_schedule(rq);
		raw_spin_unlock_irqrestore(&rq->lock, flags);

		rq->post_schedule = 0;
	}
}

#else

/* !SMP: the balancing callbacks don't exist, so these are no-ops. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
}

static inline void post_schedule(struct rq *rq)
{
}

#endif
1889
/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 *
 * Completes the context switch started on @prev's behalf and releases
 * the rq lock taken before the switch.
 */
asmlinkage void schedule_tail(struct task_struct *prev)
	__releases(rq->lock)
{
	struct rq *rq = this_rq();

	finish_task_switch(rq, prev);

	/*
	 * FIXME: do we need to worry about rq being invalidated by the
	 * task_switch?
	 */
	post_schedule(rq);

#ifdef __ARCH_WANT_UNLOCKED_CTXSW
	/* In this case, finish_task_switch does not reenable preemption */
	preempt_enable();
#endif
	/* Honour CLONE_CHILD_SETTID: report our pid to userspace. */
	if (current->set_child_tid)
		put_user(task_pid_vnr(current), current->set_child_tid);
}
1914
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	/*
	 * No next->mm: borrow prev's active_mm, pinning it with an extra
	 * mm_count reference instead of doing a real mm switch.
	 */
	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	/*
	 * No prev->mm: hand the borrowed mm over to finish_task_switch(),
	 * which drops the reference outside the rq lock.
	 */
	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
1969
1970/*
1971 * nr_running, nr_uninterruptible and nr_context_switches:
1972 *
1973 * externally visible scheduler statistics: current number of runnable
1974 * threads, current number of uninterruptible-sleeping threads, total
1975 * number of context switches performed since bootup.
1976 */
1977unsigned long nr_running(void)
1978{
1979 unsigned long i, sum = 0;
1980
1981 for_each_online_cpu(i)
1982 sum += cpu_rq(i)->nr_running;
1983
1984 return sum;
1985}
1986
1987unsigned long nr_uninterruptible(void)
1988{
1989 unsigned long i, sum = 0;
1990
1991 for_each_possible_cpu(i)
1992 sum += cpu_rq(i)->nr_uninterruptible;
1993
1994 /*
1995 * Since we read the counters lockless, it might be slightly
1996 * inaccurate. Do not allow it to go below zero though:
1997 */
1998 if (unlikely((long)sum < 0))
1999 sum = 0;
2000
2001 return sum;
2002}
2003
2004unsigned long long nr_context_switches(void)
2005{
2006 int i;
2007 unsigned long long sum = 0;
2008
2009 for_each_possible_cpu(i)
2010 sum += cpu_rq(i)->nr_switches;
2011
2012 return sum;
2013}
2014
2015unsigned long nr_iowait(void)
2016{
2017 unsigned long i, sum = 0;
2018
2019 for_each_possible_cpu(i)
2020 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2021
2022 return sum;
2023}
2024
2025unsigned long nr_iowait_cpu(int cpu)
2026{
2027 struct rq *this = cpu_rq(cpu);
2028 return atomic_read(&this->nr_iowait);
2029}
2030
2031unsigned long this_cpu_load(void)
2032{
2033 struct rq *this = this_rq();
2034 return this->cpu_load[0];
2035}
2036
2037
2038/*
2039 * Global load-average calculations
2040 *
2041 * We take a distributed and async approach to calculating the global load-avg
2042 * in order to minimize overhead.
2043 *
2044 * The global load average is an exponentially decaying average of nr_running +
2045 * nr_uninterruptible.
2046 *
2047 * Once every LOAD_FREQ:
2048 *
2049 * nr_active = 0;
2050 * for_each_possible_cpu(cpu)
2051 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2052 *
2053 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2054 *
2055 * Due to a number of reasons the above turns in the mess below:
2056 *
2057 * - for_each_possible_cpu() is prohibitively expensive on machines with
2058 * serious number of cpus, therefore we need to take a distributed approach
2059 * to calculating nr_active.
2060 *
2061 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2062 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2063 *
2064 * So assuming nr_active := 0 when we start out -- true per definition, we
2065 * can simply take per-cpu deltas and fold those into a global accumulate
2066 * to obtain the same result. See calc_load_fold_active().
2067 *
2068 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2069 * across the machine, we assume 10 ticks is sufficient time for every
2070 * cpu to have completed this task.
2071 *
2072 * This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, just wrecks the sample.
2074 *
2075 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2076 * this would add another cross-cpu cacheline miss and atomic operation
2077 * to the wakeup path. Instead we increment on whatever cpu the task ran
2078 * when it went into uninterruptible state and decrement on whatever cpu
2079 * did the wakeup. This means that only the sum of nr_uninterruptible over
2080 * all cpus yields the correct result.
2081 *
2082 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2083 */
2084
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;	/* folded per-cpu active deltas */
static unsigned long calc_load_update;	/* jiffies of next LOAD_FREQ window */
unsigned long avenrun[3];		/* 1, 5 and 15 minute averages, FSHIFT fixed-point */
EXPORT_SYMBOL(avenrun); /* should be removed */
2090
2091/**
2092 * get_avenrun - get the load average array
2093 * @loads: pointer to dest load array
2094 * @offset: offset to add
2095 * @shift: shift count to shift the result left
2096 *
2097 * These values are estimates at best, so no need for locking.
2098 */
2099void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2100{
2101 loads[0] = (avenrun[0] + offset) << shift;
2102 loads[1] = (avenrun[1] + offset) << shift;
2103 loads[2] = (avenrun[2] + offset) << shift;
2104}
2105
2106static long calc_load_fold_active(struct rq *this_rq)
2107{
2108 long nr_active, delta = 0;
2109
2110 nr_active = this_rq->nr_running;
2111 nr_active += (long) this_rq->nr_uninterruptible;
2112
2113 if (nr_active != this_rq->calc_load_active) {
2114 delta = nr_active - this_rq->calc_load_active;
2115 this_rq->calc_load_active = nr_active;
2116 }
2117
2118 return delta;
2119}
2120
2121/*
2122 * a1 = a0 * e + a * (1 - e)
2123 */
2124static unsigned long
2125calc_load(unsigned long load, unsigned long exp, unsigned long active)
2126{
2127 load *= exp;
2128 load += active * (FIXED_1 - exp);
2129 load += 1UL << (FSHIFT - 1);
2130 return load >> FSHIFT;
2131}
2132
2133#ifdef CONFIG_NO_HZ
2134/*
2135 * Handle NO_HZ for the global load-average.
2136 *
2137 * Since the above described distributed algorithm to compute the global
2138 * load-average relies on per-cpu sampling from the tick, it is affected by
2139 * NO_HZ.
2140 *
2141 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2142 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2143 * when we read the global state.
2144 *
2145 * Obviously reality has to ruin such a delightfully simple scheme:
2146 *
2147 * - When we go NO_HZ idle during the window, we can negate our sample
2148 * contribution, causing under-accounting.
2149 *
2150 * We avoid this by keeping two idle-delta counters and flipping them
2151 * when the window starts, thus separating old and new NO_HZ load.
2152 *
2153 * The only trick is the slight shift in index flip for read vs write.
2154 *
2155 * 0s 5s 10s 15s
2156 * +10 +10 +10 +10
2157 * |-|-----------|-|-----------|-|-----------|-|
2158 * r:0 0 1 1 0 0 1 1 0
2159 * w:0 1 1 0 0 1 1 0 0
2160 *
2161 * This ensures we'll fold the old idle contribution in this window while
 *    accumulating the new one.
2163 *
2164 * - When we wake up from NO_HZ idle during the window, we push up our
2165 * contribution, since we effectively move our sample point to a known
2166 * busy state.
2167 *
2168 * This is solved by pushing the window forward, and thus skipping the
2169 * sample, for this cpu (effectively using the idle-delta for this cpu which
2170 * was in effect at the time the window opened). This also solves the issue
2171 * of having to deal with a cpu having been in NOHZ idle for multiple
2172 * LOAD_FREQ intervals.
2173 *
2174 * When making the ILB scale, we should try to pull this in as well.
2175 */
2176static atomic_long_t calc_load_idle[2];
2177static int calc_load_idx;
2178
/*
 * Index of the idle-delta slot NOHZ writers fold into.  Per the r/w
 * diagram above, this flips one LOAD_FREQ window ahead of
 * calc_load_read_idx().
 */
static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next idle-delta.
	 */
	if (!time_before(jiffies, calc_load_update))
		idx++;

	return idx & 1;
}
2198
/* Index of the idle-delta slot the global fold consumes this window. */
static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}
2203
2204void calc_load_enter_idle(void)
2205{
2206 struct rq *this_rq = this_rq();
2207 long delta;
2208
2209 /*
2210 * We're going into NOHZ mode, if there's any pending delta, fold it
2211 * into the pending idle delta.
2212 */
2213 delta = calc_load_fold_active(this_rq);
2214 if (delta) {
2215 int idx = calc_load_write_idx();
2216 atomic_long_add(delta, &calc_load_idle[idx]);
2217 }
2218}
2219
/*
 * Leaving NOHZ idle: resynchronize this runqueue's sample window with the
 * global one so the regular tick path folds at the right time again.
 */
void calc_load_exit_idle(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the sample window, we're done.
	 */
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	this_rq->calc_load_update = calc_load_update;
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}
2239
2240static long calc_load_fold_idle(void)
2241{
2242 int idx = calc_load_read_idx();
2243 long delta = 0;
2244
2245 if (atomic_long_read(&calc_load_idle[idx]))
2246 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2247
2248 return delta;
2249}
2250
/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x: base of the power
 * @frac_bits: fractional bits of @x
 * @n: power to raise @x to.
 *
 * Classic square-and-multiply over the binary representation of @n:
 * x^n = \Prod_{n_i = 1} x^(2^i), so we square @x once per bit of @n and
 * multiply it into the accumulator whenever the current low bit is set.
 * Each fixed-point multiply is rounded to nearest (add half an ulp, then
 * shift the fractional bits back out).
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long acc = 1UL << frac_bits;	/* 1.0 in fixed-point */

	while (n) {
		if (n & 1) {
			acc *= x;
			acc += 1UL << (frac_bits - 1);
			acc >>= frac_bits;
		}
		n >>= 1;
		if (n) {
			/* more bits to go: square the base */
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return acc;
}
2287
2288/*
2289 * a1 = a0 * e + a * (1 - e)
2290 *
2291 * a2 = a1 * e + a * (1 - e)
2292 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2293 * = a0 * e^2 + a * (1 - e) * (1 + e)
2294 *
2295 * a3 = a2 * e + a * (1 - e)
2296 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2297 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2298 *
2299 * ...
2300 *
2301 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2302 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2303 * = a0 * e^n + a * (1 - e^n)
2304 *
2305 * [1] application of the geometric series:
2306 *
2307 * n 1 - x^(n+1)
2308 * S_n := \Sum x^i = -------------
2309 * i=0 1 - x
2310 */
/*
 * Apply @n decay steps in one go: a_n = a_0 * e^n + a * (1 - e^n),
 * computing e^n in O(log n) via fixed_power_int() (derivation above).
 */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{

	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
2318
/*
 * NO_HZ can leave us missing all per-cpu ticks calling
 * calc_load_account_active(), but since an idle CPU folds its delta into
 * calc_load_idle[] via calc_load_enter_idle(), all we need to do is fold
 * in the pending idle delta if our idle period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
	long delta, active, n;

	if (!time_before(jiffies, calc_load_update + 10)) {
		/*
		 * Catch-up, fold however many we are behind still
		 */
		delta = jiffies - calc_load_update - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		/* one calc_load_n() call per average instead of n loops */
		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		calc_load_update += n * LOAD_FREQ;
	}

	/*
	 * Flip the idle index...
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index, this avoids a double flip messing things up.
	 *
	 * The smp_wmb() pairs with the smp_rmb() in calc_load_write_idx().
	 */
	smp_wmb();
	calc_load_idx++;
}
2359#else /* !CONFIG_NO_HZ */
2360
2361static inline long calc_load_fold_idle(void) { return 0; }
2362static inline void calc_global_nohz(void) { }
2363
2364#endif /* CONFIG_NO_HZ */
2365
/*
 * calc_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 *
 * Note: @ticks is not used here; catch-up over long idle periods is
 * handled by calc_global_nohz() below.
 */
void calc_global_load(unsigned long ticks)
{
	long active, delta;

	/* Wait the 10-tick grace period for all CPUs to have folded. */
	if (time_before(jiffies, calc_load_update + 10))
		return;

	/*
	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
	 */
	delta = calc_load_fold_idle();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	calc_load_update += LOAD_FREQ;

	/*
	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
	 */
	calc_global_nohz();
}
2398
2399/*
2400 * Called from update_cpu_load() to periodically update this CPU's
2401 * active count.
2402 */
2403static void calc_load_account_active(struct rq *this_rq)
2404{
2405 long delta;
2406
2407 if (time_before(jiffies, this_rq->calc_load_update))
2408 return;
2409
2410 delta = calc_load_fold_active(this_rq);
2411 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks);
2413
2414 this_rq->calc_load_update += LOAD_FREQ;
2415}
2416
2417/*
2418 * End of global load-average stuff
2419 */
2420
2421/*
2422 * The exact cpuload at various idx values, calculated at every tick would be
2423 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2424 *
2425 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
2426 * on nth tick when cpu may be busy, then we have:
2427 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2428 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2429 *
2430 * decay_load_missed() below does efficient calculation of
2431 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2432 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2433 *
2434 * The calculation is approximated on a 128 point scale.
2435 * degrade_zero_ticks is the number of ticks after which load at any
2436 * particular idx is approximated to be zero.
2437 * degrade_factor is a precomputed table, a row for each load idx.
2438 * Each column corresponds to degradation factor for a power of two ticks,
2439 * based on 128 point scale.
2440 * Example:
2441 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2442 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2443 *
2444 * With this power of 2 load factors, we can degrade the load n times
2445 * by looking at 1 bits in n and doing as many mult/shift instead of
2446 * n mult/shifts needed by the exact degradation.
2447 */
#define DEGRADE_SHIFT		7
/* ticks after which the load at a given idx is approximated to zero */
static const unsigned char
		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
/*
 * degrade_factor[idx][j]: decay factor (out of 128) for 2^j missed ticks
 * at load index idx.  Rows with fewer than DEGRADE_SHIFT+1 entries are
 * implicitly zero-filled by C initializer rules.
 */
static const unsigned char
		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
					{0, 0, 0, 0, 0, 0, 0, 0},
					{64, 32, 8, 0, 0, 0, 0, 0},
					{96, 72, 40, 12, 1, 0, 0},
					{112, 98, 75, 43, 15, 1, 0},
					{120, 112, 98, 76, 45, 16, 2} };
2458
2459/*
2460 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
2461 * would be when CPU is idle and so we just decay the old load without
2462 * adding any new load.
2463 */
2464static unsigned long
2465decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2466{
2467 int j = 0;
2468
2469 if (!missed_updates)
2470 return load;
2471
2472 if (missed_updates >= degrade_zero_ticks[idx])
2473 return 0;
2474
2475 if (idx == 1)
2476 return load >> missed_updates;
2477
2478 while (missed_updates) {
2479 if (missed_updates % 2)
2480 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2481
2482 missed_updates >>= 1;
2483 j++;
2484 }
2485 return load;
2486}
2487
/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 * every tick. We fix it up based on jiffies.
 *
 * cpu_load[i] is an exponential average with half-life 2^i ticks;
 * @pending_updates > 1 compensates for missed ticks via decay_load_missed().
 */
static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
			      unsigned long pending_updates)
{
	int i, scale;

	this_rq->nr_load_updates++;

	/* Update our load: */
	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
		old_load = decay_load_missed(old_load, pending_updates - 1, i);
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale - 1;

		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}

	sched_avg_update(this_rq);
}
2523
2524#ifdef CONFIG_NO_HZ
2525/*
2526 * There is no sane way to deal with nohz on smp when using jiffies because the
2527 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2528 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2529 *
2530 * Therefore we cannot use the delta approach from the regular tick since that
2531 * would seriously skew the load calculation. However we'll make do for those
2532 * updates happening while idle (nohz_idle_balance) or coming out of idle
2533 * (tick_nohz_idle_exit).
2534 *
2535 * This means we might still be one tick off for nohz periods.
2536 */
2537
/*
 * Called from nohz_idle_balance() to update the load ratings before doing the
 * idle balance.
 */
void update_idle_cpu_load(struct rq *this_rq)
{
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long load = this_rq->load.weight;
	unsigned long pending_updates;

	/*
	 * bail if there's load or we're actually up-to-date.
	 */
	if (load || curr_jiffies == this_rq->last_load_update_tick)
		return;

	/* Every jiffy missed since the last update is a missed tick. */
	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	this_rq->last_load_update_tick = curr_jiffies;

	__update_cpu_load(this_rq, load, pending_updates);
}
2559
/*
 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
 */
void update_cpu_load_nohz(void)
{
	struct rq *this_rq = this_rq();
	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
	unsigned long pending_updates;

	if (curr_jiffies == this_rq->last_load_update_tick)
		return;

	/* Unlike the idle-balance path, we can race here: take the lock. */
	raw_spin_lock(&this_rq->lock);
	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
	if (pending_updates) {
		this_rq->last_load_update_tick = curr_jiffies;
		/*
		 * We were idle, this means load 0, the current load might be
		 * !0 due to remote wakeups and the sort.
		 */
		__update_cpu_load(this_rq, 0, pending_updates);
	}
	raw_spin_unlock(&this_rq->lock);
}
2584#endif /* CONFIG_NO_HZ */
2585
/*
 * Called from scheduler_tick()
 *
 * Regular (non-nohz) per-tick load bookkeeping: exactly one pending update.
 */
static void update_cpu_load_active(struct rq *this_rq)
{
	/*
	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
	 */
	this_rq->last_load_update_tick = jiffies;
	__update_cpu_load(this_rq, this_rq->load.weight, 1);

	calc_load_account_active(this_rq);
}
2599
2600#ifdef CONFIG_SMP
2601
/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
	struct task_struct *p = current;
	unsigned long flags;
	int dest_cpu;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
		struct migration_arg arg = { p, dest_cpu };

		/* Drop the lock before blocking in the stop machinery. */
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
		return;
	}
unlock:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
2627
2628#endif
2629
2630DEFINE_PER_CPU(struct kernel_stat, kstat);
2631DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2632
2633EXPORT_PER_CPU_SYMBOL(kstat);
2634EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2635
2636/*
2637 * Return any ns on the sched_clock that have not yet been accounted in
2638 * @p in case that task is currently running.
2639 *
2640 * Called with task_rq_lock() held on @rq.
2641 */
2642static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2643{
2644 u64 ns = 0;
2645
2646 if (task_current(rq, p)) {
2647 update_rq_clock(rq);
2648 ns = rq->clock_task - p->se.exec_start;
2649 if ((s64)ns < 0)
2650 ns = 0;
2651 }
2652
2653 return ns;
2654}
2655
2656unsigned long long task_delta_exec(struct task_struct *p)
2657{
2658 unsigned long flags;
2659 struct rq *rq;
2660 u64 ns = 0;
2661
2662 rq = task_rq_lock(p, &flags);
2663 ns = do_task_delta_exec(p, rq);
2664 task_rq_unlock(rq, p, &flags);
2665
2666 return ns;
2667}
2668
2669/*
2670 * Return accounted runtime for the task.
2671 * In case the task is currently running, return the runtime plus current's
2672 * pending runtime that have not been accounted yet.
2673 */
2674unsigned long long task_sched_runtime(struct task_struct *p)
2675{
2676 unsigned long flags;
2677 struct rq *rq;
2678 u64 ns = 0;
2679
2680 rq = task_rq_lock(p, &flags);
2681 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2682 task_rq_unlock(rq, p, &flags);
2683
2684 return ns;
2685}
2686
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load_active(rq);	/* cpu_load[] + global load-avg fold */
	curr->sched_class->task_tick(rq, curr, 0);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);	/* may raise SCHED_SOFTIRQ */
#endif
}
2712
/*
 * get_parent_ip - return the first caller address that is not inside
 * the locking primitives themselves.
 *
 * Used by the preempt on/off tracepoints below so they report the real
 * call site rather than a spin_lock/unlock wrapper.  notrace: runs from
 * within tracer callbacks, so it must not itself be traced.
 */
notrace unsigned long get_parent_ip(unsigned long addr)
{
	if (in_lock_functions(addr)) {
		/* Walk up one more frame; two extra levels is as deep as we go. */
		addr = CALLER_ADDR2;
		if (in_lock_functions(addr))
			addr = CALLER_ADDR3;
	}
	return addr;
}
2722
2723#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2724 defined(CONFIG_PREEMPT_TRACER))
2725
/*
 * add_preempt_count - raise the preempt counter by @val.
 *
 * Marked __kprobes (kept out of the kprobe-instrumentable text).
 * Debug builds sanity-check the counter before and after the update.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	preempt_count() += val;
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	/* Count went 0 -> val: preemption was just disabled; trace that edge. */
	if (preempt_count() == val)
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
EXPORT_SYMBOL(add_preempt_count);
2747
/*
 * sub_preempt_count - lower the preempt counter by @val.
 *
 * Counterpart of add_preempt_count().  Debug builds check both overall
 * underflow and underflow of the spinlock bits specifically.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	/* Trace before the drop: count val -> 0 re-enables preemption. */
	if (preempt_count() == val)
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
2769
2770#endif
2771
/*
 * Print scheduling while atomic bug:
 */
static noinline void __schedule_bug(struct task_struct *prev)
{
	/* Don't pile diagnostics on top of an oops already in progress. */
	if (oops_in_progress)
		return;

	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
		prev->comm, prev->pid, preempt_count());

	debug_show_held_locks(prev);
	print_modules();
	if (irqs_disabled())
		print_irqtrace_events(prev);
	dump_stack();
	/* Mark the kernel tainted: this is a real bug, not cosmetic noise. */
	add_taint(TAINT_WARN);
}
2790
/*
 * Various schedule()-time debugging checks and statistics:
 */
static inline void schedule_debug(struct task_struct *prev)
{
	/*
	 * Test if we are atomic. Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
		__schedule_bug(prev);
	/* Complain if we sleep inside an RCU read-side critical section. */
	rcu_sleep_check();

	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

	schedstat_inc(this_rq(), sched_count);
}
2809
/*
 * put_prev_task - tell @prev's scheduling class it is being switched out.
 *
 * Refresh the rq clock first when @prev is still queued, or when a clock
 * update is pending (skip_clock_update < 0), so the class callback sees
 * up-to-date runtime accounting.
 */
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
	if (prev->on_rq || rq->skip_clock_update < 0)
		update_rq_clock(rq);
	prev->sched_class->put_prev_task(rq, prev);
}
2816
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		p = fair_sched_class.pick_next_task(rq);
		if (likely(p))
			return p;
	}

	/* Slow path: walk the scheduling classes in priority order. */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG(); /* the idle class will always have a runnable task */
}
2844
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 * paths. For example, see arch/x86/entry_64.S.
 *
 * To drive preemption between tasks, the scheduler sets the flag in timer
 * interrupt handler scheduler_tick().
 *
 * 3. Wakeups don't really cause entry into schedule(). They add a
 * task to the run-queue and that's it.
 *
 * Now, if the new task added to the run-queue preempts the current
 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 * called on the nearest possible occasion:
 *
 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 * - in syscall or exception context, at the next outmost
 * preempt_enable(). (this might be as soon as the wake_up()'s
 * spin_unlock()!)
 *
 * - in IRQ context, return from interrupt-handler to
 * preemptible context
 *
 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 * then at the next:
 *
 * - cond_resched() call
 * - explicit schedule() call
 * - return from syscall or exception to user-space
 * - return from interrupt-handler to user-space
 */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	raw_spin_lock_irq(&rq->lock);

	/*
	 * Default to counting this as an involuntary switch (nivcsw).
	 * A non-running @prev is deactivated below and counted as
	 * voluntary (nvcsw) instead — unless PREEMPT_ACTIVE is set,
	 * which means we got here via preemption and @prev must stay
	 * on the runqueue.
	 */
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}

	pre_schedule(rq, prev);

	/* Try to pull work from other CPUs before settling for idle. */
	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);
	next = pick_next_task(rq);
	clear_tsk_need_resched(prev);
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	sched_preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}
2960
/*
 * sched_submit_work - flush pending plugged block I/O before sleeping.
 *
 * Called on the way into schedule().  Nothing to do when the task is
 * staying runnable, or when it is blocked on an rt-mutex
 * (tsk_is_pi_blocked).
 */
static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}
2972
/*
 * schedule - the public entry point into the scheduler.
 *
 * Submits any pending plugged block I/O before calling __schedule(),
 * since the task may be about to sleep.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
EXPORT_SYMBOL(schedule);
2981
2982#ifdef CONFIG_CONTEXT_TRACKING
/*
 * schedule_user - schedule() entry used when coming from user mode with
 * context tracking enabled; brackets the reschedule with user_exit()/
 * user_enter() so RCU sees the CPU as non-idle across it.
 */
asmlinkage void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to set_need_resched(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 */
	user_exit();
	schedule();
	user_enter();
}
2995#endif
2996
/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
	/* Drop the caller's preempt_count without rescheduling here... */
	sched_preempt_enable_no_resched();
	schedule();
	/* ...and restore it before returning, as the contract promises. */
	preempt_disable();
}
3008
3009#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3010
/*
 * owner_running - is @owner still the owner of @lock and on a CPU?
 *
 * Result is speculative; see mutex_spin_on_owner() for the caller's
 * RCU protection of @owner.
 */
static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
{
	if (lock->owner != owner)
		return false;

	/*
	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
	 * lock->owner still matches owner, if that fails, owner might
	 * point to free()d memory, if it still matches, the rcu_read_lock()
	 * ensures the memory stays valid.
	 */
	barrier();

	return owner->on_cpu;
}
3026
/*
 * Look out! "owner" is an entirely speculative pointer
 * access and not reliable.
 *
 * Spin while the lock owner keeps running on another CPU; this makes
 * the adaptive mutex fast path possible.  Returns non-zero only when
 * the lock has actually been released (owner == NULL).
 */
int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
{
	if (!sched_feat(OWNER_SPIN))
		return 0;

	/* rcu_read_lock() keeps @owner's task_struct from being freed. */
	rcu_read_lock();
	while (owner_running(lock, owner)) {
		if (need_resched())
			break;

		arch_mutex_cpu_relax();
	}
	rcu_read_unlock();

	/*
	 * We break out the loop above on need_resched() and when the
	 * owner changed, which is a sign for heavy contention. Return
	 * success only when lock->owner is NULL.
	 */
	return lock->owner == NULL;
}
3052#endif
3053
3054#ifdef CONFIG_PREEMPT
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage void __sched notrace preempt_schedule(void)
{
	struct thread_info *ti = current_thread_info();

	/*
	 * If there is a non-zero preempt_count or interrupts are disabled,
	 * we do not want to preempt the current task. Just return..
	 */
	if (likely(ti->preempt_count || irqs_disabled()))
		return;

	do {
		/* PREEMPT_ACTIVE stops __schedule() from deactivating us. */
		add_preempt_count_notrace(PREEMPT_ACTIVE);
		__schedule();
		sub_preempt_count_notrace(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
3084
/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();

	/* Catch callers which need to be fixed */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	/* Leave RCU "user/idle" mode before we reschedule. */
	user_exit();
	do {
		/* PREEMPT_ACTIVE stops __schedule() from deactivating us. */
		add_preempt_count(PREEMPT_ACTIVE);
		/* __schedule() expects to be entered with irqs enabled. */
		local_irq_enable();
		__schedule();
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}
3113
3114#endif /* CONFIG_PREEMPT */
3115
/*
 * default_wake_function - default waitqueue callback: wake the task
 * stored in @curr->private.  The return value tells __wake_up_common()
 * whether a wakeup actually happened (and thus whether an exclusive
 * slot was consumed).
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
3122
/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *curr, *next;

	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
		/* Sample flags first: the callback may remove curr from the
		 * list (hence the _safe iterator). */
		unsigned flags = curr->flags;

		if (curr->func(curr, mode, wake_flags, key) &&
				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}
3145
/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
void __wake_up(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, void *key)
{
	unsigned long flags;

	/* Take the queue lock ourselves; contrast __wake_up_locked(). */
	spin_lock_irqsave(&q->lock, flags);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(__wake_up);
3166
/*
 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
 */
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
	__wake_up_common(q, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
3175
/*
 * Like __wake_up_locked() but wakes a single task and passes @key through
 * to the wakeup callbacks.  Caller holds q->lock.
 */
void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
{
	__wake_up_common(q, mode, 1, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3181
3182/**
3183 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3184 * @q: the waitqueue
3185 * @mode: which threads
3186 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3187 * @key: opaque value to be passed to wakeup targets
3188 *
3189 * The sync wakeup differs that the waker knows that it will schedule
3190 * away soon, so while the target thread will be woken up, it will not
3191 * be migrated to another CPU - ie. the two threads are 'synchronized'
3192 * with each other. This can prevent needless bouncing between CPUs.
3193 *
3194 * On UP it can prevent extra preemption.
3195 *
3196 * It may be assumed that this function implies a write memory barrier before
3197 * changing the task state if and only if any tasks are woken up.
3198 */
3199void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3200 int nr_exclusive, void *key)
3201{
3202 unsigned long flags;
3203 int wake_flags = WF_SYNC;
3204
3205 if (unlikely(!q))
3206 return;
3207
3208 if (unlikely(!nr_exclusive))
3209 wake_flags = 0;
3210
3211 spin_lock_irqsave(&q->lock, flags);
3212 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3213 spin_unlock_irqrestore(&q->lock, flags);
3214}
3215EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3216
/*
 * __wake_up_sync - see __wake_up_sync_key()
 */
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
3225
/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion. Threads will be
 * awakened in the same order in which they were queued.
 *
 * See also complete_all(), wait_for_completion() and related routines.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
void complete(struct completion *x)
{
	unsigned long flags;

	spin_lock_irqsave(&x->wait.lock, flags);
	x->done++;
	/* Wake one waiter; completion waiters queue exclusively
	 * (see do_wait_for_common()). */
	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
	spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);
3248
/**
 * complete_all: - signals all threads waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up all threads waiting on this particular completion event.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
void complete_all(struct completion *x)
{
	unsigned long flags;

	spin_lock_irqsave(&x->wait.lock, flags);
	/* UINT_MAX/2 == effectively "infinitely many" completions. */
	x->done += UINT_MAX/2;
	/* nr_exclusive == 0: wake every waiter on the queue. */
	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
	spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete_all);
3268
/*
 * do_wait_for_common - core wait loop of all wait_for_completion*()
 * variants.
 *
 * Called with x->wait.lock held; the lock is dropped around
 * schedule_timeout() and re-taken afterwards.  On success a completion
 * count is consumed (x->done--) and the remaining timeout is returned
 * (at least 1).  Returns 0 on timeout, or -ERESTARTSYS when a signal
 * arrived while sleeping in state @state.
 */
static inline long __sched
do_wait_for_common(struct completion *x, long timeout, int state)
{
	if (!x->done) {
		/* Exclusive entry: complete() then wakes exactly one of us. */
		DECLARE_WAITQUEUE(wait, current);

		__add_wait_queue_tail_exclusive(&x->wait, &wait);
		do {
			if (signal_pending_state(state, current)) {
				timeout = -ERESTARTSYS;
				break;
			}
			__set_current_state(state);
			spin_unlock_irq(&x->wait.lock);
			timeout = schedule_timeout(timeout);
			spin_lock_irq(&x->wait.lock);
		} while (!x->done && timeout);
		__remove_wait_queue(&x->wait, &wait);
		if (!x->done)
			return timeout;
	}
	x->done--;
	/* Completed with no time left still counts as success: report 1. */
	return timeout ?: 1;
}
3293
/*
 * wait_for_common - take the completion lock and run the wait loop.
 * Returns whatever do_wait_for_common() reports.
 */
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
	might_sleep();

	spin_lock_irq(&x->wait.lock);
	timeout = do_wait_for_common(x, timeout, state);
	spin_unlock_irq(&x->wait.lock);
	return timeout;
}
3304
/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 *
 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
 * and interrupt capability. Also see complete().
 */
void __sched wait_for_completion(struct completion *x)
{
	/* Uninterruptible: not even fatal signals break this sleep. */
	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
3320
/**
 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * This waits for either a completion of a specific task to be signaled or for a
 * specified timeout to expire. The timeout is in jiffies. It is not
 * interruptible.
 *
 * The return value is 0 if timed out, and positive (at least 1, or number of
 * jiffies left till timeout) if completed.
 */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_timeout);
3339
3340/**
3341 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3342 * @x: holds the state of this particular completion
3343 *
3344 * This waits for completion of a specific task to be signaled. It is
3345 * interruptible.
3346 *
3347 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3348 */
3349int __sched wait_for_completion_interruptible(struct completion *x)
3350{
3351 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3352 if (t == -ERESTARTSYS)
3353 return t;
3354 return 0;
3355}
3356EXPORT_SYMBOL(wait_for_completion_interruptible);
3357
/**
 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * This waits for either a completion of a specific task to be signaled or for a
 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
 *
 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
 * positive (at least 1, or number of jiffies left till timeout) if completed.
 */
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
					  unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3376
3377/**
3378 * wait_for_completion_killable: - waits for completion of a task (killable)
3379 * @x: holds the state of this particular completion
3380 *
3381 * This waits to be signaled for completion of a specific task. It can be
3382 * interrupted by a kill signal.
3383 *
3384 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3385 */
3386int __sched wait_for_completion_killable(struct completion *x)
3387{
3388 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3389 if (t == -ERESTARTSYS)
3390 return t;
3391 return 0;
3392}
3393EXPORT_SYMBOL(wait_for_completion_killable);
3394
/**
 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
 * @x:  holds the state of this particular completion
 * @timeout:  timeout value in jiffies
 *
 * This waits for either a completion of a specific task to be
 * signaled or for a specified timeout to expire. It can be
 * interrupted by a kill signal. The timeout is in jiffies.
 *
 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
 * positive (at least 1, or number of jiffies left till timeout) if completed.
 */
long __sched
wait_for_completion_killable_timeout(struct completion *x,
				     unsigned long timeout)
{
	return wait_for_common(x, timeout, TASK_KILLABLE);
}
EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3414
3415/**
3416 * try_wait_for_completion - try to decrement a completion without blocking
3417 * @x: completion structure
3418 *
3419 * Returns: 0 if a decrement cannot be done without blocking
3420 * 1 if a decrement succeeded.
3421 *
3422 * If a completion is being used as a counting completion,
3423 * attempt to decrement the counter without blocking. This
3424 * enables us to avoid waiting if the resource the completion
3425 * is protecting is not available.
3426 */
3427bool try_wait_for_completion(struct completion *x)
3428{
3429 unsigned long flags;
3430 int ret = 1;
3431
3432 spin_lock_irqsave(&x->wait.lock, flags);
3433 if (!x->done)
3434 ret = 0;
3435 else
3436 x->done--;
3437 spin_unlock_irqrestore(&x->wait.lock, flags);
3438 return ret;
3439}
3440EXPORT_SYMBOL(try_wait_for_completion);
3441
3442/**
3443 * completion_done - Test to see if a completion has any waiters
3444 * @x: completion structure
3445 *
3446 * Returns: 0 if there are waiters (wait_for_completion() in progress)
3447 * 1 if there are no waiters.
3448 *
3449 */
3450bool completion_done(struct completion *x)
3451{
3452 unsigned long flags;
3453 int ret = 1;
3454
3455 spin_lock_irqsave(&x->wait.lock, flags);
3456 if (!x->done)
3457 ret = 0;
3458 spin_unlock_irqrestore(&x->wait.lock, flags);
3459 return ret;
3460}
3461EXPORT_SYMBOL(completion_done);
3462
/*
 * sleep_on_common - enqueue the caller on @q and sleep for up to @timeout.
 *
 * NOTE(review): the task state is set *before* the wait entry is queued
 * and the entry is non-exclusive — these are the historical (and racy)
 * sleep_on() semantics, kept only for legacy callers.
 */
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	init_waitqueue_entry(&wait, current);

	__set_current_state(state);

	/*
	 * Asymmetric locking: irqs stay disabled (via the saved flags)
	 * across the whole sequence; only the spinlock itself is dropped
	 * around schedule_timeout().
	 */
	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);
	spin_unlock(&q->lock);
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}
3483
/* Legacy: interruptible sleep on @q with no timeout. */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
3489
/* Legacy: interruptible sleep on @q; returns remaining jiffies. */
long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3496
/* Legacy: uninterruptible sleep on @q with no timeout. */
void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
3502
/* Legacy: uninterruptible sleep on @q; returns remaining jiffies. */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}
EXPORT_SYMBOL(sleep_on_timeout);
3508
3509#ifdef CONFIG_RT_MUTEXES
3510
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	int oldprio, on_rq, running;
	struct rq *rq;
	const struct sched_class *prev_class;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = __task_rq_lock(p);

	/*
	 * Idle task boosting is a nono in general. There is one
	 * exception, when PREEMPT_RT and NOHZ is active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
	 */
	if (unlikely(p == rq->idle)) {
		WARN_ON(p != rq->curr);
		WARN_ON(p->pi_blocked_on);
		goto out_unlock;
	}

	trace_sched_pi_setprio(p, prio);
	oldprio = p->prio;
	prev_class = p->sched_class;
	/*
	 * Standard change protocol: take the task off the runqueue (and
	 * out of the "current" slot) before touching prio/class, then
	 * put it back so the class callbacks see a consistent task.
	 */
	on_rq = p->on_rq;
	running = task_current(rq, p);
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;

	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq)
		/* Boosted (prio went down numerically): requeue at the head. */
		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);

	check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
	__task_rq_unlock(rq);
}
3575#endif
/*
 * set_user_nice - change the static (nice) priority of @p to @nice.
 *
 * No-op for out-of-range nice values or when the value is unchanged.
 */
void set_user_nice(struct task_struct *p, long nice)
{
	int old_prio, delta, on_rq;
	unsigned long flags;
	struct rq *rq;

	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
		return;
	/*
	 * We have to be careful, if called from sys_setpriority(),
	 * the task might be in the middle of scheduling on another CPU.
	 */
	rq = task_rq_lock(p, &flags);
	/*
	 * The RT priorities are set via sched_setscheduler(), but we still
	 * allow the 'normal' nice value to be set - but as expected
	 * it wont have any effect on scheduling until the task is
	 * SCHED_FIFO/SCHED_RR:
	 */
	if (task_has_rt_policy(p)) {
		p->static_prio = NICE_TO_PRIO(nice);
		goto out_unlock;
	}
	/* Dequeue before changing weights, requeue after (change protocol). */
	on_rq = p->on_rq;
	if (on_rq)
		dequeue_task(rq, p, 0);

	p->static_prio = NICE_TO_PRIO(nice);
	set_load_weight(p);
	old_prio = p->prio;
	p->prio = effective_prio(p);
	delta = p->prio - old_prio;

	if (on_rq) {
		enqueue_task(rq, p, 0);
		/*
		 * If the task increased its priority or is running and
		 * lowered its priority, then reschedule its CPU:
		 */
		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			resched_task(rq->curr);
	}
out_unlock:
	task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
3622
3623/*
3624 * can_nice - check if a task can reduce its nice value
3625 * @p: task
3626 * @nice: nice value
3627 */
3628int can_nice(const struct task_struct *p, const int nice)
3629{
3630 /* convert nice value [19,-20] to rlimit style value [1,40] */
3631 int nice_rlim = 20 - nice;
3632
3633 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3634 capable(CAP_SYS_NICE));
3635}
3636
3637#ifdef __ARCH_WANT_SYS_NICE
3638
/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	/* Clamp the increment to the maximum possible swing [-40, 40]. */
	if (increment < -40)
		increment = -40;
	if (increment > 40)
		increment = 40;

	/* Clamp the resulting nice value to the valid range [-20, 19]. */
	nice = TASK_NICE(current) + increment;
	if (nice < -20)
		nice = -20;
	if (nice > 19)
		nice = 19;

	/* Raising priority needs RLIMIT_NICE headroom or CAP_SYS_NICE. */
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	/* Give the security module a chance to veto before committing. */
	retval = security_task_setnice(current, nice);
	if (retval)
		return retval;

	set_user_nice(current, nice);
	return 0;
}
3676
3677#endif
3678
/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * This is the priority value as seen by users in /proc.
 * RT tasks are offset by -200. Normal tasks are centered
 * around 0, value goes from -16 to +15.
 *
 * NOTE(review): the -200 / -16..+15 figures above look stale relative
 * to the MAX_RT_PRIO subtraction below — verify before relying on them.
 */
int task_prio(const struct task_struct *p)
{
	return p->prio - MAX_RT_PRIO;
}
3691
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Thin exported wrapper around the TASK_NICE() conversion macro.
 */
int task_nice(const struct task_struct *p)
{
	return TASK_NICE(p);
}
EXPORT_SYMBOL(task_nice);
3701
3702/**
3703 * idle_cpu - is a given cpu idle currently?
3704 * @cpu: the processor in question.
3705 */
3706int idle_cpu(int cpu)
3707{
3708 struct rq *rq = cpu_rq(cpu);
3709
3710 if (rq->curr != rq->idle)
3711 return 0;
3712
3713 if (rq->nr_running)
3714 return 0;
3715
3716#ifdef CONFIG_SMP
3717 if (!llist_empty(&rq->wake_list))
3718 return 0;
3719#endif
3720
3721 return 1;
3722}
3723
/**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
 */
struct task_struct *idle_task(int cpu)
{
	return cpu_rq(cpu)->idle;
}
3732
3733/**
3734 * find_process_by_pid - find a process with a matching PID value.
3735 * @pid: the pid in question.
3736 */
3737static struct task_struct *find_process_by_pid(pid_t pid)
3738{
3739 return pid ? find_task_by_vpid(pid) : current;
3740}
3741
/* Actually do priority change: must hold rq lock. */
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
	p->policy = policy;
	p->rt_priority = prio;
	p->normal_prio = normal_prio(p);
	/* we are holding p->pi_lock already */
	/* rt_mutex_getprio() keeps any PI-boosted priority in effect. */
	p->prio = rt_mutex_getprio(p);
	if (rt_prio(p->prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;
	set_load_weight(p);
}
3757
/*
 * check the target process has a UID that matches the current process's
 */
static bool check_same_owner(struct task_struct *p)
{
	const struct cred *cred = current_cred(), *pcred;
	bool match;

	/* RCU keeps the target's credential struct alive while we compare. */
	rcu_read_lock();
	pcred = __task_cred(p);
	match = (uid_eq(cred->euid, pcred->euid) ||
		 uid_eq(cred->euid, pcred->uid));
	rcu_read_unlock();
	return match;
}
3773
/*
 * __sched_setscheduler - change the policy and/or RT priority of a task.
 * @p: the task in question.
 * @policy: the new policy, or a negative value to keep the current one.
 * @param: structure containing the new RT priority.
 * @user: true when invoked on behalf of userspace; enables the
 *	capability/rlimit/ownership checks and the LSM hook.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EPERM, or a
 * security-hook error).
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				const struct sched_param *param, bool user)
{
	int retval, oldprio, oldpolicy = -1, on_rq, running;
	unsigned long flags;
	const struct sched_class *prev_class;
	struct rq *rq;
	int reset_on_fork;

	/* may grab non-irq protected spin_locks */
	BUG_ON(in_interrupt());
recheck:
	/* double check policy once rq lock held */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		/* SCHED_RESET_ON_FORK is a flag OR-ed into the policy word */
		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
		policy &= ~SCHED_RESET_ON_FORK;

		if (policy != SCHED_FIFO && policy != SCHED_RR &&
				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
				policy != SCHED_IDLE)
			return -EINVAL;
	}

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (param->sched_priority < 0 ||
	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
		return -EINVAL;
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Allow unprivileged RT tasks to decrease priority:
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio =
					task_rlimit(p, RLIMIT_RTPRIO);

			/* can't set/change the rt policy */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* can't increase priority */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/*
		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
			if (!can_nice(p, TASK_NICE(p)))
				return -EPERM;
		}

		/* can't change other user's priorities */
		if (!check_same_owner(p))
			return -EPERM;

		/* Normal users shall not reset the sched_reset_on_fork flag */
		if (p->sched_reset_on_fork && !reset_on_fork)
			return -EPERM;
	}

	if (user) {
		retval = security_task_setscheduler(p);
		if (retval)
			return retval;
	}

	/*
	 * make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &flags);

	/*
	 * Changing the policy of the stop threads is a very bad idea
	 */
	if (p == rq->stop) {
		task_rq_unlock(rq, p, &flags);
		return -EINVAL;
	}

	/*
	 * If not changing anything there's no need to proceed further:
	 */
	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
			param->sched_priority == p->rt_priority))) {
		task_rq_unlock(rq, p, &flags);
		return 0;
	}

#ifdef CONFIG_RT_GROUP_SCHED
	if (user) {
		/*
		 * Do not allow realtime tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			task_rq_unlock(rq, p, &flags);
			return -EPERM;
		}
	}
#endif

	/* recheck policy now with rq lock held */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &flags);
		goto recheck;
	}
	/* dequeue/put the task while we rewrite its scheduling state */
	on_rq = p->on_rq;
	running = task_current(rq, p);
	if (on_rq)
		dequeue_task(rq, p, 0);
	if (running)
		p->sched_class->put_prev_task(rq, p);

	p->sched_reset_on_fork = reset_on_fork;

	oldprio = p->prio;
	prev_class = p->sched_class;
	__setscheduler(rq, p, policy, param->sched_priority);

	/* requeue under the (possibly) new class, then fix up preemption */
	if (running)
		p->sched_class->set_curr_task(rq);
	if (on_rq)
		enqueue_task(rq, p, 0);

	check_class_changed(rq, p, prev_class, oldprio);
	task_rq_unlock(rq, p, &flags);

	rt_mutex_adjust_pi(p);

	return 0;
}
3926
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * NOTE that the task may be already dead.
 *
 * Applies the full permission and security checks (user == true);
 * kernel-internal callers that must bypass them use
 * sched_setscheduler_nocheck() instead.
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	return __sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
3941
/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission. For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       const struct sched_param *param)
{
	/* user == false: skip CAP_SYS_NICE/rlimit/LSM checks */
	return __sched_setscheduler(p, policy, param, false);
}
3958
3959static int
3960do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3961{
3962 struct sched_param lparam;
3963 struct task_struct *p;
3964 int retval;
3965
3966 if (!param || pid < 0)
3967 return -EINVAL;
3968 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3969 return -EFAULT;
3970
3971 rcu_read_lock();
3972 retval = -ESRCH;
3973 p = find_process_by_pid(pid);
3974 if (p != NULL)
3975 retval = sched_setscheduler(p, policy, &lparam);
3976 rcu_read_unlock();
3977
3978 return retval;
3979}
3980
3981/**
3982 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3983 * @pid: the pid in question.
3984 * @policy: new policy.
3985 * @param: structure containing the new RT priority.
3986 */
3987SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3988 struct sched_param __user *, param)
3989{
3990 /* negative values for policy are not valid */
3991 if (policy < 0)
3992 return -EINVAL;
3993
3994 return do_sched_setscheduler(pid, policy, param);
3995}
3996
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	/* policy == -1 tells the backend to keep the task's current policy */
	return do_sched_setscheduler(pid, -1, param);
}
4006
4007/**
4008 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4009 * @pid: the pid in question.
4010 */
4011SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4012{
4013 struct task_struct *p;
4014 int retval;
4015
4016 if (pid < 0)
4017 return -EINVAL;
4018
4019 retval = -ESRCH;
4020 rcu_read_lock();
4021 p = find_process_by_pid(pid);
4022 if (p) {
4023 retval = security_task_getscheduler(p);
4024 if (!retval)
4025 retval = p->policy
4026 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4027 }
4028 rcu_read_unlock();
4029 return retval;
4030}
4031
4032/**
4033 * sys_sched_getparam - get the RT priority of a thread
4034 * @pid: the pid in question.
4035 * @param: structure containing the RT priority.
4036 */
4037SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4038{
4039 struct sched_param lp;
4040 struct task_struct *p;
4041 int retval;
4042
4043 if (!param || pid < 0)
4044 return -EINVAL;
4045
4046 rcu_read_lock();
4047 p = find_process_by_pid(pid);
4048 retval = -ESRCH;
4049 if (!p)
4050 goto out_unlock;
4051
4052 retval = security_task_getscheduler(p);
4053 if (retval)
4054 goto out_unlock;
4055
4056 lp.sched_priority = p->rt_priority;
4057 rcu_read_unlock();
4058
4059 /*
4060 * This one might sleep, we cannot do it with a spinlock held ...
4061 */
4062 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4063
4064 return retval;
4065
4066out_unlock:
4067 rcu_read_unlock();
4068 return retval;
4069}
4070
/*
 * sched_setaffinity - set the CPU affinity mask of the task with @pid.
 * @pid: pid of the task (0 means the calling task).
 * @in_mask: requested mask; it is intersected with the task's cpuset
 *	before being applied.
 *
 * Returns 0 on success or a negative errno. Holds get_online_cpus()
 * across the whole operation so CPUs cannot be hot-unplugged under us.
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpus_allowed, new_mask;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	rcu_read_lock();

	p = find_process_by_pid(pid);
	if (!p) {
		rcu_read_unlock();
		put_online_cpus();
		return -ESRCH;
	}

	/* Prevent p going away */
	get_task_struct(p);
	rcu_read_unlock();

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_put_task;
	}
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		retval = -ENOMEM;
		goto out_free_cpus_allowed;
	}
	retval = -EPERM;
	if (!check_same_owner(p)) {
		/* not the same owner: CAP_SYS_NICE in p's user-ns required */
		rcu_read_lock();
		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
			rcu_read_unlock();
			goto out_unlock;
		}
		rcu_read_unlock();
	}

	retval = security_task_setscheduler(p);
	if (retval)
		goto out_unlock;

	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, in_mask, cpus_allowed);
again:
	retval = set_cpus_allowed_ptr(p, new_mask);

	if (!retval) {
		cpuset_cpus_allowed(p, cpus_allowed);
		if (!cpumask_subset(new_mask, cpus_allowed)) {
			/*
			 * We must have raced with a concurrent cpuset
			 * update. Just reset the cpus_allowed to the
			 * cpuset's cpus_allowed
			 */
			cpumask_copy(new_mask, cpus_allowed);
			goto again;
		}
	}
out_unlock:
	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
out_put_task:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}
4139
4140static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4141 struct cpumask *new_mask)
4142{
4143 if (len < cpumask_size())
4144 cpumask_clear(new_mask);
4145 else if (len > cpumask_size())
4146 len = cpumask_size();
4147
4148 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4149}
4150
4151/**
4152 * sys_sched_setaffinity - set the cpu affinity of a process
4153 * @pid: pid of the process
4154 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4155 * @user_mask_ptr: user-space pointer to the new cpu mask
4156 */
4157SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4158 unsigned long __user *, user_mask_ptr)
4159{
4160 cpumask_var_t new_mask;
4161 int retval;
4162
4163 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4164 return -ENOMEM;
4165
4166 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4167 if (retval == 0)
4168 retval = sched_setaffinity(pid, new_mask);
4169 free_cpumask_var(new_mask);
4170 return retval;
4171}
4172
/*
 * sched_getaffinity - read the affinity mask of the task with @pid
 * into @mask, restricted to currently-online CPUs.
 *
 * Returns 0 on success, -ESRCH if no such task, or a security-hook
 * error.
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	unsigned long flags;
	int retval;

	get_online_cpus();
	rcu_read_lock();

	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	/* pi_lock stabilizes cpus_allowed against concurrent setaffinity */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

out_unlock:
	rcu_read_unlock();
	put_online_cpus();

	return retval;
}
4201
4202/**
4203 * sys_sched_getaffinity - get the cpu affinity of a process
4204 * @pid: pid of the process
4205 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4206 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4207 */
4208SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4209 unsigned long __user *, user_mask_ptr)
4210{
4211 int ret;
4212 cpumask_var_t mask;
4213
4214 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4215 return -EINVAL;
4216 if (len & (sizeof(unsigned long)-1))
4217 return -EINVAL;
4218
4219 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4220 return -ENOMEM;
4221
4222 ret = sched_getaffinity(pid, mask);
4223 if (ret == 0) {
4224 size_t retlen = min_t(size_t, len, cpumask_size());
4225
4226 if (copy_to_user(user_mask_ptr, mask, retlen))
4227 ret = -EFAULT;
4228 else
4229 ret = retlen;
4230 }
4231 free_cpumask_var(mask);
4232
4233 return ret;
4234}
4235
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 */
SYSCALL_DEFINE0(sched_yield)
{
	struct rq *rq = this_rq_lock();

	schedstat_inc(rq, yld_count);
	current->sched_class->yield_task(rq);

	/*
	 * Since we are going to call schedule() anyway, there's
	 * no need to preempt or enable interrupts:
	 */
	__release(rq->lock);					/* sparse annotation only */
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);		/* tell lockdep we dropped it */
	do_raw_spin_unlock(&rq->lock);				/* the actual unlock */
	sched_preempt_enable_no_resched();

	schedule();

	return 0;
}
4262
4263static inline int should_resched(void)
4264{
4265 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4266}
4267
/*
 * Enter the scheduler with PREEMPT_ACTIVE set so that __schedule()
 * treats this as an involuntary preemption (the task stays runnable
 * rather than being deactivated).
 */
static void __cond_resched(void)
{
	add_preempt_count(PREEMPT_ACTIVE);
	__schedule();
	sub_preempt_count(PREEMPT_ACTIVE);
}
4274
4275int __sched _cond_resched(void)
4276{
4277 if (should_resched()) {
4278 __cond_resched();
4279 return 1;
4280 }
4281 return 0;
4282}
4283EXPORT_SYMBOL(_cond_resched);
4284
/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 *
 * Returns 1 when the lock was dropped (and reacquired), 0 otherwise.
 */
int __cond_resched_lock(spinlock_t *lock)
{
	/* sample once: the decision and the action must agree */
	int resched = should_resched();
	int ret = 0;

	lockdep_assert_held(lock);

	if (spin_needbreak(lock) || resched) {
		spin_unlock(lock);
		if (resched)
			__cond_resched();
		else
			cpu_relax();	/* only a lock waiter: just breathe */
		ret = 1;
		spin_lock(lock);
	}
	return ret;
}
EXPORT_SYMBOL(__cond_resched_lock);
4312
4313int __sched __cond_resched_softirq(void)
4314{
4315 BUG_ON(!in_softirq());
4316
4317 if (should_resched()) {
4318 local_bh_enable();
4319 __cond_resched();
4320 local_bh_disable();
4321 return 1;
4322 }
4323 return 0;
4324}
4325EXPORT_SYMBOL(__cond_resched_softirq);
4326
/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run, if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 * while (!event)
 * 	yield();
 *
 * where one assumes that yield() will let 'the other' process run that will
 * make event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
void __sched yield(void)
{
	set_current_state(TASK_RUNNING);
	sys_sched_yield();
}
EXPORT_SYMBOL(yield);
4355
4356/**
4357 * yield_to - yield the current processor to another thread in
4358 * your thread group, or accelerate that thread toward the
4359 * processor it's on.
4360 * @p: target task
4361 * @preempt: whether task preemption is allowed or not
4362 *
4363 * It's the caller's job to ensure that the target task struct
4364 * can't go away on us before we can do any checks.
4365 *
4366 * Returns true if we indeed boosted the target task.
4367 */
4368bool __sched yield_to(struct task_struct *p, bool preempt)
4369{
4370 struct task_struct *curr = current;
4371 struct rq *rq, *p_rq;
4372 unsigned long flags;
4373 bool yielded = 0;
4374
4375 local_irq_save(flags);
4376 rq = this_rq();
4377
4378again:
4379 p_rq = task_rq(p);
4380 double_rq_lock(rq, p_rq);
4381 while (task_rq(p) != p_rq) {
4382 double_rq_unlock(rq, p_rq);
4383 goto again;
4384 }
4385
4386 if (!curr->sched_class->yield_to_task)
4387 goto out;
4388
4389 if (curr->sched_class != p->sched_class)
4390 goto out;
4391
4392 if (task_running(p_rq, p) || p->state)
4393 goto out;
4394
4395 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4396 if (yielded) {
4397 schedstat_inc(rq, yld_count);
4398 /*
4399 * Make p's CPU reschedule; pick_next_entity takes care of
4400 * fairness.
4401 */
4402 if (preempt && rq != p_rq)
4403 resched_task(p_rq->curr);
4404 }
4405
4406out:
4407 double_rq_unlock(rq, p_rq);
4408 local_irq_restore(flags);
4409
4410 if (yielded)
4411 schedule();
4412
4413 return yielded;
4414}
4415EXPORT_SYMBOL_GPL(yield_to);
4416
/*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 *
 * NOTE(review): keep this in lock-step with io_schedule_timeout()
 * below -- the two bodies are intentionally parallel.
 */
void __sched io_schedule(void)
{
	struct rq *rq = raw_rq();

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	/* flush queued block requests before we sleep on their completion */
	blk_flush_plug(current);
	current->in_iowait = 1;
	schedule();
	current->in_iowait = 0;
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
}
EXPORT_SYMBOL(io_schedule);
4435
/*
 * Same as io_schedule(), but bounded by @timeout (in jiffies).
 * Returns the remaining time as reported by schedule_timeout().
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *rq = raw_rq();
	long ret;

	delayacct_blkio_start();
	atomic_inc(&rq->nr_iowait);
	/* flush queued block requests before we sleep on their completion */
	blk_flush_plug(current);
	current->in_iowait = 1;
	ret = schedule_timeout(timeout);
	current->in_iowait = 0;
	atomic_dec(&rq->nr_iowait);
	delayacct_blkio_end();
	return ret;
}
4451
4452/**
4453 * sys_sched_get_priority_max - return maximum RT priority.
4454 * @policy: scheduling class.
4455 *
4456 * this syscall returns the maximum rt_priority that can be used
4457 * by a given scheduling class.
4458 */
4459SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4460{
4461 int ret = -EINVAL;
4462
4463 switch (policy) {
4464 case SCHED_FIFO:
4465 case SCHED_RR:
4466 ret = MAX_USER_RT_PRIO-1;
4467 break;
4468 case SCHED_NORMAL:
4469 case SCHED_BATCH:
4470 case SCHED_IDLE:
4471 ret = 0;
4472 break;
4473 }
4474 return ret;
4475}
4476
4477/**
4478 * sys_sched_get_priority_min - return minimum RT priority.
4479 * @policy: scheduling class.
4480 *
4481 * this syscall returns the minimum rt_priority that can be used
4482 * by a given scheduling class.
4483 */
4484SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4485{
4486 int ret = -EINVAL;
4487
4488 switch (policy) {
4489 case SCHED_FIFO:
4490 case SCHED_RR:
4491 ret = 1;
4492 break;
4493 case SCHED_NORMAL:
4494 case SCHED_BATCH:
4495 case SCHED_IDLE:
4496 ret = 0;
4497 }
4498 return ret;
4499}
4500
/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct timespec __user *, interval)
{
	struct task_struct *p;
	unsigned int time_slice;
	unsigned long flags;
	struct rq *rq;
	int retval;
	struct timespec t;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	/* ask the task's scheduling class; rq lock pins the class pointer */
	rq = task_rq_lock(p, &flags);
	time_slice = p->sched_class->get_rr_interval(rq, p);
	task_rq_unlock(rq, p, &flags);

	rcu_read_unlock();
	jiffies_to_timespec(time_slice, &t);
	/* copy_to_user() may sleep, so do it outside all locks */
	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
	return retval;

out_unlock:
	rcu_read_unlock();
	return retval;
}
4545
/* one-letter codes for each task state bit, indexed by __ffs(state)+1 */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

/*
 * Dump a one-line summary (comm, state, saved PC, free stack, pids,
 * flags) plus a stack trace for @p. Used by the sysrq-t dump.
 */
void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	int ppid;
	unsigned state;

	/* map the lowest set state bit to its stat_nam letter; 0 == 'R' */
	state = p->state ? __ffs(p->state) + 1 : 0;
	printk(KERN_INFO "%-15.15s %c", p->comm,
		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
	if (state == TASK_RUNNING)
		printk(KERN_CONT " running  ");
	else
		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
#else
	if (state == TASK_RUNNING)
		printk(KERN_CONT "  running task    ");
	else
		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
	free = stack_not_used(p);
#endif
	rcu_read_lock();
	ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
		task_pid_nr(p), ppid,
		(unsigned long)task_thread_info(p)->flags);

	show_stack(p, NULL);
}
4580
/*
 * Dump every thread whose state matches @state_filter (0 == dump all),
 * one line each via sched_show_task(). Lock debugging info is only
 * shown for a full dump.
 */
void show_state_filter(unsigned long state_filter)
{
	struct task_struct *g, *p;

#if BITS_PER_LONG == 32
	printk(KERN_INFO
		"  task                PC stack   pid father\n");
#else
	printk(KERN_INFO
		"  task                        PC stack   pid father\n");
#endif
	rcu_read_lock();
	do_each_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 */
		touch_nmi_watchdog();
		if (!state_filter || (p->state & state_filter))
			sched_show_task(p);
	} while_each_thread(g, p);

	touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
	sysrq_sched_debug_show();
#endif
	rcu_read_unlock();
	/*
	 * Only show locks if all tasks are dumped:
	 */
	if (!state_filter)
		debug_show_all_locks();
}
4615
/* Switch the boot-time init task over to the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
4620
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	__sched_fork(idle);
	idle->state = TASK_RUNNING;
	idle->se.exec_start = sched_clock();

	do_set_cpus_allowed(idle, cpumask_of(cpu));
	/*
	 * We're having a chicken and egg problem, even though we are
	 * holding rq->lock, the cpu isn't yet set to this cpu so the
	 * lockdep check in task_group() will fail.
	 *
	 * Similar case to sched_fork(). / Alternatively we could
	 * use task_rq_lock() here and obtain the other rq->lock.
	 *
	 * Silence PROVE_RCU
	 */
	rcu_read_lock();
	__set_task_cpu(idle, cpu);
	rcu_read_unlock();

	rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP)
	idle->on_cpu = 1;
#endif
	raw_spin_unlock_irqrestore(&rq->lock, flags);

	/* Set the preempt count _outside_ the spinlocks! */
	task_thread_info(idle)->preempt_count = 0;

	/*
	 * The idle tasks have their own, simple scheduling class:
	 */
	idle->sched_class = &idle_sched_class;
	ftrace_graph_init_idle_task(idle, cpu);
#if defined(CONFIG_SMP)
	/* name idle threads "INIT_TASK_COMM/<cpu>", e.g. "swapper/0" */
	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
4673
4674#ifdef CONFIG_SMP
/*
 * Apply a new affinity mask to @p. The class-specific hook runs first
 * so it can observe the old mask before cpus_allowed is overwritten.
 * Caller must hold the appropriate locks (see set_cpus_allowed_ptr()).
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	if (p->sched_class && p->sched_class->set_cpus_allowed)
		p->sched_class->set_cpus_allowed(p, new_mask);

	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}
4683
4684/*
4685 * This is how migration works:
4686 *
4687 * 1) we invoke migration_cpu_stop() on the target CPU using
4688 * stop_one_cpu().
4689 * 2) stopper starts to run (implicitly forcing the migrated thread
4690 * off the CPU)
4691 * 3) it checks whether the migrated task is still in the wrong runqueue.
4692 * 4) if it's in the wrong runqueue then the migration thread removes
4693 * it and puts it into the right queue.
4694 * 5) stopper completes and stop_one_cpu() returns and the migration
4695 * is done.
4696 */
4697
/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 *
 * Returns 0 on success, -EINVAL if the new mask has no active CPU
 * or the task is PF_THREAD_BOUND and not current.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	unsigned long flags;
	struct rq *rq;
	unsigned int dest_cpu;
	int ret = 0;

	rq = task_rq_lock(p, &flags);

	/* nothing to do if the mask is unchanged */
	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
		ret = -EINVAL;
		goto out;
	}

	/* kthreads pinned via kthread_bind() may only re-affine themselves */
	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
	if (p->on_rq) {
		struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &flags);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	}
out:
	task_rq_unlock(rq, p, &flags);

	return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4750
/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
	struct rq *rq_dest, *rq_src;
	int ret = 0;

	if (unlikely(!cpu_active(dest_cpu)))
		return ret;

	rq_src = cpu_rq(src_cpu);
	rq_dest = cpu_rq(dest_cpu);

	/* pi_lock before the rq locks: same order as task_rq_lock() */
	raw_spin_lock(&p->pi_lock);
	double_rq_lock(rq_src, rq_dest);
	/* Already moved. */
	if (task_cpu(p) != src_cpu)
		goto done;
	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		goto fail;

	/*
	 * If we're not on a rq, the next wake-up will ensure we're
	 * placed properly.
	 */
	if (p->on_rq) {
		dequeue_task(rq_src, p, 0);
		set_task_cpu(p, dest_cpu);
		enqueue_task(rq_dest, p, 0);
		check_preempt_curr(rq_dest, p, 0);
	}
done:
	ret = 1;
fail:
	double_rq_unlock(rq_src, rq_dest);
	raw_spin_unlock(&p->pi_lock);
	return ret;
}
4799
/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;

	/*
	 * The original target cpu might have gone down and we might
	 * be on another cpu but it doesn't matter.
	 */
	local_irq_disable();
	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
	local_irq_enable();
	return 0;
}
4818
4819#ifdef CONFIG_HOTPLUG_CPU
4820
/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 */
void idle_task_exit(void)
{
	struct mm_struct *mm = current->active_mm;

	BUG_ON(cpu_online(smp_processor_id()));

	if (mm != &init_mm)
		switch_mm(mm, &init_mm, current);
	/* drop the reference idle held on its old active_mm */
	mmdrop(mm);
}
4835
4836/*
4837 * Since this CPU is going 'away' for a while, fold any nr_active delta
4838 * we might have. Assumes we're called after migrate_tasks() so that the
4839 * nr_active count is stable.
4840 *
4841 * Also see the comment "Global load-average calculations".
4842 */
4843static void calc_load_migrate(struct rq *rq)
4844{
4845 long delta = calc_load_fold_active(rq);
4846 if (delta)
4847 atomic_long_add(delta, &calc_load_tasks);
4848}
4849
/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we'er in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */
static void migrate_tasks(unsigned int dead_cpu)
{
	struct rq *rq = cpu_rq(dead_cpu);
	struct task_struct *next, *stop = rq->stop;
	int dest_cpu;

	/*
	 * Fudge the rq selection such that the below task selection loop
	 * doesn't get stuck on the currently eligible stop task.
	 *
	 * We're currently inside stop_machine() and the rq is either stuck
	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
	 * either way we should never end up calling schedule() until we're
	 * done here.
	 */
	rq->stop = NULL;

	for ( ; ; ) {
		/*
		 * There's this thread running, bail when that's the only
		 * remaining thread.
		 */
		if (rq->nr_running == 1)
			break;

		next = pick_next_task(rq);
		BUG_ON(!next);
		next->sched_class->put_prev_task(rq, next);

		/* Find suitable destination for @next, with force if needed. */
		dest_cpu = select_fallback_rq(dead_cpu, next);
		/* __migrate_task() takes both rq locks itself; drop ours */
		raw_spin_unlock(&rq->lock);

		__migrate_task(next, dead_cpu, dest_cpu);

		raw_spin_lock(&rq->lock);
	}

	/* restore the stop task hidden above */
	rq->stop = stop;
}
4898
4899#endif /* CONFIG_HOTPLUG_CPU */
4900
4901#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4902
/* /proc/sys/kernel/sched_domain; .child is filled in at register time */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}
};
4910
/* anchor for registering the sched_domain tree under "kernel" */
static struct ctl_table sd_ctl_root[] = {
	{
		.procname	= "kernel",
		.mode		= 0555,
		.child		= sd_ctl_dir,
	},
	{}
};
4919
4920static struct ctl_table *sd_alloc_ctl_entry(int n)
4921{
4922 struct ctl_table *entry =
4923 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4924
4925 return entry;
4926}
4927
/* Recursively free a ctl_table tree built by the sd_alloc_* helpers. */
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
	struct ctl_table *entry;

	/*
	 * In the intermediate directories, both the child directory and
	 * procname are dynamically allocated and could fail but the mode
	 * will always be set. In the lowest directory the names are
	 * static strings and all have proc handlers.
	 */
	for (entry = *tablep; entry->mode; entry++) {
		if (entry->child)
			sd_free_ctl_entry(&entry->child);
		/* leaf entries have handlers and static names: don't free */
		if (entry->proc_handler == NULL)
			kfree(entry->procname);
	}

	kfree(*tablep);
	*tablep = NULL;
}
4948
/* sysctl range clamps for the *_idx sched-domain tunables */
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX;

/*
 * Fill in one sched-domain sysctl entry; @load_idx selects the
 * min/max clamp used by proc_dointvec_minmax for *_idx fields.
 */
static void
set_table_entry(struct ctl_table *entry,
		const char *procname, void *data, int maxlen,
		umode_t mode, proc_handler *proc_handler,
		bool load_idx)
{
	entry->procname = procname;
	entry->data = data;
	entry->maxlen = maxlen;
	entry->mode = mode;
	entry->proc_handler = proc_handler;

	if (load_idx) {
		entry->extra1 = &min_load_idx;
		entry->extra2 = &max_load_idx;
	}
}
4969
/*
 * Build the leaf sysctl table exposing one sched_domain's tunables
 * (intervals, load indexes, flags, name).  Slot 12 stays zeroed as the
 * table terminator; entries passing 'true' are load-index tunables and
 * get min/max bounds from set_table_entry().
 */
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
	struct ctl_table *table = sd_alloc_ctl_entry(13);

	if (table == NULL)
		return NULL;

	set_table_entry(&table[0], "min_interval", &sd->min_interval,
		sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[1], "max_interval", &sd->max_interval,
		sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
		sizeof(int), 0644, proc_dointvec_minmax, true);
	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[9], "cache_nice_tries",
		&sd->cache_nice_tries,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[10], "flags", &sd->flags,
		sizeof(int), 0644, proc_dointvec_minmax, false);
	set_table_entry(&table[11], "name", sd->name,
		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
	/* &table[12] is terminator */

	return table;
}
5007
5008static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5009{
5010 struct ctl_table *entry, *table;
5011 struct sched_domain *sd;
5012 int domain_num = 0, i;
5013 char buf[32];
5014
5015 for_each_domain(cpu, sd)
5016 domain_num++;
5017 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5018 if (table == NULL)
5019 return NULL;
5020
5021 i = 0;
5022 for_each_domain(cpu, sd) {
5023 snprintf(buf, 32, "domain%d", i);
5024 entry->procname = kstrdup(buf, GFP_KERNEL);
5025 entry->mode = 0555;
5026 entry->child = sd_alloc_ctl_domain_table(sd);
5027 entry++;
5028 i++;
5029 }
5030 return table;
5031}
5032
/* Handle of the registered tree; NULL while unregistered. */
static struct ctl_table_header *sd_sysctl_header;
/*
 * Build and register /proc/sys/kernel/sched_domain/cpu%d/domain%d/...
 * for every possible cpu.
 */
static void register_sched_domain_sysctl(void)
{
	int i, cpu_num = num_possible_cpus();
	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
	char buf[32];

	/* Must have been torn down by unregister_sched_domain_sysctl(). */
	WARN_ON(sd_ctl_dir[0].child);
	sd_ctl_dir[0].child = entry;

	if (entry == NULL)
		return;

	for_each_possible_cpu(i) {
		snprintf(buf, 32, "cpu%d", i);
		entry->procname = kstrdup(buf, GFP_KERNEL);
		entry->mode = 0555;
		entry->child = sd_alloc_ctl_cpu_table(i);
		entry++;
	}

	WARN_ON(sd_sysctl_header);
	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
5057
/*
 * Tear down the sched_domain sysctl tree and free all tables;
 * may be called multiple times per register (both steps are guarded).
 */
static void unregister_sched_domain_sysctl(void)
{
	if (sd_sysctl_header)
		unregister_sysctl_table(sd_sysctl_header);
	sd_sysctl_header = NULL;
	if (sd_ctl_dir[0].child)
		sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#else
/* Without CONFIG_SCHED_DEBUG && CONFIG_SYSCTL the sysctl tree is not built. */
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
#endif
5075
5076static void set_rq_online(struct rq *rq)
5077{
5078 if (!rq->online) {
5079 const struct sched_class *class;
5080
5081 cpumask_set_cpu(rq->cpu, rq->rd->online);
5082 rq->online = 1;
5083
5084 for_each_class(class) {
5085 if (class->rq_online)
5086 class->rq_online(rq);
5087 }
5088 }
5089}
5090
5091static void set_rq_offline(struct rq *rq)
5092{
5093 if (rq->online) {
5094 const struct sched_class *class;
5095
5096 for_each_class(class) {
5097 if (class->rq_offline)
5098 class->rq_offline(rq);
5099 }
5100
5101 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5102 rq->online = 0;
5103 }
5104}
5105
5106/*
5107 * migration_call - callback that gets triggered when a CPU is added.
5108 * Here we can start up the necessary migration thread for the new CPU.
5109 */
5110static int __cpuinit
5111migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5112{
5113 int cpu = (long)hcpu;
5114 unsigned long flags;
5115 struct rq *rq = cpu_rq(cpu);
5116
5117 switch (action & ~CPU_TASKS_FROZEN) {
5118
5119 case CPU_UP_PREPARE:
5120 rq->calc_load_update = calc_load_update;
5121 break;
5122
5123 case CPU_ONLINE:
5124 /* Update our root-domain */
5125 raw_spin_lock_irqsave(&rq->lock, flags);
5126 if (rq->rd) {
5127 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5128
5129 set_rq_online(rq);
5130 }
5131 raw_spin_unlock_irqrestore(&rq->lock, flags);
5132 break;
5133
5134#ifdef CONFIG_HOTPLUG_CPU
5135 case CPU_DYING:
5136 sched_ttwu_pending();
5137 /* Update our root-domain */
5138 raw_spin_lock_irqsave(&rq->lock, flags);
5139 if (rq->rd) {
5140 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5141 set_rq_offline(rq);
5142 }
5143 migrate_tasks(cpu);
5144 BUG_ON(rq->nr_running != 1); /* the migration thread */
5145 raw_spin_unlock_irqrestore(&rq->lock, flags);
5146 break;
5147
5148 case CPU_DEAD:
5149 calc_load_migrate(rq);
5150 break;
5151#endif
5152 }
5153
5154 update_max_interval();
5155
5156 return NOTIFY_OK;
5157}
5158
5159/*
5160 * Register at high priority so that task migration (migrate_all_tasks)
5161 * happens before everything else. This has to be lower priority than
5162 * the notifier in the perf_event subsystem, though.
5163 */
5164static struct notifier_block __cpuinitdata migration_notifier = {
5165 .notifier_call = migration_call,
5166 .priority = CPU_PRI_MIGRATION,
5167};
5168
5169static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5170 unsigned long action, void *hcpu)
5171{
5172 switch (action & ~CPU_TASKS_FROZEN) {
5173 case CPU_STARTING:
5174 case CPU_DOWN_FAILED:
5175 set_cpu_active((long)hcpu, true);
5176 return NOTIFY_OK;
5177 default:
5178 return NOTIFY_DONE;
5179 }
5180}
5181
5182static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5183 unsigned long action, void *hcpu)
5184{
5185 switch (action & ~CPU_TASKS_FROZEN) {
5186 case CPU_DOWN_PREPARE:
5187 set_cpu_active((long)hcpu, false);
5188 return NOTIFY_OK;
5189 default:
5190 return NOTIFY_DONE;
5191 }
5192}
5193
/*
 * Wire up scheduler cpu-hotplug handling: replay UP_PREPARE/ONLINE for
 * the already-running boot cpu (it never went through hotplug), then
 * register the migration and active/inactive notifiers.
 */
static int __init migration_init(void)
{
	void *cpu = (void *)(long)smp_processor_id();
	int err;

	/* Initialize migration for the boot CPU */
	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
	BUG_ON(err == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, cpu);
	register_cpu_notifier(&migration_notifier);

	/* Register cpu active notifiers */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
early_initcall(migration_init);
5212#endif
5213
5214#ifdef CONFIG_SMP
5215
static cpumask_var_t sched_domains_tmpmask; /* scratch mask, protected by sched_domains_mutex */
5217
5218#ifdef CONFIG_SCHED_DEBUG
5219
/* Set by the "sched_debug" early param; gates sched_domain_debug() output. */
static __read_mostly int sched_debug_enabled;

static int __init sched_debug_setup(char *str)
{
	sched_debug_enabled = 1;

	return 0;
}
early_param("sched_debug", sched_debug_setup);
5229
/* True when "sched_debug" was passed on the kernel command line. */
static inline bool sched_debug(void)
{
	return sched_debug_enabled;
}
5234
5235static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5236 struct cpumask *groupmask)
5237{
5238 struct sched_group *group = sd->groups;
5239 char str[256];
5240
5241 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5242 cpumask_clear(groupmask);
5243
5244 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5245
5246 if (!(sd->flags & SD_LOAD_BALANCE)) {
5247 printk("does not load-balance\n");
5248 if (sd->parent)
5249 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5250 " has parent");
5251 return -1;
5252 }
5253
5254 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5255
5256 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5257 printk(KERN_ERR "ERROR: domain->span does not contain "
5258 "CPU%d\n", cpu);
5259 }
5260 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5261 printk(KERN_ERR "ERROR: domain->groups does not contain"
5262 " CPU%d\n", cpu);
5263 }
5264
5265 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5266 do {
5267 if (!group) {
5268 printk("\n");
5269 printk(KERN_ERR "ERROR: group is NULL\n");
5270 break;
5271 }
5272
5273 /*
5274 * Even though we initialize ->power to something semi-sane,
5275 * we leave power_orig unset. This allows us to detect if
5276 * domain iteration is still funny without causing /0 traps.
5277 */
5278 if (!group->sgp->power_orig) {
5279 printk(KERN_CONT "\n");
5280 printk(KERN_ERR "ERROR: domain->cpu_power not "
5281 "set\n");
5282 break;
5283 }
5284
5285 if (!cpumask_weight(sched_group_cpus(group))) {
5286 printk(KERN_CONT "\n");
5287 printk(KERN_ERR "ERROR: empty group\n");
5288 break;
5289 }
5290
5291 if (!(sd->flags & SD_OVERLAP) &&
5292 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5293 printk(KERN_CONT "\n");
5294 printk(KERN_ERR "ERROR: repeated CPUs\n");
5295 break;
5296 }
5297
5298 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5299
5300 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5301
5302 printk(KERN_CONT " %s", str);
5303 if (group->sgp->power != SCHED_POWER_SCALE) {
5304 printk(KERN_CONT " (cpu_power = %d)",
5305 group->sgp->power);
5306 }
5307
5308 group = group->next;
5309 } while (group != sd->groups);
5310 printk(KERN_CONT "\n");
5311
5312 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5313 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5314
5315 if (sd->parent &&
5316 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5317 printk(KERN_ERR "ERROR: parent span is not a superset "
5318 "of domain->span\n");
5319 return 0;
5320}
5321
5322static void sched_domain_debug(struct sched_domain *sd, int cpu)
5323{
5324 int level = 0;
5325
5326 if (!sched_debug_enabled)
5327 return;
5328
5329 if (!sd) {
5330 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5331 return;
5332 }
5333
5334 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5335
5336 for (;;) {
5337 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5338 break;
5339 level++;
5340 sd = sd->parent;
5341 if (!sd)
5342 break;
5343 }
5344}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
/* Without CONFIG_SCHED_DEBUG there is never any debug output. */
static inline bool sched_debug(void)
{
	return false;
}
#endif /* CONFIG_SCHED_DEBUG */
5352
5353static int sd_degenerate(struct sched_domain *sd)
5354{
5355 if (cpumask_weight(sched_domain_span(sd)) == 1)
5356 return 1;
5357
5358 /* Following flags need at least 2 groups */
5359 if (sd->flags & (SD_LOAD_BALANCE |
5360 SD_BALANCE_NEWIDLE |
5361 SD_BALANCE_FORK |
5362 SD_BALANCE_EXEC |
5363 SD_SHARE_CPUPOWER |
5364 SD_SHARE_PKG_RESOURCES)) {
5365 if (sd->groups != sd->groups->next)
5366 return 0;
5367 }
5368
5369 /* Following flags don't use groups */
5370 if (sd->flags & (SD_WAKE_AFFINE))
5371 return 0;
5372
5373 return 1;
5374}
5375
/*
 * Can @parent be collapsed into @sd?  Yes when the parent is itself
 * degenerate, or when it spans the same cpus and contributes no flags
 * beyond @sd's own (group-dependent flags don't count if the parent has
 * only one group).
 */
static int
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
	unsigned long cflags = sd->flags, pflags = parent->flags;

	if (sd_degenerate(parent))
		return 1;

	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
		return 0;

	/* Flags needing groups don't count if only 1 group in parent */
	if (parent->groups == parent->groups->next) {
		pflags &= ~(SD_LOAD_BALANCE |
				SD_BALANCE_NEWIDLE |
				SD_BALANCE_FORK |
				SD_BALANCE_EXEC |
				SD_SHARE_CPUPOWER |
				SD_SHARE_PKG_RESOURCES);
		if (nr_node_ids == 1)
			pflags &= ~SD_SERIALIZE;
	}
	/* Any flag set in parent but not in sd? Then they differ. */
	if (~cflags & pflags)
		return 0;

	return 1;
}
5403
/* RCU callback: release a root_domain's cpupri state and cpumasks. */
static void free_rootdomain(struct rcu_head *rcu)
{
	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

	cpupri_cleanup(&rd->cpupri);
	free_cpumask_var(rd->rto_mask);
	free_cpumask_var(rd->online);
	free_cpumask_var(rd->span);
	kfree(rd);
}
5414
/*
 * Attach @rq to root domain @rd, detaching it from its previous root
 * domain first.  The old root domain is freed via RCU once its refcount
 * drops to zero.  Takes rq->lock.
 */
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *old_rd = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		old_rd = rq->rd;

		/* Take the cpu offline in the old domain before leaving it. */
		if (cpumask_test_cpu(rq->cpu, old_rd->online))
			set_rq_offline(rq);

		cpumask_clear_cpu(rq->cpu, old_rd->span);

		/*
		 * If we dont want to free the old_rt yet then
		 * set old_rd to NULL to skip the freeing later
		 * in this function:
		 */
		if (!atomic_dec_and_test(&old_rd->refcount))
			old_rd = NULL;
	}

	atomic_inc(&rd->refcount);
	rq->rd = rd;

	cpumask_set_cpu(rq->cpu, rd->span);
	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
		set_rq_online(rq);

	raw_spin_unlock_irqrestore(&rq->lock, flags);

	if (old_rd)
		call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
5451
/*
 * Initialize @rd's cpumasks and cpupri state.  Returns 0 on success or
 * -ENOMEM; on failure everything allocated so far is unwound via the
 * goto chain.
 */
static int init_rootdomain(struct root_domain *rd)
{
	memset(rd, 0, sizeof(*rd));

	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
		goto out;
	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_online;

	if (cpupri_init(&rd->cpupri) != 0)
		goto free_rto_mask;
	return 0;

free_rto_mask:
	free_cpumask_var(rd->rto_mask);
free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
out:
	return -ENOMEM;
}
5476
5477/*
5478 * By default the system creates a single root-domain with all cpus as
5479 * members (mimicking the global state we have today).
5480 */
5481struct root_domain def_root_domain;
5482
5483static void init_defrootdomain(void)
5484{
5485 init_rootdomain(&def_root_domain);
5486
5487 atomic_set(&def_root_domain.refcount, 1);
5488}
5489
5490static struct root_domain *alloc_rootdomain(void)
5491{
5492 struct root_domain *rd;
5493
5494 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5495 if (!rd)
5496 return NULL;
5497
5498 if (init_rootdomain(rd) != 0) {
5499 kfree(rd);
5500 return NULL;
5501 }
5502
5503 return rd;
5504}
5505
5506static void free_sched_groups(struct sched_group *sg, int free_sgp)
5507{
5508 struct sched_group *tmp, *first;
5509
5510 if (!sg)
5511 return;
5512
5513 first = sg;
5514 do {
5515 tmp = sg->next;
5516
5517 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5518 kfree(sg->sgp);
5519
5520 kfree(sg);
5521 sg = tmp;
5522 } while (sg != first);
5523}
5524
/*
 * RCU callback: free a sched_domain and, when the last reference drops,
 * its groups as well.
 */
static void free_sched_domain(struct rcu_head *rcu)
{
	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

	/*
	 * If its an overlapping domain it has private groups, iterate and
	 * nuke them all.
	 */
	if (sd->flags & SD_OVERLAP) {
		free_sched_groups(sd->groups, 1);
	} else if (atomic_dec_and_test(&sd->groups->ref)) {
		kfree(sd->groups->sgp);
		kfree(sd->groups);
	}
	kfree(sd);
}
5541
/* Schedule @sd for freeing after an RCU grace period; @cpu is unused. */
static void destroy_sched_domain(struct sched_domain *sd, int cpu)
{
	call_rcu(&sd->rcu, free_sched_domain);
}
5546
5547static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5548{
5549 for (; sd; sd = sd->parent)
5550 destroy_sched_domain(sd, cpu);
5551}
5552
5553/*
5554 * Keep a special pointer to the highest sched_domain that has
5555 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
5556 * allows us to avoid some pointer chasing select_idle_sibling().
5557 *
5558 * Also keep a unique ID per domain (we use the first cpu number in
5559 * the cpumask of the domain), this allows us to quickly tell if
5560 * two cpus are in the same cache domain, see cpus_share_cache().
5561 */
5562DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5563DEFINE_PER_CPU(int, sd_llc_id);
5564
5565static void update_top_cache_domain(int cpu)
5566{
5567 struct sched_domain *sd;
5568 int id = cpu;
5569
5570 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5571 if (sd)
5572 id = cpumask_first(sched_domain_span(sd));
5573
5574 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5575 per_cpu(sd_llc_id, cpu) = id;
5576}
5577
5578/*
5579 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5580 * hold the hotplug lock.
5581 */
5582static void
5583cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5584{
5585 struct rq *rq = cpu_rq(cpu);
5586 struct sched_domain *tmp;
5587
5588 /* Remove the sched domains which do not contribute to scheduling. */
5589 for (tmp = sd; tmp; ) {
5590 struct sched_domain *parent = tmp->parent;
5591 if (!parent)
5592 break;
5593
5594 if (sd_parent_degenerate(tmp, parent)) {
5595 tmp->parent = parent->parent;
5596 if (parent->parent)
5597 parent->parent->child = tmp;
5598 destroy_sched_domain(parent, cpu);
5599 } else
5600 tmp = tmp->parent;
5601 }
5602
5603 if (sd && sd_degenerate(sd)) {
5604 tmp = sd;
5605 sd = sd->parent;
5606 destroy_sched_domain(tmp, cpu);
5607 if (sd)
5608 sd->child = NULL;
5609 }
5610
5611 sched_domain_debug(sd, cpu);
5612
5613 rq_attach_root(rq, rd);
5614 tmp = rq->sd;
5615 rcu_assign_pointer(rq->sd, sd);
5616 destroy_sched_domains(tmp, cpu);
5617
5618 update_top_cache_domain(cpu);
5619}
5620
/* cpus with isolated domains, parsed from the isolcpus= boot option */
static cpumask_var_t cpu_isolated_map;

/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
	alloc_bootmem_cpumask_var(&cpu_isolated_map);
	cpulist_parse(str, cpu_isolated_map);
	return 1;
}

__setup("isolcpus=", isolated_cpu_setup);
5633
/* All cpus in @cpu's NUMA node: span of the "CPU" topology level. */
static const struct cpumask *cpu_cpu_mask(int cpu)
{
	return cpumask_of_node(cpu_to_node(cpu));
}
5638
/* Per-topology-level per-cpu storage used while building domains. */
struct sd_data {
	struct sched_domain **__percpu sd;
	struct sched_group **__percpu sg;
	struct sched_group_power **__percpu sgp;
};

/* Scratch state for one build_sched_domains() invocation. */
struct s_data {
	struct sched_domain ** __percpu sd;
	struct root_domain	*rd;
};

/* How far allocation got, so __free_domain_allocs() can unwind. */
enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_storage,
	sa_none,
};
5656
struct sched_domain_topology_level;

/* Constructor and cpu->span mask callback for one topology level. */
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

/* Level needs per-cpu (possibly overlapping) groups; see SD_OVERLAP. */
#define SDTL_OVERLAP	0x01

struct sched_domain_topology_level {
	sched_domain_init_f init;
	sched_domain_mask_f mask;
	int		    flags;
	int		    numa_level;
	struct sd_data      data;
};
5671
5672/*
5673 * Build an iteration mask that can exclude certain CPUs from the upwards
5674 * domain traversal.
5675 *
5676 * Asymmetric node setups can result in situations where the domain tree is of
5677 * unequal depth, make sure to skip domains that already cover the entire
5678 * range.
5679 *
5680 * In that case build_sched_domains() will have terminated the iteration early
5681 * and our sibling sd spans will be empty. Domains should always include the
5682 * cpu they're built on, so check that.
5683 *
5684 */
5685static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5686{
5687 const struct cpumask *span = sched_domain_span(sd);
5688 struct sd_data *sdd = sd->private;
5689 struct sched_domain *sibling;
5690 int i;
5691
5692 for_each_cpu(i, span) {
5693 sibling = *per_cpu_ptr(sdd->sd, i);
5694 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5695 continue;
5696
5697 cpumask_set_cpu(i, sched_group_mask(sg));
5698 }
5699}
5700
5701/*
5702 * Return the canonical balance cpu for this group, this is the first cpu
5703 * of this group that's also in the iteration mask.
5704 */
5705int group_balance_cpu(struct sched_group *sg)
5706{
5707 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5708}
5709
/*
 * Build the group list for an SD_OVERLAP domain: one group per uncovered
 * cpu in @sd's span, each group spanning that cpu's child domain.  The
 * list head must contain @cpu's balance cpu (see below).  Returns 0 or
 * -ENOMEM (partially built groups are freed on failure).
 */
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
	const struct cpumask *span = sched_domain_span(sd);
	struct cpumask *covered = sched_domains_tmpmask;
	struct sd_data *sdd = sd->private;
	struct sched_domain *child;
	int i;

	cpumask_clear(covered);

	for_each_cpu(i, span) {
		struct cpumask *sg_span;

		if (cpumask_test_cpu(i, covered))
			continue;

		child = *per_cpu_ptr(sdd->sd, i);

		/* See the comment near build_group_mask(). */
		if (!cpumask_test_cpu(i, sched_domain_span(child)))
			continue;

		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
				GFP_KERNEL, cpu_to_node(cpu));

		if (!sg)
			goto fail;

		sg_span = sched_group_cpus(sg);
		if (child->child) {
			child = child->child;
			cpumask_copy(sg_span, sched_domain_span(child));
		} else
			cpumask_set_cpu(i, sg_span);

		cpumask_or(covered, covered, sg_span);

		/* First taker of the shared sgp builds its iteration mask. */
		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
		if (atomic_inc_return(&sg->sgp->ref) == 1)
			build_group_mask(sd, sg);

		/*
		 * Initialize sgp->power such that even if we mess up the
		 * domains and no possible iteration will get us here, we won't
		 * die on a /0 trap.
		 */
		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);

		/*
		 * Make sure the first group of this domain contains the
		 * canonical balance cpu. Otherwise the sched_domain iteration
		 * breaks. See update_sg_lb_stats().
		 */
		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
		    group_balance_cpu(sg) == cpu)
			groups = sg;

		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
		last->next = first;
	}
	sd->groups = groups;

	return 0;

fail:
	free_sched_groups(first, 0);

	return -ENOMEM;
}
5785
/*
 * Map @cpu to its group's representative cpu (the first cpu of its
 * child domain's span).  When @sg is non-NULL, also hand back that
 * representative cpu's group with its sgp hooked up and its refcount
 * primed for claim_allocations().
 */
static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;

	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	if (sg) {
		*sg = *per_cpu_ptr(sdd->sg, cpu);
		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
	}

	return cpu;
}
5802
5803/*
5804 * build_sched_groups will build a circular linked list of the groups
5805 * covered by the given span, and will set each group's ->cpumask correctly,
5806 * and ->cpu_power to 0.
5807 *
5808 * Assumes the sched_domain tree is fully constructed
5809 */
5810static int
5811build_sched_groups(struct sched_domain *sd, int cpu)
5812{
5813 struct sched_group *first = NULL, *last = NULL;
5814 struct sd_data *sdd = sd->private;
5815 const struct cpumask *span = sched_domain_span(sd);
5816 struct cpumask *covered;
5817 int i;
5818
5819 get_group(cpu, sdd, &sd->groups);
5820 atomic_inc(&sd->groups->ref);
5821
5822 if (cpu != cpumask_first(sched_domain_span(sd)))
5823 return 0;
5824
5825 lockdep_assert_held(&sched_domains_mutex);
5826 covered = sched_domains_tmpmask;
5827
5828 cpumask_clear(covered);
5829
5830 for_each_cpu(i, span) {
5831 struct sched_group *sg;
5832 int group = get_group(i, sdd, &sg);
5833 int j;
5834
5835 if (cpumask_test_cpu(i, covered))
5836 continue;
5837
5838 cpumask_clear(sched_group_cpus(sg));
5839 sg->sgp->power = 0;
5840 cpumask_setall(sched_group_mask(sg));
5841
5842 for_each_cpu(j, span) {
5843 if (get_group(j, sdd, NULL) != group)
5844 continue;
5845
5846 cpumask_set_cpu(j, covered);
5847 cpumask_set_cpu(j, sched_group_cpus(sg));
5848 }
5849
5850 if (!first)
5851 first = sg;
5852 if (last)
5853 last->next = sg;
5854 last = sg;
5855 }
5856 last->next = first;
5857
5858 return 0;
5859}
5860
5861/*
5862 * Initialize sched groups cpu_power.
5863 *
5864 * cpu_power indicates the capacity of sched group, which is used while
5865 * distributing the load between different sched groups in a sched domain.
5866 * Typically cpu_power for all the groups in a sched domain will be same unless
5867 * there are asymmetries in the topology. If there are asymmetries, group
5868 * having more cpu_power will pickup more load compared to the group having
5869 * less cpu_power.
5870 */
5871static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5872{
5873 struct sched_group *sg = sd->groups;
5874
5875 WARN_ON(!sd || !sg);
5876
5877 do {
5878 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5879 sg = sg->next;
5880 } while (sg != sd->groups);
5881
5882 if (cpu != group_balance_cpu(sg))
5883 return;
5884
5885 update_group_power(sd, cpu);
5886 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5887}
5888
5889int __weak arch_sd_sibling_asym_packing(void)
5890{
5891 return 0*SD_ASYM_PACKING;
5892}
5893
5894/*
5895 * Initializers for schedule domains
5896 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5897 */
5898
5899#ifdef CONFIG_SCHED_DEBUG
5900# define SD_INIT_NAME(sd, type) sd->name = #type
5901#else
5902# define SD_INIT_NAME(sd, type) do { } while (0)
5903#endif
5904
/*
 * Define sd_init_<type>(): fill the per-cpu sched_domain from the
 * SD_<type>_INIT template and hook it to its topology-level data.
 */
#define SD_INIT_FUNC(type)						\
static noinline struct sched_domain *					\
sd_init_##type(struct sched_domain_topology_level *tl, int cpu)	\
{									\
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\
	*sd = SD_##type##_INIT;						\
	SD_INIT_NAME(sd, type);						\
	sd->private = &tl->data;				\
	return sd;							\
}

SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_SMT
 SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
#endif
#ifdef CONFIG_SCHED_BOOK
 SD_INIT_FUNC(BOOK)
#endif
5926
/* Default for sched_domain_attr::relax_domain_level; see set_domain_attribute(). */
static int default_relax_domain_level = -1;
int sched_domain_level_max;

static int __init setup_relax_domain_level(char *str)
{
	if (kstrtoint(str, 0, &default_relax_domain_level))
		pr_warn("Unable to set relax_domain_level\n");

	return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);
5938
5939static void set_domain_attribute(struct sched_domain *sd,
5940 struct sched_domain_attr *attr)
5941{
5942 int request;
5943
5944 if (!attr || attr->relax_domain_level < 0) {
5945 if (default_relax_domain_level < 0)
5946 return;
5947 else
5948 request = default_relax_domain_level;
5949 } else
5950 request = attr->relax_domain_level;
5951 if (request < sd->level) {
5952 /* turn off idle balance on this domain */
5953 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5954 } else {
5955 /* turn on idle balance on this domain */
5956 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5957 }
5958}
5959
5960static void __sdt_free(const struct cpumask *cpu_map);
5961static int __sdt_alloc(const struct cpumask *cpu_map);
5962
/*
 * Unwind __visit_domain_allocation_hell() down to the stage given by
 * @what.  Cases intentionally fall through: each stage frees its own
 * allocation and then everything allocated before it.
 */
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu); /* fall through */
	case sa_sd:
		free_percpu(d->sd); /* fall through */
	case sa_sd_storage:
		__sdt_free(cpu_map); /* fall through */
	case sa_none:
		break;
	}
}
5978
/*
 * Allocate everything build_sched_domains() needs.  Returns the stage
 * reached (sa_rootdomain on full success); on partial failure the
 * return value tells __free_domain_allocs() how much to unwind.
 */
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
						   const struct cpumask *cpu_map)
{
	memset(d, 0, sizeof(*d));

	if (__sdt_alloc(cpu_map))
		return sa_sd_storage;
	d->sd = alloc_percpu(struct sched_domain *);
	if (!d->sd)
		return sa_sd_storage;
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;
	return sa_rootdomain;
}
5994
5995/*
5996 * NULL the sd_data elements we've used to build the sched_domain and
5997 * sched_group structure so that the subsequent __free_domain_allocs()
5998 * will not free the data we're using.
5999 */
6000static void claim_allocations(int cpu, struct sched_domain *sd)
6001{
6002 struct sd_data *sdd = sd->private;
6003
6004 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6005 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6006
6007 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6008 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6009
6010 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6011 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6012}
6013
#ifdef CONFIG_SCHED_SMT
/* Hardware threads sharing @cpu's core: span of the SIBLING level. */
static const struct cpumask *cpu_smt_mask(int cpu)
{
	return topology_thread_cpumask(cpu);
}
#endif
6020
6021/*
6022 * Topology list, bottom-up.
6023 */
6024static struct sched_domain_topology_level default_topology[] = {
6025#ifdef CONFIG_SCHED_SMT
6026 { sd_init_SIBLING, cpu_smt_mask, },
6027#endif
6028#ifdef CONFIG_SCHED_MC
6029 { sd_init_MC, cpu_coregroup_mask, },
6030#endif
6031#ifdef CONFIG_SCHED_BOOK
6032 { sd_init_BOOK, cpu_book_mask, },
6033#endif
6034 { sd_init_CPU, cpu_cpu_mask, },
6035 { NULL, },
6036};
6037
6038static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6039
6040#ifdef CONFIG_NUMA
6041
/* Filled in by sched_init_numa() from the node_distance() table. */
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level; /* set by sd_numa_init() for sd_numa_mask() */
6046
6047static inline int sd_local_flags(int level)
6048{
6049 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6050 return 0;
6051
6052 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6053}
6054
/*
 * Construct the sched_domain for one NUMA topology level: balancing is
 * serialized (SD_SERIALIZE) and exec/fork/wake-affine balancing only
 * enabled for close nodes via sd_local_flags().
 */
static struct sched_domain *
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
	int level = tl->numa_level;
	int sd_weight = cpumask_weight(
			sched_domains_numa_masks[level][cpu_to_node(cpu)]);

	*sd = (struct sched_domain){
		.min_interval		= sd_weight,
		.max_interval		= 2*sd_weight,
		.busy_factor		= 32,
		.imbalance_pct		= 125,
		.cache_nice_tries	= 2,
		.busy_idx		= 3,
		.idle_idx		= 2,
		.newidle_idx		= 0,
		.wake_idx		= 0,
		.forkexec_idx		= 0,

		/* 0*/1* multipliers document which flags are off/on. */
		.flags			= 1*SD_LOAD_BALANCE
					| 1*SD_BALANCE_NEWIDLE
					| 0*SD_BALANCE_EXEC
					| 0*SD_BALANCE_FORK
					| 0*SD_BALANCE_WAKE
					| 0*SD_WAKE_AFFINE
					| 0*SD_SHARE_CPUPOWER
					| 0*SD_SHARE_PKG_RESOURCES
					| 1*SD_SERIALIZE
					| 0*SD_PREFER_SIBLING
					| sd_local_flags(level)
					,
		.last_balance		= jiffies,
		.balance_interval	= sd_weight,
	};
	SD_INIT_NAME(sd, NUMA);
	sd->private = &tl->data;

	/*
	 * Ugly hack to pass state to sd_numa_mask()...
	 */
	sched_domains_curr_level = tl->numa_level;

	return sd;
}
6100
/* Span of the NUMA level currently being built (sched_domains_curr_level). */
static const struct cpumask *sd_numa_mask(int cpu)
{
	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
}
6105
6106static void sched_numa_warn(const char *str)
6107{
6108 static int done = false;
6109 int i,j;
6110
6111 if (done)
6112 return;
6113
6114 done = true;
6115
6116 printk(KERN_WARNING "ERROR: %s\n\n", str);
6117
6118 for (i = 0; i < nr_node_ids; i++) {
6119 printk(KERN_WARNING " ");
6120 for (j = 0; j < nr_node_ids; j++)
6121 printk(KERN_CONT "%02d ", node_distance(i,j));
6122 printk(KERN_CONT "\n");
6123 }
6124 printk(KERN_WARNING "\n");
6125}
6126
6127static bool find_numa_distance(int distance)
6128{
6129 int i;
6130
6131 if (distance == node_distance(0, 0))
6132 return true;
6133
6134 for (i = 0; i < sched_domains_numa_levels; i++) {
6135 if (sched_domains_numa_distance[i] == distance)
6136 return true;
6137 }
6138
6139 return false;
6140}
6141
/*
 * sched_init_numa - discover the unique NUMA node distances and extend the
 * sched-domain topology with one extra level per distance.
 *
 * Boot-time only.  On any allocation failure we return early; by keeping
 * sched_domains_numa_levels at 0 until the very end, nobody ever walks a
 * partially constructed sched_domains_numa_masks[][] table.
 */
6142static void sched_init_numa(void)
6143{
6144	int next_distance, curr_distance = node_distance(0, 0);
6145	struct sched_domain_topology_level *tl;
6146	int level = 0;
6147	int i, j, k;
6148
6149	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6150	if (!sched_domains_numa_distance)
6151		return;
6152
6153	/*
6154	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6155	 * unique distances in the node_distance() table.
6156	 *
6157	 * Assumes node_distance(0,j) includes all distances in
6158	 * node_distance(i,j) in order to avoid cubic time.
6159	 */
6160	next_distance = curr_distance;
6161	for (i = 0; i < nr_node_ids; i++) {
6162		for (j = 0; j < nr_node_ids; j++) {
6163			for (k = 0; k < nr_node_ids; k++) {
6164				int distance = node_distance(i, k);
6165
				/* smallest distance strictly greater than curr_distance */
6166				if (distance > curr_distance &&
6167				    (distance < next_distance ||
6168				     next_distance == curr_distance))
6169					next_distance = distance;
6170
6171				/*
6172				 * While not a strong assumption it would be nice to know
6173				 * about cases where if node A is connected to B, B is not
6174				 * equally connected to A.
6175				 */
6176				if (sched_debug() && node_distance(k, i) != distance)
6177					sched_numa_warn("Node-distance not symmetric");
6178
6179				if (sched_debug() && i && !find_numa_distance(distance))
6180					sched_numa_warn("Node-0 not representative");
6181			}
6182			if (next_distance != curr_distance) {
6183				sched_domains_numa_distance[level++] = next_distance;
6184				sched_domains_numa_levels = level;
6185				curr_distance = next_distance;
6186			} else break;
6187		}
6188
6189		/*
6190		 * In case of sched_debug() we verify the above assumption.
6191		 */
6192		if (!sched_debug())
6193			break;
6194	}
6195	/*
6196	 * 'level' contains the number of unique distances, excluding the
6197	 * identity distance node_distance(i,i).
6198	 *
6199	 * The sched_domains_numa_distance[] array includes the actual distance
6200	 * numbers.
6201	 */
6202
6203	/*
6204	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6205	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6206	 * the array will contain less then 'level' members. This could be
6207	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6208	 * in other functions.
6209	 *
6210	 * We reset it to 'level' at the end of this function.
6211	 */
6212	sched_domains_numa_levels = 0;
6213
6214	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6215	if (!sched_domains_numa_masks)
6216		return;
6217
6218	/*
6219	 * Now for each level, construct a mask per node which contains all
6220	 * cpus of nodes that are that many hops away from us.
6221	 */
6222	for (i = 0; i < level; i++) {
6223		sched_domains_numa_masks[i] =
6224			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6225		if (!sched_domains_numa_masks[i])
6226			return;
6227
6228		for (j = 0; j < nr_node_ids; j++) {
6229			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6230			if (!mask)
6231				return;
6232
6233			sched_domains_numa_masks[i][j] = mask;
6234
			/* mask = union of cpus of all nodes within this level's distance of j */
6235			for (k = 0; k < nr_node_ids; k++) {
6236				if (node_distance(j, k) > sched_domains_numa_distance[i])
6237					continue;
6238
6239				cpumask_or(mask, mask, cpumask_of_node(k));
6240			}
6241		}
6242	}
6243
6244	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6245			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6246	if (!tl)
6247		return;
6248
6249	/*
6250	 * Copy the default topology bits..
6251	 */
6252	for (i = 0; default_topology[i].init; i++)
6253		tl[i] = default_topology[i];
6254
6255	/*
6256	 * .. and append 'j' levels of NUMA goodness.
6257	 */
6258	for (j = 0; j < level; i++, j++) {
6259		tl[i] = (struct sched_domain_topology_level){
6260			.init = sd_numa_init,
6261			.mask = sd_numa_mask,
6262			.flags = SDTL_OVERLAP,
6263			.numa_level = j,
6264		};
6265	}
6266
6267	sched_domain_topology = tl;
6268
	/* Everything is in place: publish the level count (see comment above). */
6269	sched_domains_numa_levels = level;
6270}
6271
6272static void sched_domains_numa_masks_set(int cpu)
6273{
6274 int i, j;
6275 int node = cpu_to_node(cpu);
6276
6277 for (i = 0; i < sched_domains_numa_levels; i++) {
6278 for (j = 0; j < nr_node_ids; j++) {
6279 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6280 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6281 }
6282 }
6283}
6284
6285static void sched_domains_numa_masks_clear(int cpu)
6286{
6287 int i, j;
6288 for (i = 0; i < sched_domains_numa_levels; i++) {
6289 for (j = 0; j < nr_node_ids; j++)
6290 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6291 }
6292}
6293
6294/*
6295 * Update sched_domains_numa_masks[level][node] array when new cpus
6296 * are onlined.
6297 */
6298static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6299 unsigned long action,
6300 void *hcpu)
6301{
6302 int cpu = (long)hcpu;
6303
6304 switch (action & ~CPU_TASKS_FROZEN) {
6305 case CPU_ONLINE:
6306 sched_domains_numa_masks_set(cpu);
6307 break;
6308
6309 case CPU_DEAD:
6310 sched_domains_numa_masks_clear(cpu);
6311 break;
6312
6313 default:
6314 return NOTIFY_DONE;
6315 }
6316
6317 return NOTIFY_OK;
6318}
6319#else
/* !CONFIG_NUMA: NUMA-aware sched-domain setup is a no-op. */
6320static inline void sched_init_numa(void)
6321{
6322}
6323
/* !CONFIG_NUMA stub: nothing to update on hotplug; 0 == NOTIFY_DONE. */
6324static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6325					   unsigned long action,
6326					   void *hcpu)
6327{
6328	return 0;
6329}
6330#endif /* CONFIG_NUMA */
6331
/*
 * __sdt_alloc - pre-allocate per-cpu sched_domain, sched_group and
 * sched_group_power storage for every topology level.
 *
 * Returns 0 on success, -ENOMEM on any failure.  On failure the partially
 * populated sd_data is torn down by the caller via __sdt_free(), which
 * tolerates NULL per-cpu pointers (see the NULL checks there).
 */
6332static int __sdt_alloc(const struct cpumask *cpu_map)
6333{
6334	struct sched_domain_topology_level *tl;
6335	int j;
6336
6337	for (tl = sched_domain_topology; tl->init; tl++) {
6338		struct sd_data *sdd = &tl->data;
6339
6340		sdd->sd = alloc_percpu(struct sched_domain *);
6341		if (!sdd->sd)
6342			return -ENOMEM;
6343
6344		sdd->sg = alloc_percpu(struct sched_group *);
6345		if (!sdd->sg)
6346			return -ENOMEM;
6347
6348		sdd->sgp = alloc_percpu(struct sched_group_power *);
6349		if (!sdd->sgp)
6350			return -ENOMEM;
6351
6352		for_each_cpu(j, cpu_map) {
6353			struct sched_domain *sd;
6354			struct sched_group *sg;
6355			struct sched_group_power *sgp;
6356
			/* trailing cpumask_size() holds the structure's embedded span mask */
6357		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6358					GFP_KERNEL, cpu_to_node(j));
6359			if (!sd)
6360				return -ENOMEM;
6361
6362			*per_cpu_ptr(sdd->sd, j) = sd;
6363
6364			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6365					GFP_KERNEL, cpu_to_node(j));
6366			if (!sg)
6367				return -ENOMEM;
6368
			/* groups form a circular list; start out self-linked */
6369			sg->next = sg;
6370
6371			*per_cpu_ptr(sdd->sg, j) = sg;
6372
6373			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6374					GFP_KERNEL, cpu_to_node(j));
6375			if (!sgp)
6376				return -ENOMEM;
6377
6378			*per_cpu_ptr(sdd->sgp, j) = sgp;
6379		}
6380	}
6381
6382	return 0;
6383}
6384
/*
 * __sdt_free - release everything __sdt_alloc() built, for all topology
 * levels and all cpus in @cpu_map.  Safe on partially-allocated state:
 * every per-cpu array is NULL-checked before being dereferenced.
 */
6385static void __sdt_free(const struct cpumask *cpu_map)
6386{
6387	struct sched_domain_topology_level *tl;
6388	int j;
6389
6390	for (tl = sched_domain_topology; tl->init; tl++) {
6391		struct sd_data *sdd = &tl->data;
6392
6393		for_each_cpu(j, cpu_map) {
6394			struct sched_domain *sd;
6395
6396			if (sdd->sd) {
6397				sd = *per_cpu_ptr(sdd->sd, j);
				/* overlapping domains own their group lists; free those too */
6398				if (sd && (sd->flags & SD_OVERLAP))
6399					free_sched_groups(sd->groups, 0);
6400				kfree(*per_cpu_ptr(sdd->sd, j));
6401			}
6402
6403			if (sdd->sg)
6404				kfree(*per_cpu_ptr(sdd->sg, j));
6405			if (sdd->sgp)
6406				kfree(*per_cpu_ptr(sdd->sgp, j));
6407		}
6408		free_percpu(sdd->sd);
6409		sdd->sd = NULL;
6410		free_percpu(sdd->sg);
6411		sdd->sg = NULL;
6412		free_percpu(sdd->sgp);
6413		sdd->sgp = NULL;
6414	}
6415}
6416
/*
 * build_sched_domain - instantiate one sched_domain for @cpu at topology
 * level @tl and link it above @child.
 *
 * Returns the new domain, or @child unchanged when tl->init() yields no
 * domain for this level.  The domain's span is the level's mask clipped
 * to @cpu_map.
 */
6417struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6418		struct s_data *d, const struct cpumask *cpu_map,
6419		struct sched_domain_attr *attr, struct sched_domain *child,
6420		int cpu)
6421{
6422	struct sched_domain *sd = tl->init(tl, cpu);
6423	if (!sd)
6424		return child;
6425
6426	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6427	if (child) {
6428		sd->level = child->level + 1;
6429		sched_domain_level_max = max(sched_domain_level_max, sd->level);
6430		child->parent = sd;
6431	}
6432	sd->child = child;
6433	set_domain_attribute(sd, attr);
6434
6435	return sd;
6436}
6437
6438/*
6439 * Build sched domains for a given set of cpus and attach the sched domains
6440 * to the individual cpus
6441 */
6442static int build_sched_domains(const struct cpumask *cpu_map,
6443			       struct sched_domain_attr *attr)
6444{
6445	enum s_alloc alloc_state = sa_none;
6446	struct sched_domain *sd;
6447	struct s_data d;
6448	int i, ret = -ENOMEM;
6449
6450	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6451	if (alloc_state != sa_rootdomain)
6452		goto error;
6453
6454	/* Set up domains for cpus specified by the cpu_map. */
6455	for_each_cpu(i, cpu_map) {
6456		struct sched_domain_topology_level *tl;
6457
6458		sd = NULL;
		/* build bottom-up; stop once a level already spans all of cpu_map */
6459		for (tl = sched_domain_topology; tl->init; tl++) {
6460			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6461			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6462				sd->flags |= SD_OVERLAP;
6463			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6464				break;
6465		}
6466
		/* record this cpu's lowest-level domain */
6467		while (sd->child)
6468			sd = sd->child;
6469
6470		*per_cpu_ptr(d.sd, i) = sd;
6471	}
6472
6473	/* Build the groups for the domains */
6474	for_each_cpu(i, cpu_map) {
6475		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6476			sd->span_weight = cpumask_weight(sched_domain_span(sd));
6477			if (sd->flags & SD_OVERLAP) {
6478				if (build_overlap_sched_groups(sd, i))
6479					goto error;
6480			} else {
6481				if (build_sched_groups(sd, i))
6482					goto error;
6483			}
6484		}
6485	}
6486
6487	/* Calculate CPU power for physical packages and nodes */
6488	for (i = nr_cpumask_bits-1; i >= 0; i--) {
6489		if (!cpumask_test_cpu(i, cpu_map))
6490			continue;
6491
6492		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6493			claim_allocations(i, sd);
6494			init_sched_groups_power(i, sd);
6495		}
6496	}
6497
6498	/* Attach the domains */
6499	rcu_read_lock();
6500	for_each_cpu(i, cpu_map) {
6501		sd = *per_cpu_ptr(d.sd, i);
6502		cpu_attach_domain(sd, d.rd, i);
6503	}
6504	rcu_read_unlock();
6505
	/*
	 * Success falls through to 'error' on purpose: __free_domain_allocs()
	 * then releases only the scratch state still owned by 'd'
	 * (claim_allocations() presumably transferred the rest -- see it).
	 */
6506	ret = 0;
6507error:
6508	__free_domain_allocs(&d, alloc_state, cpu_map);
6509	return ret;
6510}
6511
6512static cpumask_var_t *doms_cur;	/* current sched domains */
6513static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
6514static struct sched_domain_attr *dattr_cur;
6515				/* attributes of custom domains in 'doms_cur' */
6516
6517/*
6518 * Special case: If a kmalloc of a doms_cur partition (array of
6519 * cpumask) fails, then fallback to a single sched domain,
6520 * as determined by the single cpumask fallback_doms.
6521 */
6522static cpumask_var_t fallback_doms;
6523
6524/*
6525 * arch_update_cpu_topology lets virtualized architectures update the
6526 * cpu core maps. It is supposed to return 1 if the topology changed
6527 * or 0 if it stayed the same.
6528 */
/* Weak default: no architecture hook present, topology never changes. */
6529int __attribute__((weak)) arch_update_cpu_topology(void)
6530{
6531	return 0;
6532}
6533
6534cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6535{
6536 int i;
6537 cpumask_var_t *doms;
6538
6539 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6540 if (!doms)
6541 return NULL;
6542 for (i = 0; i < ndoms; i++) {
6543 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6544 free_sched_domains(doms, i);
6545 return NULL;
6546 }
6547 }
6548 return doms;
6549}
6550
6551void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6552{
6553 unsigned int i;
6554 for (i = 0; i < ndoms; i++)
6555 free_cpumask_var(doms[i]);
6556 kfree(doms);
6557}
6558
6559/*
6560 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6561 * For now this just excludes isolated cpus, but could be used to
6562 * exclude other special cases in the future.
6563 */
6564static int init_sched_domains(const struct cpumask *cpu_map)
6565{
6566	int err;
6567
6568	arch_update_cpu_topology();
6569	ndoms_cur = 1;
6570	doms_cur = alloc_sched_domains(ndoms_cur);
6571	if (!doms_cur)
		/* allocation failed: fall back to the static single-domain mask */
6572		doms_cur = &fallback_doms;
6573	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6574	err = build_sched_domains(doms_cur[0], NULL);
6575	register_sched_domain_sysctl();
6576
6577	return err;
6578}
6579
6580/*
6581 * Detach sched domains from a group of cpus specified in cpu_map
6582 * These cpus will now be attached to the NULL domain
6583 */
6584static void detach_destroy_domains(const struct cpumask *cpu_map)
6585{
6586 int i;
6587
6588 rcu_read_lock();
6589 for_each_cpu(i, cpu_map)
6590 cpu_attach_domain(NULL, &def_root_domain, i);
6591 rcu_read_unlock();
6592}
6593
6594/* handle null as "default" */
6595static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6596 struct sched_domain_attr *new, int idx_new)
6597{
6598 struct sched_domain_attr tmp;
6599
6600 /* fast path */
6601 if (!new && !cur)
6602 return 1;
6603
6604 tmp = SD_ATTR_INIT;
6605 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6606 new ? (new + idx_new) : &tmp,
6607 sizeof(struct sched_domain_attr));
6608}
6609
6610/*
6611 * Partition sched domains as specified by the 'ndoms_new'
6612 * cpumasks in the array doms_new[] of cpumasks. This compares
6613 * doms_new[] to the current sched domain partitioning, doms_cur[].
6614 * It destroys each deleted domain and builds each new domain.
6615 *
6616 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6617 * The masks don't intersect (don't overlap.) We should setup one
6618 * sched domain for each mask. CPUs not in any of the cpumasks will
6619 * not be load balanced. If the same cpumask appears both in the
6620 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6621 * it as it is.
6622 *
6623 * The passed in 'doms_new' should be allocated using
6624 * alloc_sched_domains. This routine takes ownership of it and will
6625 * free_sched_domains it when done with it. If the caller failed the
6626 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6627 * and partition_sched_domains() will fallback to the single partition
6628 * 'fallback_doms', it also forces the domains to be rebuilt.
6629 *
6630 * If doms_new == NULL it will be replaced with cpu_online_mask.
6631 * ndoms_new == 0 is a special case for destroying existing domains,
6632 * and it will not create the default domain.
6633 *
6634 * Call with hotplug lock held
6635 */
6636void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6637			     struct sched_domain_attr *dattr_new)
6638{
6639	int i, j, n;
6640	int new_topology;
6641
6642	mutex_lock(&sched_domains_mutex);
6643
6644	/* always unregister in case we don't destroy any domains */
6645	unregister_sched_domain_sysctl();
6646
6647	/* Let architecture update cpu core mappings. */
6648	new_topology = arch_update_cpu_topology();
6649
	/* n == 0 when doms_new is NULL so the match loop never walks it */
6650	n = doms_new ? ndoms_new : 0;
6651
6652	/* Destroy deleted domains */
6653	for (i = 0; i < ndoms_cur; i++) {
6654		for (j = 0; j < n && !new_topology; j++) {
6655			if (cpumask_equal(doms_cur[i], doms_new[j])
6656			    && dattrs_equal(dattr_cur, i, dattr_new, j))
6657				goto match1;
6658		}
6659		/* no match - a current sched domain not in new doms_new[] */
6660		detach_destroy_domains(doms_cur[i]);
6661match1:
6662		;
6663	}
6664
6665	if (doms_new == NULL) {
		/* fall back to one domain spanning all active, non-isolated cpus */
6666		ndoms_cur = 0;
6667		doms_new = &fallback_doms;
6668		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6669		WARN_ON_ONCE(dattr_new);
6670	}
6671
6672	/* Build new domains */
6673	for (i = 0; i < ndoms_new; i++) {
6674		for (j = 0; j < ndoms_cur && !new_topology; j++) {
6675			if (cpumask_equal(doms_new[i], doms_cur[j])
6676			    && dattrs_equal(dattr_new, i, dattr_cur, j))
6677				goto match2;
6678		}
6679		/* no match - add a new doms_new */
6680		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6681match2:
6682		;
6683	}
6684
6685	/* Remember the new sched domains */
6686	if (doms_cur != &fallback_doms)
6687		free_sched_domains(doms_cur, ndoms_cur);
6688	kfree(dattr_cur);	/* kfree(NULL) is safe */
6689	doms_cur = doms_new;
6690	dattr_cur = dattr_new;
6691	ndoms_cur = ndoms_new;
6692
6693	register_sched_domain_sysctl();
6694
6695	mutex_unlock(&sched_domains_mutex);
6696}
6697
6698static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
6699
6700/*
6701 * Update cpusets according to cpu_active mask. If cpusets are
6702 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6703 * around partition_sched_domains().
6704 *
6705 * If we come here as part of a suspend/resume, don't touch cpusets because we
6706 * want to restore it back to its original state upon resume anyway.
6707 */
6708static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6709			     void *hcpu)
6710{
6711	switch (action) {
6712	case CPU_ONLINE_FROZEN:
6713	case CPU_DOWN_FAILED_FROZEN:
6714
6715		/*
6716		 * num_cpus_frozen tracks how many CPUs are involved in suspend
6717		 * resume sequence. As long as this is not the last online
6718		 * operation in the resume sequence, just build a single sched
6719		 * domain, ignoring cpusets.
6720		 */
6721		num_cpus_frozen--;
6722		if (likely(num_cpus_frozen)) {
6723			partition_sched_domains(1, NULL, NULL);
6724			break;
6725		}
6726
6727		/*
6728		 * This is the last CPU online operation. So fall through and
6729		 * restore the original sched domains by considering the
6730		 * cpuset configurations.
6731		 */
		/* fall through */
6732
6733	case CPU_ONLINE:
6734	case CPU_DOWN_FAILED:
6735		cpuset_update_active_cpus(true);
6736		break;
6737	default:
6738		return NOTIFY_DONE;
6739	}
6740	return NOTIFY_OK;
6741}
6742
6743static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6744 void *hcpu)
6745{
6746 switch (action) {
6747 case CPU_DOWN_PREPARE:
6748 cpuset_update_active_cpus(false);
6749 break;
6750 case CPU_DOWN_PREPARE_FROZEN:
6751 num_cpus_frozen++;
6752 partition_sched_domains(1, NULL, NULL);
6753 break;
6754 default:
6755 return NOTIFY_DONE;
6756 }
6757 return NOTIFY_OK;
6758}
6759
/*
 * sched_init_smp - second-stage scheduler init on SMP: discover NUMA
 * levels, build the initial sched domains, register hotplug notifiers
 * and move init off any isolated cpu.
 */
6760void __init sched_init_smp(void)
6761{
6762	cpumask_var_t non_isolated_cpus;
6763
	/* NOTE(review): alloc results unchecked -- boot-time, assumed to succeed */
6764	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6765	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6766
6767	sched_init_numa();
6768
6769	get_online_cpus();
6770	mutex_lock(&sched_domains_mutex);
6771	init_sched_domains(cpu_active_mask);
6772	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6773	if (cpumask_empty(non_isolated_cpus))
6774		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6775	mutex_unlock(&sched_domains_mutex);
6776	put_online_cpus();
6777
6778	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6779	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6780	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6781
6782	/* RT runtime code needs to handle some hotplug events */
6783	hotcpu_notifier(update_runtime, 0);
6784
6785	init_hrtick();
6786
6787	/* Move init over to a non-isolated CPU */
6788	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6789		BUG();
6790	sched_init_granularity();
6791	free_cpumask_var(non_isolated_cpus);
6792
6793	init_sched_rt_class();
6794}
6795#else
/* UP build: no domains to construct, just set up scheduling granularity. */
6796void __init sched_init_smp(void)
6797{
6798	sched_init_granularity();
6799}
6800#endif /* CONFIG_SMP */
6801
/* 1 = allow timers to migrate between cpus (tunable when const_debug is rw) */
6802const_debug unsigned int sysctl_timer_migration = 1;
6803
/*
 * Is @addr inside scheduler (or lock) code?  True for addresses within
 * the __sched text section or the lock functions.
 */
6804int in_sched_functions(unsigned long addr)
6805{
6806	return in_lock_functions(addr) ||
6807		(addr >= (unsigned long)__sched_text_start
6808		&& addr < (unsigned long)__sched_text_end);
6809}
6810
6811#ifdef CONFIG_CGROUP_SCHED
/* Root of the task-group hierarchy; sched_init() links children under it. */
6812struct task_group root_task_group;
/* All task groups, linked via task_group.list (serialized elsewhere). */
6813LIST_HEAD(task_groups);
6814#endif
6815
/* Per-cpu scratch cpumask for load balancing (defined in fair.c). */
6816DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6817
/*
 * sched_init - core scheduler boot-time initialization.
 *
 * Carves up a single kzalloc'd slab for the root task group's per-cpu
 * entity/runqueue pointer arrays (and off-stack cpumasks), initializes
 * every cpu's runqueue, and turns the booting thread into the idle task.
 * Runs before interrupts/allocators are fully up, hence GFP_NOWAIT.
 */
6818void __init sched_init(void)
6819{
6820	int i, j;
6821	unsigned long alloc_size = 0, ptr;
6822
6823#ifdef CONFIG_FAIR_GROUP_SCHED
6824	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6825#endif
6826#ifdef CONFIG_RT_GROUP_SCHED
6827	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6828#endif
6829#ifdef CONFIG_CPUMASK_OFFSTACK
6830	alloc_size += num_possible_cpus() * cpumask_size();
6831#endif
6832	if (alloc_size) {
		/* NOTE(review): result unchecked -- boot-time alloc assumed good */
6833		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6834
6835#ifdef CONFIG_FAIR_GROUP_SCHED
6836		root_task_group.se = (struct sched_entity **)ptr;
6837		ptr += nr_cpu_ids * sizeof(void **);
6838
6839		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6840		ptr += nr_cpu_ids * sizeof(void **);
6841
6842#endif /* CONFIG_FAIR_GROUP_SCHED */
6843#ifdef CONFIG_RT_GROUP_SCHED
6844		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6845		ptr += nr_cpu_ids * sizeof(void **);
6846
6847		root_task_group.rt_rq = (struct rt_rq **)ptr;
6848		ptr += nr_cpu_ids * sizeof(void **);
6849
6850#endif /* CONFIG_RT_GROUP_SCHED */
6851#ifdef CONFIG_CPUMASK_OFFSTACK
6852		for_each_possible_cpu(i) {
6853			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6854			ptr += cpumask_size();
6855		}
6856#endif /* CONFIG_CPUMASK_OFFSTACK */
6857	}
6858
6859#ifdef CONFIG_SMP
6860	init_defrootdomain();
6861#endif
6862
6863	init_rt_bandwidth(&def_rt_bandwidth,
6864			global_rt_period(), global_rt_runtime());
6865
6866#ifdef CONFIG_RT_GROUP_SCHED
6867	init_rt_bandwidth(&root_task_group.rt_bandwidth,
6868			global_rt_period(), global_rt_runtime());
6869#endif /* CONFIG_RT_GROUP_SCHED */
6870
6871#ifdef CONFIG_CGROUP_SCHED
6872	list_add(&root_task_group.list, &task_groups);
6873	INIT_LIST_HEAD(&root_task_group.children);
6874	INIT_LIST_HEAD(&root_task_group.siblings);
6875	autogroup_init(&init_task);
6876
6877#endif /* CONFIG_CGROUP_SCHED */
6878
6879#ifdef CONFIG_CGROUP_CPUACCT
6880	root_cpuacct.cpustat = &kernel_cpustat;
6881	root_cpuacct.cpuusage = alloc_percpu(u64);
6882	/* Too early, not expected to fail */
6883	BUG_ON(!root_cpuacct.cpuusage);
6884#endif
	/* Per-cpu runqueue setup. */
6885	for_each_possible_cpu(i) {
6886		struct rq *rq;
6887
6888		rq = cpu_rq(i);
6889		raw_spin_lock_init(&rq->lock);
6890		rq->nr_running = 0;
6891		rq->calc_load_active = 0;
6892		rq->calc_load_update = jiffies + LOAD_FREQ;
6893		init_cfs_rq(&rq->cfs);
6894		init_rt_rq(&rq->rt, rq);
6895#ifdef CONFIG_FAIR_GROUP_SCHED
6896		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6897		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6898		/*
6899		 * How much cpu bandwidth does root_task_group get?
6900		 *
6901		 * In case of task-groups formed thr' the cgroup filesystem, it
6902		 * gets 100% of the cpu resources in the system. This overall
6903		 * system cpu resource is divided among the tasks of
6904		 * root_task_group and its child task-groups in a fair manner,
6905		 * based on each entity's (task or task-group's) weight
6906		 * (se->load.weight).
6907		 *
6908		 * In other words, if root_task_group has 10 tasks of weight
6909		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
6910		 * then A0's share of the cpu resource is:
6911		 *
6912		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6913		 *
6914		 * We achieve this by letting root_task_group's tasks sit
6915		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6916		 */
6917		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6918		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6919#endif /* CONFIG_FAIR_GROUP_SCHED */
6920
6921		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6922#ifdef CONFIG_RT_GROUP_SCHED
6923		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6924		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6925#endif
6926
6927		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6928			rq->cpu_load[j] = 0;
6929
6930		rq->last_load_update_tick = jiffies;
6931
6932#ifdef CONFIG_SMP
6933		rq->sd = NULL;
6934		rq->rd = NULL;
6935		rq->cpu_power = SCHED_POWER_SCALE;
6936		rq->post_schedule = 0;
6937		rq->active_balance = 0;
6938		rq->next_balance = jiffies;
6939		rq->push_cpu = 0;
6940		rq->cpu = i;
6941		rq->online = 0;
6942		rq->idle_stamp = 0;
6943		rq->avg_idle = 2*sysctl_sched_migration_cost;
6944
6945		INIT_LIST_HEAD(&rq->cfs_tasks);
6946
6947		rq_attach_root(rq, &def_root_domain);
6948#ifdef CONFIG_NO_HZ
6949		rq->nohz_flags = 0;
6950#endif
6951#endif
6952		init_rq_hrtick(rq);
6953		atomic_set(&rq->nr_iowait, 0);
6954	}
6955
6956	set_load_weight(&init_task);
6957
6958#ifdef CONFIG_PREEMPT_NOTIFIERS
6959	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6960#endif
6961
6962#ifdef CONFIG_RT_MUTEXES
6963	plist_head_init(&init_task.pi_waiters);
6964#endif
6965
6966	/*
6967	 * The boot idle thread does lazy MMU switching as well:
6968	 */
6969	atomic_inc(&init_mm.mm_count);
6970	enter_lazy_tlb(&init_mm, current);
6971
6972	/*
6973	 * Make us the idle thread. Technically, schedule() should not be
6974	 * called from this thread, however somewhere below it might be,
6975	 * but because we are the idle thread, we just pick up running again
6976	 * when this runqueue becomes "idle".
6977	 */
6978	init_idle(current, smp_processor_id());
6979
6980	calc_load_update = jiffies + LOAD_FREQ;
6981
6982	/*
6983	 * During early bootup we pretend to be a normal task:
6984	 */
6985	current->sched_class = &fair_sched_class;
6986
6987#ifdef CONFIG_SMP
6988	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6989	/* May be allocated at isolcpus cmdline parse time */
6990	if (cpu_isolated_map == NULL)
6991		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6992	idle_thread_set_boot_cpu();
6993#endif
6994	init_sched_fair_class();
6995
6996	scheduler_running = 1;
6997}
6998
6999#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7000static inline int preempt_count_equals(int preempt_offset)
7001{
7002 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7003
7004 return (nested == preempt_offset);
7005}
7006
/*
 * __might_sleep - report a sleeping call from an invalid (atomic) context.
 * @file/@line: the call site, for the report.
 * @preempt_offset: expected preempt nesting at the call site.
 *
 * No-op while still booting, during an oops, or when the context is in
 * fact valid; otherwise prints a rate-limited (once per HZ) report with
 * held locks and a stack trace.
 */
7007void __might_sleep(const char *file, int line, int preempt_offset)
7008{
7009	static unsigned long prev_jiffy;	/* ratelimiting */
7010
7011	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7012	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7013	    system_state != SYSTEM_RUNNING || oops_in_progress)
7014		return;
7015	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7016		return;
7017	prev_jiffy = jiffies;
7018
7019	printk(KERN_ERR
7020		"BUG: sleeping function called from invalid context at %s:%d\n",
7021			file, line);
7022	printk(KERN_ERR
7023		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7024			in_atomic(), irqs_disabled(),
7025			current->pid, current->comm);
7026
7027	debug_show_held_locks(current);
7028	if (irqs_disabled())
7029		print_irqtrace_events(current);
7030	dump_stack();
7031}
7032EXPORT_SYMBOL(__might_sleep);
7033#endif
7034
7035#ifdef CONFIG_MAGIC_SYSRQ
/*
 * normalize_task - force @p back to SCHED_NORMAL policy.
 *
 * Dequeues @p if it is runnable, applies the new policy, then re-enqueues
 * and kicks the current task on @rq for a reschedule.  Caller holds the
 * relevant locks (see normalize_rt_tasks()).
 */
7036static void normalize_task(struct rq *rq, struct task_struct *p)
7037{
7038	const struct sched_class *prev_class = p->sched_class;
7039	int old_prio = p->prio;
7040	int on_rq;
7041
7042	on_rq = p->on_rq;
7043	if (on_rq)
7044		dequeue_task(rq, p, 0);
7045	__setscheduler(rq, p, SCHED_NORMAL, 0);
7046	if (on_rq) {
7047		enqueue_task(rq, p, 0);
7048		resched_task(rq->curr);
7049	}
7050
	/* fix up prio/class bookkeeping after the policy switch */
7051	check_class_changed(rq, p, prev_class, old_prio);
7052}
7053
/*
 * normalize_rt_tasks - SysRq helper: push every user RT task back to
 * SCHED_NORMAL and reset negative nice values to 0.  Also clears
 * per-task schedstats.  Walks the whole task list under tasklist_lock.
 */
7054void normalize_rt_tasks(void)
7055{
7056	struct task_struct *g, *p;
7057	unsigned long flags;
7058	struct rq *rq;
7059
7060	read_lock_irqsave(&tasklist_lock, flags);
7061	do_each_thread(g, p) {
7062		/*
7063		 * Only normalize user tasks:
7064		 */
7065		if (!p->mm)
7066			continue;
7067
7068		p->se.exec_start = 0;
7069#ifdef CONFIG_SCHEDSTATS
7070		p->se.statistics.wait_start = 0;
7071		p->se.statistics.sleep_start = 0;
7072		p->se.statistics.block_start = 0;
7073#endif
7074
7075		if (!rt_task(p)) {
7076			/*
7077			 * Renice negative nice level userspace
7078			 * tasks back to 0:
7079			 */
7080			if (TASK_NICE(p) < 0 && p->mm)
7081				set_user_nice(p, 0);
7082			continue;
7083		}
7084
		/* pi_lock then rq lock: the usual ordering for changing a task */
7085		raw_spin_lock(&p->pi_lock);
7086		rq = __task_rq_lock(p);
7087
7088		normalize_task(rq, p);
7089
7090		__task_rq_unlock(rq);
7091		raw_spin_unlock(&p->pi_lock);
7092	} while_each_thread(g, p);
7093
7094	read_unlock_irqrestore(&tasklist_lock, flags);
7095}
7096
7097#endif /* CONFIG_MAGIC_SYSRQ */
7098
7099#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7100/*
7101 * These functions are only useful for the IA64 MCA handling, or kdb.
7102 *
7103 * They can only be called when the whole system has been
7104 * stopped - every CPU needs to be quiescent, and no scheduling
7105 * activity can take place. Using them for anything else would
7106 * be a serious bug, and as a result, they aren't even visible
7107 * under any other configuration.
7108 */
7109
7110/**
7111 * curr_task - return the current task for a given cpu.
7112 * @cpu: the processor in question.
7113 *
7114 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7115 *
7116 * Return: the task currently running on @cpu.
7117 */
7118struct task_struct *curr_task(int cpu)
7119{
7120	return cpu_curr(cpu);
7121}
7120
7121#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7122
7123#ifdef CONFIG_IA64
7124/**
7125 * set_curr_task - set the current task for a given cpu.
7126 * @cpu: the processor in question.
7127 * @p: the task pointer to set.
7128 *
7129 * Description: This function must only be used when non-maskable interrupts
7130 * are serviced on a separate stack. It allows the architecture to switch the
7131 * notion of the current task on a cpu in a non-blocking manner. This function
7132 * must be called with all CPU's synchronized, and interrupts disabled, and
7133 * the caller must save the original value of the current task (see
7134 * curr_task() above) and restore that value before reenabling interrupts and
7135 * re-starting the system.
7136 *
7137 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7138 */
7139void set_curr_task(int cpu, struct task_struct *p)
7140{
7141	cpu_curr(cpu) = p;
7142}
7143
7144#endif
7145
7146#ifdef CONFIG_CGROUP_SCHED
7147/* task_group_lock serializes the addition/removal of task groups */
7148static DEFINE_SPINLOCK(task_group_lock);
7149
/* Release all scheduler state owned by @tg, then the group itself. */
7150static void free_sched_group(struct task_group *tg)
7151{
7152	free_fair_sched_group(tg);
7153	free_rt_sched_group(tg);
7154	autogroup_free(tg);
7155	kfree(tg);
7156}
7157
7158/* allocate runqueue etc for a new task group */
/*
 * sched_create_group - allocate a new task group under @parent.
 *
 * Returns the new group, or ERR_PTR(-ENOMEM) on allocation failure (the
 * err path frees any partially constructed state via free_sched_group()).
 */
7159struct task_group *sched_create_group(struct task_group *parent)
7160{
7161	struct task_group *tg;
7162	unsigned long flags;
7163
7164	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7165	if (!tg)
7166		return ERR_PTR(-ENOMEM);
7167
	/* NOTE(review): the alloc_* helpers appear to return 0 on failure -- verify */
7168	if (!alloc_fair_sched_group(tg, parent))
7169		goto err;
7170
7171	if (!alloc_rt_sched_group(tg, parent))
7172		goto err;
7173
7174	spin_lock_irqsave(&task_group_lock, flags);
7175	list_add_rcu(&tg->list, &task_groups);
7176
7177	WARN_ON(!parent); /* root should already exist */
7178
7179	tg->parent = parent;
7180	INIT_LIST_HEAD(&tg->children);
7181	list_add_rcu(&tg->siblings, &parent->children);
7182	spin_unlock_irqrestore(&task_group_lock, flags);
7183
7184	return tg;
7185
7186err:
7187	free_sched_group(tg);
7188	return ERR_PTR(-ENOMEM);
7189}
7190
7191/* rcu callback to free various structures associated with a task group */
7192static void free_sched_group_rcu(struct rcu_head *rhp)
7193{
7194	/* now it should be safe to free those cfs_rqs */
7195	free_sched_group(container_of(rhp, struct task_group, rcu));
7196}
7197
7198/* Destroy runqueue etc associated with a task group */
/*
 * Unlinks @tg from the group lists under task_group_lock and defers the
 * actual freeing through RCU so concurrent readers can finish.
 */
7199void sched_destroy_group(struct task_group *tg)
7200{
7201	unsigned long flags;
7202	int i;
7203
7204	/* end participation in shares distribution */
7205	for_each_possible_cpu(i)
7206		unregister_fair_sched_group(tg, i);
7207
7208	spin_lock_irqsave(&task_group_lock, flags);
7209	list_del_rcu(&tg->list);
7210	list_del_rcu(&tg->siblings);
7211	spin_unlock_irqrestore(&task_group_lock, flags);
7212
7213	/* wait for possible concurrent references to cfs_rqs complete */
7214	call_rcu(&tg->rcu, free_sched_group_rcu);
7215}
7216
7217/* change task's runqueue when it moves between groups.
7218 *	The caller of this function should have put the task in its new group
7219 *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7220 *	reflect its new group.
7221 */
7222void sched_move_task(struct task_struct *tsk)
7223{
7224	struct task_group *tg;
7225	int on_rq, running;
7226	unsigned long flags;
7227	struct rq *rq;
7228
7229	rq = task_rq_lock(tsk, &flags);
7230
7231	running = task_current(rq, tsk);
7232	on_rq = tsk->on_rq;
7233
	/* take the task off the rq (and off-cpu) while it is re-homed */
7234	if (on_rq)
7235		dequeue_task(rq, tsk, 0);
7236	if (unlikely(running))
7237		tsk->sched_class->put_prev_task(rq, tsk);
7238
7239	tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7240				lockdep_is_held(&tsk->sighand->siglock)),
7241			  struct task_group, css);
7242	tg = autogroup_task_group(tsk, tg);
7243	tsk->sched_task_group = tg;
7244
7245#ifdef CONFIG_FAIR_GROUP_SCHED
7246	if (tsk->sched_class->task_move_group)
7247		tsk->sched_class->task_move_group(tsk, on_rq);
7248	else
7249#endif
7250		set_task_rq(tsk, task_cpu(tsk));
7251
	/* reverse of the dequeue/put above */
7252	if (unlikely(running))
7253		tsk->sched_class->set_curr_task(rq);
7254	if (on_rq)
7255		enqueue_task(rq, tsk, 0);
7256
7257	task_rq_unlock(rq, tsk, &flags);
7258}
7259#endif /* CONFIG_CGROUP_SCHED */
7260
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
/* Express runtime/period as a fixed-point (20-bit fraction) utilization */
static unsigned long to_ratio(u64 period, u64 runtime)
{
	/* unlimited runtime maps to a full 1.0 ratio */
	if (runtime == RUNTIME_INF)
		return 1ULL << 20;

	return div64_u64(runtime << 20, period);
}
#endif
7270
7271#ifdef CONFIG_RT_GROUP_SCHED
7272/*
7273 * Ensure that the real time constraints are schedulable.
7274 */
7275static DEFINE_MUTEX(rt_constraints_mutex);
7276
7277/* Must be called with tasklist_lock held */
7278static inline int tg_has_rt_tasks(struct task_group *tg)
7279{
7280 struct task_struct *g, *p;
7281
7282 do_each_thread(g, p) {
7283 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7284 return 1;
7285 } while_each_thread(g, p);
7286
7287 return 0;
7288}
7289
7290struct rt_schedulable_data {
7291 struct task_group *tg;
7292 u64 rt_period;
7293 u64 rt_runtime;
7294};
7295
7296static int tg_rt_schedulable(struct task_group *tg, void *data)
7297{
7298 struct rt_schedulable_data *d = data;
7299 struct task_group *child;
7300 unsigned long total, sum = 0;
7301 u64 period, runtime;
7302
7303 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7304 runtime = tg->rt_bandwidth.rt_runtime;
7305
7306 if (tg == d->tg) {
7307 period = d->rt_period;
7308 runtime = d->rt_runtime;
7309 }
7310
7311 /*
7312 * Cannot have more runtime than the period.
7313 */
7314 if (runtime > period && runtime != RUNTIME_INF)
7315 return -EINVAL;
7316
7317 /*
7318 * Ensure we don't starve existing RT tasks.
7319 */
7320 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7321 return -EBUSY;
7322
7323 total = to_ratio(period, runtime);
7324
7325 /*
7326 * Nobody can have more than the global setting allows.
7327 */
7328 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7329 return -EINVAL;
7330
7331 /*
7332 * The sum of our children's runtime should not exceed our own.
7333 */
7334 list_for_each_entry_rcu(child, &tg->children, siblings) {
7335 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7336 runtime = child->rt_bandwidth.rt_runtime;
7337
7338 if (child == d->tg) {
7339 period = d->rt_period;
7340 runtime = d->rt_runtime;
7341 }
7342
7343 sum += to_ratio(period, runtime);
7344 }
7345
7346 if (sum > total)
7347 return -EINVAL;
7348
7349 return 0;
7350}
7351
7352static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7353{
7354 int ret;
7355
7356 struct rt_schedulable_data data = {
7357 .tg = tg,
7358 .rt_period = period,
7359 .rt_runtime = runtime,
7360 };
7361
7362 rcu_read_lock();
7363 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7364 rcu_read_unlock();
7365
7366 return ret;
7367}
7368
7369static int tg_set_rt_bandwidth(struct task_group *tg,
7370 u64 rt_period, u64 rt_runtime)
7371{
7372 int i, err = 0;
7373
7374 mutex_lock(&rt_constraints_mutex);
7375 read_lock(&tasklist_lock);
7376 err = __rt_schedulable(tg, rt_period, rt_runtime);
7377 if (err)
7378 goto unlock;
7379
7380 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7381 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7382 tg->rt_bandwidth.rt_runtime = rt_runtime;
7383
7384 for_each_possible_cpu(i) {
7385 struct rt_rq *rt_rq = tg->rt_rq[i];
7386
7387 raw_spin_lock(&rt_rq->rt_runtime_lock);
7388 rt_rq->rt_runtime = rt_runtime;
7389 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7390 }
7391 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7392unlock:
7393 read_unlock(&tasklist_lock);
7394 mutex_unlock(&rt_constraints_mutex);
7395
7396 return err;
7397}
7398
7399int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7400{
7401 u64 rt_runtime, rt_period;
7402
7403 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7404 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7405 if (rt_runtime_us < 0)
7406 rt_runtime = RUNTIME_INF;
7407
7408 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7409}
7410
7411long sched_group_rt_runtime(struct task_group *tg)
7412{
7413 u64 rt_runtime_us;
7414
7415 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7416 return -1;
7417
7418 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7419 do_div(rt_runtime_us, NSEC_PER_USEC);
7420 return rt_runtime_us;
7421}
7422
7423int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7424{
7425 u64 rt_runtime, rt_period;
7426
7427 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7428 rt_runtime = tg->rt_bandwidth.rt_runtime;
7429
7430 if (rt_period == 0)
7431 return -EINVAL;
7432
7433 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7434}
7435
7436long sched_group_rt_period(struct task_group *tg)
7437{
7438 u64 rt_period_us;
7439
7440 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7441 do_div(rt_period_us, NSEC_PER_USEC);
7442 return rt_period_us;
7443}
7444
7445static int sched_rt_global_constraints(void)
7446{
7447 u64 runtime, period;
7448 int ret = 0;
7449
7450 if (sysctl_sched_rt_period <= 0)
7451 return -EINVAL;
7452
7453 runtime = global_rt_runtime();
7454 period = global_rt_period();
7455
7456 /*
7457 * Sanity check on the sysctl variables.
7458 */
7459 if (runtime > period && runtime != RUNTIME_INF)
7460 return -EINVAL;
7461
7462 mutex_lock(&rt_constraints_mutex);
7463 read_lock(&tasklist_lock);
7464 ret = __rt_schedulable(NULL, 0, 0);
7465 read_unlock(&tasklist_lock);
7466 mutex_unlock(&rt_constraints_mutex);
7467
7468 return ret;
7469}
7470
7471int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7472{
7473 /* Don't accept realtime tasks when there is no way for them to run */
7474 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7475 return 0;
7476
7477 return 1;
7478}
7479
7480#else /* !CONFIG_RT_GROUP_SCHED */
7481static int sched_rt_global_constraints(void)
7482{
7483 unsigned long flags;
7484 int i;
7485
7486 if (sysctl_sched_rt_period <= 0)
7487 return -EINVAL;
7488
7489 /*
7490 * There's always some RT tasks in the root group
7491 * -- migration, kstopmachine etc..
7492 */
7493 if (sysctl_sched_rt_runtime == 0)
7494 return -EBUSY;
7495
7496 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7497 for_each_possible_cpu(i) {
7498 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7499
7500 raw_spin_lock(&rt_rq->rt_runtime_lock);
7501 rt_rq->rt_runtime = global_rt_runtime();
7502 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7503 }
7504 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7505
7506 return 0;
7507}
7508#endif /* CONFIG_RT_GROUP_SCHED */
7509
7510int sched_rt_handler(struct ctl_table *table, int write,
7511 void __user *buffer, size_t *lenp,
7512 loff_t *ppos)
7513{
7514 int ret;
7515 int old_period, old_runtime;
7516 static DEFINE_MUTEX(mutex);
7517
7518 mutex_lock(&mutex);
7519 old_period = sysctl_sched_rt_period;
7520 old_runtime = sysctl_sched_rt_runtime;
7521
7522 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7523
7524 if (!ret && write) {
7525 ret = sched_rt_global_constraints();
7526 if (ret) {
7527 sysctl_sched_rt_period = old_period;
7528 sysctl_sched_rt_runtime = old_runtime;
7529 } else {
7530 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7531 def_rt_bandwidth.rt_period =
7532 ns_to_ktime(global_rt_period());
7533 }
7534 }
7535 mutex_unlock(&mutex);
7536
7537 return ret;
7538}
7539
7540#ifdef CONFIG_CGROUP_SCHED
7541
7542/* return corresponding task_group object of a cgroup */
7543static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7544{
7545 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7546 struct task_group, css);
7547}
7548
7549static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7550{
7551 struct task_group *tg, *parent;
7552
7553 if (!cgrp->parent) {
7554 /* This is early initialization for the top cgroup */
7555 return &root_task_group.css;
7556 }
7557
7558 parent = cgroup_tg(cgrp->parent);
7559 tg = sched_create_group(parent);
7560 if (IS_ERR(tg))
7561 return ERR_PTR(-ENOMEM);
7562
7563 return &tg->css;
7564}
7565
7566static void cpu_cgroup_css_free(struct cgroup *cgrp)
7567{
7568 struct task_group *tg = cgroup_tg(cgrp);
7569
7570 sched_destroy_group(tg);
7571}
7572
7573static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7574 struct cgroup_taskset *tset)
7575{
7576 struct task_struct *task;
7577
7578 cgroup_taskset_for_each(task, cgrp, tset) {
7579#ifdef CONFIG_RT_GROUP_SCHED
7580 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7581 return -EINVAL;
7582#else
7583 /* We don't support RT-tasks being in separate groups */
7584 if (task->sched_class != &fair_sched_class)
7585 return -EINVAL;
7586#endif
7587 }
7588 return 0;
7589}
7590
7591static void cpu_cgroup_attach(struct cgroup *cgrp,
7592 struct cgroup_taskset *tset)
7593{
7594 struct task_struct *task;
7595
7596 cgroup_taskset_for_each(task, cgrp, tset)
7597 sched_move_task(task);
7598}
7599
7600static void
7601cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7602 struct task_struct *task)
7603{
7604 /*
7605 * cgroup_exit() is called in the copy_process() failure path.
7606 * Ignore this case since the task hasn't ran yet, this avoids
7607 * trying to poke a half freed task state from generic code.
7608 */
7609 if (!(task->flags & PF_EXITING))
7610 return;
7611
7612 sched_move_task(task);
7613}
7614
7615#ifdef CONFIG_FAIR_GROUP_SCHED
7616static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7617 u64 shareval)
7618{
7619 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7620}
7621
7622static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7623{
7624 struct task_group *tg = cgroup_tg(cgrp);
7625
7626 return (u64) scale_load_down(tg->shares);
7627}
7628
7629#ifdef CONFIG_CFS_BANDWIDTH
7630static DEFINE_MUTEX(cfs_constraints_mutex);
7631
7632const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7633const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7634
7635static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7636
7637static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7638{
7639 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7640 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7641
7642 if (tg == &root_task_group)
7643 return -EINVAL;
7644
7645 /*
7646 * Ensure we have at some amount of bandwidth every period. This is
7647 * to prevent reaching a state of large arrears when throttled via
7648 * entity_tick() resulting in prolonged exit starvation.
7649 */
7650 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7651 return -EINVAL;
7652
7653 /*
7654 * Likewise, bound things on the otherside by preventing insane quota
7655 * periods. This also allows us to normalize in computing quota
7656 * feasibility.
7657 */
7658 if (period > max_cfs_quota_period)
7659 return -EINVAL;
7660
7661 mutex_lock(&cfs_constraints_mutex);
7662 ret = __cfs_schedulable(tg, period, quota);
7663 if (ret)
7664 goto out_unlock;
7665
7666 runtime_enabled = quota != RUNTIME_INF;
7667 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7668 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7669 raw_spin_lock_irq(&cfs_b->lock);
7670 cfs_b->period = ns_to_ktime(period);
7671 cfs_b->quota = quota;
7672
7673 __refill_cfs_bandwidth_runtime(cfs_b);
7674 /* restart the period timer (if active) to handle new period expiry */
7675 if (runtime_enabled && cfs_b->timer_active) {
7676 /* force a reprogram */
7677 cfs_b->timer_active = 0;
7678 __start_cfs_bandwidth(cfs_b);
7679 }
7680 raw_spin_unlock_irq(&cfs_b->lock);
7681
7682 for_each_possible_cpu(i) {
7683 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7684 struct rq *rq = cfs_rq->rq;
7685
7686 raw_spin_lock_irq(&rq->lock);
7687 cfs_rq->runtime_enabled = runtime_enabled;
7688 cfs_rq->runtime_remaining = 0;
7689
7690 if (cfs_rq->throttled)
7691 unthrottle_cfs_rq(cfs_rq);
7692 raw_spin_unlock_irq(&rq->lock);
7693 }
7694out_unlock:
7695 mutex_unlock(&cfs_constraints_mutex);
7696
7697 return ret;
7698}
7699
7700int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7701{
7702 u64 quota, period;
7703
7704 period = ktime_to_ns(tg->cfs_bandwidth.period);
7705 if (cfs_quota_us < 0)
7706 quota = RUNTIME_INF;
7707 else
7708 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7709
7710 return tg_set_cfs_bandwidth(tg, period, quota);
7711}
7712
7713long tg_get_cfs_quota(struct task_group *tg)
7714{
7715 u64 quota_us;
7716
7717 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7718 return -1;
7719
7720 quota_us = tg->cfs_bandwidth.quota;
7721 do_div(quota_us, NSEC_PER_USEC);
7722
7723 return quota_us;
7724}
7725
7726int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7727{
7728 u64 quota, period;
7729
7730 period = (u64)cfs_period_us * NSEC_PER_USEC;
7731 quota = tg->cfs_bandwidth.quota;
7732
7733 return tg_set_cfs_bandwidth(tg, period, quota);
7734}
7735
7736long tg_get_cfs_period(struct task_group *tg)
7737{
7738 u64 cfs_period_us;
7739
7740 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7741 do_div(cfs_period_us, NSEC_PER_USEC);
7742
7743 return cfs_period_us;
7744}
7745
7746static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7747{
7748 return tg_get_cfs_quota(cgroup_tg(cgrp));
7749}
7750
7751static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7752 s64 cfs_quota_us)
7753{
7754 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7755}
7756
7757static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7758{
7759 return tg_get_cfs_period(cgroup_tg(cgrp));
7760}
7761
7762static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7763 u64 cfs_period_us)
7764{
7765 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7766}
7767
7768struct cfs_schedulable_data {
7769 struct task_group *tg;
7770 u64 period, quota;
7771};
7772
7773/*
7774 * normalize group quota/period to be quota/max_period
7775 * note: units are usecs
7776 */
7777static u64 normalize_cfs_quota(struct task_group *tg,
7778 struct cfs_schedulable_data *d)
7779{
7780 u64 quota, period;
7781
7782 if (tg == d->tg) {
7783 period = d->period;
7784 quota = d->quota;
7785 } else {
7786 period = tg_get_cfs_period(tg);
7787 quota = tg_get_cfs_quota(tg);
7788 }
7789
7790 /* note: these should typically be equivalent */
7791 if (quota == RUNTIME_INF || quota == -1)
7792 return RUNTIME_INF;
7793
7794 return to_ratio(period, quota);
7795}
7796
7797static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7798{
7799 struct cfs_schedulable_data *d = data;
7800 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7801 s64 quota = 0, parent_quota = -1;
7802
7803 if (!tg->parent) {
7804 quota = RUNTIME_INF;
7805 } else {
7806 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7807
7808 quota = normalize_cfs_quota(tg, d);
7809 parent_quota = parent_b->hierarchal_quota;
7810
7811 /*
7812 * ensure max(child_quota) <= parent_quota, inherit when no
7813 * limit is set
7814 */
7815 if (quota == RUNTIME_INF)
7816 quota = parent_quota;
7817 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7818 return -EINVAL;
7819 }
7820 cfs_b->hierarchal_quota = quota;
7821
7822 return 0;
7823}
7824
7825static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7826{
7827 int ret;
7828 struct cfs_schedulable_data data = {
7829 .tg = tg,
7830 .period = period,
7831 .quota = quota,
7832 };
7833
7834 if (quota != RUNTIME_INF) {
7835 do_div(data.period, NSEC_PER_USEC);
7836 do_div(data.quota, NSEC_PER_USEC);
7837 }
7838
7839 rcu_read_lock();
7840 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7841 rcu_read_unlock();
7842
7843 return ret;
7844}
7845
7846static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7847 struct cgroup_map_cb *cb)
7848{
7849 struct task_group *tg = cgroup_tg(cgrp);
7850 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7851
7852 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7853 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7854 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7855
7856 return 0;
7857}
7858#endif /* CONFIG_CFS_BANDWIDTH */
7859#endif /* CONFIG_FAIR_GROUP_SCHED */
7860
#ifdef CONFIG_RT_GROUP_SCHED
/* cgroup file glue for "cpu.rt_runtime_us" / "cpu.rt_period_us" */
static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
		s64 val)
{
	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
}

static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
{
	return sched_group_rt_runtime(cgroup_tg(cgrp));
}

static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
		u64 rt_period_us)
{
	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
}

static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
	return sched_group_rt_period(cgroup_tg(cgrp));
}
#endif /* CONFIG_RT_GROUP_SCHED */
7884
7885static struct cftype cpu_files[] = {
7886#ifdef CONFIG_FAIR_GROUP_SCHED
7887 {
7888 .name = "shares",
7889 .read_u64 = cpu_shares_read_u64,
7890 .write_u64 = cpu_shares_write_u64,
7891 },
7892#endif
7893#ifdef CONFIG_CFS_BANDWIDTH
7894 {
7895 .name = "cfs_quota_us",
7896 .read_s64 = cpu_cfs_quota_read_s64,
7897 .write_s64 = cpu_cfs_quota_write_s64,
7898 },
7899 {
7900 .name = "cfs_period_us",
7901 .read_u64 = cpu_cfs_period_read_u64,
7902 .write_u64 = cpu_cfs_period_write_u64,
7903 },
7904 {
7905 .name = "stat",
7906 .read_map = cpu_stats_show,
7907 },
7908#endif
7909#ifdef CONFIG_RT_GROUP_SCHED
7910 {
7911 .name = "rt_runtime_us",
7912 .read_s64 = cpu_rt_runtime_read,
7913 .write_s64 = cpu_rt_runtime_write,
7914 },
7915 {
7916 .name = "rt_period_us",
7917 .read_u64 = cpu_rt_period_read_uint,
7918 .write_u64 = cpu_rt_period_write_uint,
7919 },
7920#endif
7921 { } /* terminate */
7922};
7923
7924struct cgroup_subsys cpu_cgroup_subsys = {
7925 .name = "cpu",
7926 .css_alloc = cpu_cgroup_css_alloc,
7927 .css_free = cpu_cgroup_css_free,
7928 .can_attach = cpu_cgroup_can_attach,
7929 .attach = cpu_cgroup_attach,
7930 .exit = cpu_cgroup_exit,
7931 .subsys_id = cpu_cgroup_subsys_id,
7932 .base_cftypes = cpu_files,
7933 .early_init = 1,
7934};
7935
7936#endif /* CONFIG_CGROUP_SCHED */
7937
7938#ifdef CONFIG_CGROUP_CPUACCT
7939
7940/*
7941 * CPU accounting code for task groups.
7942 *
7943 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7944 * (balbir@in.ibm.com).
7945 */
7946
7947struct cpuacct root_cpuacct;
7948
7949/* create a new cpu accounting group */
7950static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7951{
7952 struct cpuacct *ca;
7953
7954 if (!cgrp->parent)
7955 return &root_cpuacct.css;
7956
7957 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7958 if (!ca)
7959 goto out;
7960
7961 ca->cpuusage = alloc_percpu(u64);
7962 if (!ca->cpuusage)
7963 goto out_free_ca;
7964
7965 ca->cpustat = alloc_percpu(struct kernel_cpustat);
7966 if (!ca->cpustat)
7967 goto out_free_cpuusage;
7968
7969 return &ca->css;
7970
7971out_free_cpuusage:
7972 free_percpu(ca->cpuusage);
7973out_free_ca:
7974 kfree(ca);
7975out:
7976 return ERR_PTR(-ENOMEM);
7977}
7978
7979/* destroy an existing cpu accounting group */
7980static void cpuacct_css_free(struct cgroup *cgrp)
7981{
7982 struct cpuacct *ca = cgroup_ca(cgrp);
7983
7984 free_percpu(ca->cpustat);
7985 free_percpu(ca->cpuusage);
7986 kfree(ca);
7987}
7988
7989static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
7990{
7991 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7992 u64 data;
7993
7994#ifndef CONFIG_64BIT
7995 /*
7996 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
7997 */
7998 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7999 data = *cpuusage;
8000 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8001#else
8002 data = *cpuusage;
8003#endif
8004
8005 return data;
8006}
8007
8008static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8009{
8010 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8011
8012#ifndef CONFIG_64BIT
8013 /*
8014 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8015 */
8016 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8017 *cpuusage = val;
8018 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8019#else
8020 *cpuusage = val;
8021#endif
8022}
8023
8024/* return total cpu usage (in nanoseconds) of a group */
8025static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8026{
8027 struct cpuacct *ca = cgroup_ca(cgrp);
8028 u64 totalcpuusage = 0;
8029 int i;
8030
8031 for_each_present_cpu(i)
8032 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8033
8034 return totalcpuusage;
8035}
8036
8037static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8038 u64 reset)
8039{
8040 struct cpuacct *ca = cgroup_ca(cgrp);
8041 int err = 0;
8042 int i;
8043
8044 if (reset) {
8045 err = -EINVAL;
8046 goto out;
8047 }
8048
8049 for_each_present_cpu(i)
8050 cpuacct_cpuusage_write(ca, i, 0);
8051
8052out:
8053 return err;
8054}
8055
8056static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8057 struct seq_file *m)
8058{
8059 struct cpuacct *ca = cgroup_ca(cgroup);
8060 u64 percpu;
8061 int i;
8062
8063 for_each_present_cpu(i) {
8064 percpu = cpuacct_cpuusage_read(ca, i);
8065 seq_printf(m, "%llu ", (unsigned long long) percpu);
8066 }
8067 seq_printf(m, "\n");
8068 return 0;
8069}
8070
8071static const char *cpuacct_stat_desc[] = {
8072 [CPUACCT_STAT_USER] = "user",
8073 [CPUACCT_STAT_SYSTEM] = "system",
8074};
8075
8076static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8077 struct cgroup_map_cb *cb)
8078{
8079 struct cpuacct *ca = cgroup_ca(cgrp);
8080 int cpu;
8081 s64 val = 0;
8082
8083 for_each_online_cpu(cpu) {
8084 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8085 val += kcpustat->cpustat[CPUTIME_USER];
8086 val += kcpustat->cpustat[CPUTIME_NICE];
8087 }
8088 val = cputime64_to_clock_t(val);
8089 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8090
8091 val = 0;
8092 for_each_online_cpu(cpu) {
8093 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8094 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8095 val += kcpustat->cpustat[CPUTIME_IRQ];
8096 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8097 }
8098
8099 val = cputime64_to_clock_t(val);
8100 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8101
8102 return 0;
8103}
8104
8105static struct cftype files[] = {
8106 {
8107 .name = "usage",
8108 .read_u64 = cpuusage_read,
8109 .write_u64 = cpuusage_write,
8110 },
8111 {
8112 .name = "usage_percpu",
8113 .read_seq_string = cpuacct_percpu_seq_read,
8114 },
8115 {
8116 .name = "stat",
8117 .read_map = cpuacct_stats_show,
8118 },
8119 { } /* terminate */
8120};
8121
8122/*
8123 * charge this task's execution time to its accounting group.
8124 *
8125 * called with rq->lock held.
8126 */
8127void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8128{
8129 struct cpuacct *ca;
8130 int cpu;
8131
8132 if (unlikely(!cpuacct_subsys.active))
8133 return;
8134
8135 cpu = task_cpu(tsk);
8136
8137 rcu_read_lock();
8138
8139 ca = task_ca(tsk);
8140
8141 for (; ca; ca = parent_ca(ca)) {
8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8143 *cpuusage += cputime;
8144 }
8145
8146 rcu_read_unlock();
8147}
8148
8149struct cgroup_subsys cpuacct_subsys = {
8150 .name = "cpuacct",
8151 .css_alloc = cpuacct_css_alloc,
8152 .css_free = cpuacct_css_free,
8153 .subsys_id = cpuacct_subsys_id,
8154 .base_cftypes = files,
8155};
8156#endif /* CONFIG_CGROUP_CPUACCT */
8157
8158void dump_cpu_task(int cpu)
8159{
8160 pr_info("Task dump for CPU %d:\n", cpu);
8161 sched_show_task(cpu_curr(cpu));
8162}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
deleted file mode 100644
index 23aa789c53e..00000000000
--- a/kernel/sched/cpupri.c
+++ /dev/null
@@ -1,240 +0,0 @@
1/*
2 * kernel/sched/cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include <linux/gfp.h>
31#include "cpupri.h"
32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio)
35{
36 int cpupri;
37
38 if (prio == CPUPRI_INVALID)
39 cpupri = CPUPRI_INVALID;
40 else if (prio == MAX_PRIO)
41 cpupri = CPUPRI_IDLE;
42 else if (prio >= MAX_RT_PRIO)
43 cpupri = CPUPRI_NORMAL;
44 else
45 cpupri = MAX_RT_PRIO - prio + 1;
46
47 return cpupri;
48}
49
50/**
51 * cpupri_find - find the best (lowest-pri) CPU in the system
52 * @cp: The cpupri context
53 * @p: The task
54 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
55 *
56 * Note: This function returns the recommended CPUs as calculated during the
57 * current invocation. By the time the call returns, the CPUs may have in
58 * fact changed priorities any number of times. While not ideal, it is not
59 * an issue of correctness since the normal rebalancer logic will correct
60 * any discrepancies created by racing against the uncertainty of the current
61 * priority configuration.
62 *
63 * Returns: (int)bool - CPUs were found
64 */
65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask)
67{
68 int idx = 0;
69 int task_pri = convert_prio(p->prio);
70
71 if (task_pri >= MAX_RT_PRIO)
72 return 0;
73
74 for (idx = 0; idx < task_pri; idx++) {
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racey, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
103
104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
105 continue;
106
107 if (lowest_mask) {
108 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
109
110 /*
111 * We have to ensure that we have at least one bit
112 * still set in the array, since the map could have
113 * been concurrently emptied between the first and
114 * second reads of vec->mask. If we hit this
115 * condition, simply act as though we never hit this
116 * priority level and continue on.
117 */
118 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
119 continue;
120 }
121
122 return 1;
123 }
124
125 return 0;
126}
127
128/**
129 * cpupri_set - update the cpu priority setting
130 * @cp: The cpupri context
131 * @cpu: The target cpu
132 * @newpri: The priority (INVALID-RT99) to assign to this CPU
133 *
134 * Note: Assumes cpu_rq(cpu)->lock is locked
135 *
136 * Returns: (void)
137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{
140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri;
142 int do_mb = 0;
143
144 newpri = convert_prio(newpri);
145
146 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
147
148 if (newpri == oldpri)
149 return;
150
151 /*
152 * If the cpu was currently mapped to a different value, we
153 * need to map it to the new value then remove the old value.
154 * Note, we must add the new value first, otherwise we risk the
155 * cpu being missed by the priority loop in cpupri_find.
156 */
157 if (likely(newpri != CPUPRI_INVALID)) {
158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
159
160 cpumask_set_cpu(cpu, vec->mask);
161 /*
162 * When adding a new vector, we update the mask first,
163 * do a write memory barrier, and then update the count, to
164 * make sure the vector is visible when count is set.
165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
169 }
170 if (likely(oldpri != CPUPRI_INVALID)) {
171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
172
173 /*
174 * Because the order of modification of the vec->count
175 * is important, we must make sure that the update
176 * of the new prio is seen before we decrement the
177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
194 cpumask_clear_cpu(cpu, vec->mask);
195 }
196
197 *currpri = newpri;
198}
199
200/**
201 * cpupri_init - initialize the cpupri structure
202 * @cp: The cpupri context
203 *
204 * Returns: -ENOMEM if memory fails.
205 */
206int cpupri_init(struct cpupri *cp)
207{
208 int i;
209
210 memset(cp, 0, sizeof(*cp));
211
212 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
213 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
214
215 atomic_set(&vec->count, 0);
216 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
217 goto cleanup;
218 }
219
220 for_each_possible_cpu(i)
221 cp->cpu_to_pri[i] = CPUPRI_INVALID;
222 return 0;
223
224cleanup:
225 for (i--; i >= 0; i--)
226 free_cpumask_var(cp->pri_to_cpu[i].mask);
227 return -ENOMEM;
228}
229
230/**
231 * cpupri_cleanup - clean up the cpupri structure
232 * @cp: The cpupri context
233 */
234void cpupri_cleanup(struct cpupri *cp)
235{
236 int i;
237
238 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
239 free_cpumask_var(cp->pri_to_cpu[i].mask);
240}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
deleted file mode 100644
index f6d75617349..00000000000
--- a/kernel/sched/cpupri.h
+++ /dev/null
@@ -1,34 +0,0 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7
8#define CPUPRI_INVALID -1
9#define CPUPRI_IDLE 0
10#define CPUPRI_NORMAL 1
11/* values 2-101 are RT priorities 0-99 */
12
13struct cpupri_vec {
14 atomic_t count;
15 cpumask_var_t mask;
16};
17
18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS];
21};
22
23#ifdef CONFIG_SMP
24int cpupri_find(struct cpupri *cp,
25 struct task_struct *p, struct cpumask *lowest_mask);
26void cpupri_set(struct cpupri *cp, int cpu, int pri);
27int cpupri_init(struct cpupri *cp);
28void cpupri_cleanup(struct cpupri *cp);
29#else
30#define cpupri_set(cp, cpu, pri) do { } while (0)
31#define cpupri_init() do { } while (0)
32#endif
33
34#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
deleted file mode 100644
index 293b202fcf7..00000000000
--- a/kernel/sched/cputime.c
+++ /dev/null
@@ -1,589 +0,0 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in other CPU reading this CPU's irq time and can
17 * race with irq/vtime_account on this CPU. We would either get old
18 * or new value with a side effect of accounting a slice of irq time to wrong
19 * task when irq is in progress while we read rq->clock. That is a worthy
20 * compromise in place of having locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
/* Turn on tick-based irq time accounting (read in irqtime_account_irq()). */
void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}
32
/* Turn off tick-based irq time accounting. */
void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 *
 * Charges the time since the last call (tracked in the percpu
 * irq_start_time) to either cpu_hardirq_time or cpu_softirq_time,
 * depending on which context we are currently serving.
 */
void irqtime_account_irq(struct task_struct *curr)
{
	unsigned long flags;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	local_irq_save(flags);

	/* Time elapsed on this CPU since the previous irq entry/exit. */
	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
	__this_cpu_add(irq_start_time, delta);

	/* Seqcount write section so 32-bit readers see consistent values. */
	irq_time_write_begin();
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse scheduler with a special task
	 * that do not consume any time, but still wants to run.
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);

	irq_time_write_end();
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
77
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
/*
 * Add @tmp to cpustat slot @index for the root cgroup and for every
 * cpuacct cgroup @p belongs to (up to, but excluding, the root cpuacct).
 */
static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
#ifdef CONFIG_CGROUP_CPUACCT
	struct kernel_cpustat *kcpustat;
	struct cpuacct *ca;
#endif
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 *
	 */
	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;

#ifdef CONFIG_CGROUP_CPUACCT
	/* cpuacct subsystem not mounted/active: only the root stat matters. */
	if (unlikely(!cpuacct_subsys.active))
		return;

	/* Walk up the cpuacct hierarchy; root was already charged above. */
	rcu_read_lock();
	ca = task_ca(p);
	while (ca && (ca != &root_cpuacct)) {
		kcpustat = this_cpu_ptr(ca->cpustat);
		kcpustat->cpustat[index] += tmp;
		ca = parent_ca(ca);
	}
	rcu_read_unlock();
#endif
}
143
/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 *
 * Updates the task's utime/utimescaled, its thread-group timer base,
 * and the per-cpu cpustat (NICE bucket for positively-niced tasks).
 */
void account_user_time(struct task_struct *p, cputime_t cputime,
		       cputime_t cputime_scaled)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);

	/* Niced tasks are reported under CPUTIME_NICE instead of _USER. */
	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for user time used */
	acct_update_integrals(p);
}
168
/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 *
 * Guest time is charged twice: once as user time (utime) and once in the
 * dedicated gtime/GUEST cpustat buckets.
 */
static void account_guest_time(struct task_struct *p, cputime_t cputime,
			       cputime_t cputime_scaled)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (TASK_NICE(p) > 0) {
		cpustat[CPUTIME_NICE] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
	} else {
		cpustat[CPUTIME_USER] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
	}
}
195
/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * @index: cpustat field that has to be updated
 *        (CPUTIME_SYSTEM, CPUTIME_IRQ or CPUTIME_SOFTIRQ)
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
			cputime_t cputime_scaled, int index)
{
	/* Add system time to process. */
	p->stime += cputime;
	p->stimescaled += cputime_scaled;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for system time used */
	acct_update_integrals(p);
}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
/*
 * When running paravirtualized, account any steal time reported by the
 * hypervisor since the last tick.  Returns true (non-zero) when at least
 * one whole tick of steal time was consumed, in which case the caller
 * skips normal tick accounting.
 */
static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal, st = 0;

		/* ns of steal time accumulated since prev_steal_time. */
		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		/* Only whole ticks are charged; the remainder carries over. */
		st = steal_ticks(steal);
		this_rq()->prev_steal_time += st * TICK_NSEC;

		account_steal_time(st);
		return st;
	}
#endif
	return false;
}
290
/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	struct task_struct *t;

	/* Start from the totals already folded in for exited threads. */
	times->utime = sig->utime;
	times->stime = sig->stime;
	times->sum_exec_runtime = sig->sum_sched_runtime;

	rcu_read_lock();
	/* make sure we can trust tsk->thread_group list */
	if (!likely(pid_alive(tsk)))
		goto out;

	/* Add the contribution of every still-live thread in the group. */
	t = tsk;
	do {
		times->utime += t->utime;
		times->stime += t->stime;
		times->sum_exec_runtime += task_sched_runtime(t);
	} while_each_thread(tsk, t);
out:
	rcu_read_unlock();
}
318
319#ifndef CONFIG_VIRT_CPU_ACCOUNTING
320
321#ifdef CONFIG_IRQ_TIME_ACCOUNTING
322/*
323 * Account a tick to a process and cpustat
324 * @p: the process that the cpu time gets accounted to
325 * @user_tick: is the tick from userspace
326 * @rq: the pointer to rq
327 *
328 * Tick demultiplexing follows the order
329 * - pending hardirq update
330 * - pending softirq update
331 * - user_time
332 * - idle_time
333 * - system time
334 * - check for guest_time
335 * - else account as system_time
336 *
337 * Check for hardirq is done both for system and user time as there is
338 * no timer going off while we are on hardirq and hence we may never get an
339 * opportunity to update it solely in system time.
340 * p->stime and friends are only updated on system time and not on irq
341 * softirq as those do not count in task exec_runtime any more.
342 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq)
{
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Steal time eats the whole tick; see the ordering comment above. */
	if (steal_account_process_tick())
		return;

	if (irqtime_account_hi_update()) {
		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
	} else if (irqtime_account_si_update()) {
		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
	} else if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time do not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
					CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	} else if (p == rq->idle) {
		account_idle_time(cputime_one_jiffy);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
	} else {
		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
					CPUTIME_SYSTEM);
	}
}
375
376static void irqtime_account_idle_ticks(int ticks)
377{
378 int i;
379 struct rq *rq = this_rq();
380
381 for (i = 0; i < ticks; i++)
382 irqtime_account_process_tick(current, 0, rq);
383}
384#else /* CONFIG_IRQ_TIME_ACCOUNTING */
/* !CONFIG_IRQ_TIME_ACCOUNTING: irq time is never demultiplexed per tick. */
static void irqtime_account_idle_ticks(int ticks) {}
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq) {}
388#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
389
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 *
 * With irq time accounting enabled the tick is demultiplexed by
 * irqtime_account_process_tick(); otherwise it is charged as steal,
 * user, system or idle time in that order of precedence.
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
	struct rq *rq = this_rq();

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq);
		return;
	}

	if (steal_account_process_tick())
		return;

	if (user_tick)
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
				    one_jiffy_scaled);
	else
		account_idle_time(cputime_one_jiffy);
}
416
/*
 * Account multiple ticks of steal time.
 * @ticks: number of stolen ticks
 *
 * (The old "@p" parameter in this comment was stale: the function
 * takes no task; steal time is charged to the per-cpu cpustat only.)
 */
void account_steal_ticks(unsigned long ticks)
{
	account_steal_time(jiffies_to_cputime(ticks));
}
426
/*
 * Account multiple ticks of idle time.
 * @ticks: number of idle ticks (not "stolen" -- that was a stale copy
 *         of the account_steal_ticks() comment)
 */
void account_idle_ticks(unsigned long ticks)
{

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	account_idle_time(jiffies_to_cputime(ticks));
}
441
442#endif
443
444/*
445 * Use precise platform statistics if available:
446 */
447#ifdef CONFIG_VIRT_CPU_ACCOUNTING
/* Native vtime accounting: p->utime/p->stime are already precise. */
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}
453
/* Group totals need no scaling under precise (vtime) accounting. */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}
463
/* vtime_account_system() wrapper that is safe to call with irqs enabled. */
void vtime_account_system_irqsafe(struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	vtime_account_system(tsk);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
473
474#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
/*
 * Flush the outgoing task's pending vtime at context switch: idle tasks
 * flush to idle time, everything else to system time, then user time,
 * before handing off to the arch hook.
 */
void vtime_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

	vtime_account_user(prev);
	arch_vtime_task_switch(prev);
}
485#endif
486
487/*
488 * Archs that account the whole time spent in the idle task
489 * (outside irq) as idle time can rely on this and just implement
490 * vtime_account_system() and vtime_account_idle(). Archs that
491 * have other meaning of the idle time (s390 only includes the
492 * time spent by the CPU when it's in low power mode) must override
493 * vtime_account().
494 */
495#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account(struct task_struct *tsk)
{
	/* In irq context, or for a non-idle task, the time is system time. */
	if (in_interrupt() || !is_idle_task(tsk))
		vtime_account_system(tsk);
	else
		vtime_account_idle(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account);
504#endif /* __ARCH_HAS_VTIME_ACCOUNT */
505
506#else
507
508#ifndef nsecs_to_cputime
509# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
510#endif
511
512static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
513{
514 u64 temp = (__force u64) rtime;
515
516 temp *= (__force u64) utime;
517
518 if (sizeof(cputime_t) == 4)
519 temp = div_u64(temp, (__force u32) total);
520 else
521 temp = div64_u64(temp, (__force u64) total);
522
523 return (__force cputime_t) temp;
524}
525
/*
 * Adjust tick based cputime random precision against scheduler
 * runtime accounting.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct cputime *prev,
			   cputime_t *ut, cputime_t *st)
{
	cputime_t rtime, utime, total;

	utime = curr->utime;
	total = utime + curr->stime;

	/*
	 * Tick based cputime accounting depend on random scheduling
	 * timeslices of a task to be interrupted or not by the timer.
	 * Depending on these circumstances, the number of these interrupts
	 * may be over or under-optimistic, matching the real user and system
	 * cputime with a variable precision.
	 *
	 * Fix this by scaling these tick based values against the total
	 * runtime accounted by the CFS scheduler.
	 */
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);

	/* With no tick samples at all, attribute everything to user time. */
	if (total)
		utime = scale_utime(utime, rtime, total);
	else
		utime = rtime;

	/*
	 * If the tick based count grows faster than the scheduler one,
	 * the result of the scaling may go backward.
	 * Let's enforce monotonicity.
	 * Note: stime is derived as rtime - prev->utime, so utime must be
	 * clamped first.
	 */
	prev->utime = max(prev->utime, utime);
	prev->stime = max(prev->stime, rtime - prev->utime);

	*ut = prev->utime;
	*st = prev->stime;
}
567
568void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
569{
570 struct task_cputime cputime = {
571 .utime = p->utime,
572 .stime = p->stime,
573 .sum_exec_runtime = p->se.sum_exec_runtime,
574 };
575
576 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
577}
578
/*
 * Must be called with siglock held.
 * (prev_cputime in signal_struct is shared group state; the lock keeps
 * the monotonicity clamp in cputime_adjust() race-free.)
 */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
589#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
deleted file mode 100644
index 2cd3c1b4e58..00000000000
--- a/kernel/sched/debug.c
+++ /dev/null
@@ -1,531 +0,0 @@
1/*
2 * kernel/sched/debug.c
3 *
4 * Print the CFS rbtree
5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched.h>
15#include <linux/seq_file.h>
16#include <linux/kallsyms.h>
17#include <linux/utsname.h>
18
19#include "sched.h"
20
21static DEFINE_SPINLOCK(sched_debug_lock);
22
23/*
24 * This allows printing both to /proc/sched_debug and
25 * to the console
26 */
27#define SEQ_printf(m, x...) \
28 do { \
29 if (m) \
30 seq_printf(m, x); \
31 else \
32 printk(x); \
33 } while (0)
34
35/*
36 * Ease the printing of nsec fields:
37 */
/* Whole-millisecond part of a (possibly negative) nanosecond value. */
static long long nsec_high(unsigned long long nsec)
{
	long long sign = 1;

	if ((long long)nsec < 0) {
		sign = -1;
		nsec = -nsec;
	}
	do_div(nsec, 1000000);

	return sign * (long long)nsec;
}
49
/* Sub-millisecond remainder (in ns) of a possibly negative ns value. */
static unsigned long nsec_low(unsigned long long nsec)
{
	if ((long long)nsec < 0)
		nsec = -nsec;

	/* do_div() returns the remainder. */
	return do_div(nsec, 1000000);
}
57
58#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
59
60#ifdef CONFIG_FAIR_GROUP_SCHED
/* Dump the per-cpu sched_entity stats of task group @tg into @m. */
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
	struct sched_entity *se = tg->se[cpu];

#define P(F) \
	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
#define PN(F) \
	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))

	/* The root group has no entity; print the rq-wide averages instead. */
	if (!se) {
		struct sched_avg *avg = &cpu_rq(cpu)->avg;
		P(avg->runnable_avg_sum);
		P(avg->runnable_avg_period);
		return;
	}


	PN(se->exec_start);
	PN(se->vruntime);
	PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
	PN(se->statistics.wait_start);
	PN(se->statistics.sleep_start);
	PN(se->statistics.block_start);
	PN(se->statistics.sleep_max);
	PN(se->statistics.block_max);
	PN(se->statistics.exec_max);
	PN(se->statistics.slice_max);
	PN(se->statistics.wait_max);
	PN(se->statistics.wait_sum);
	P(se->statistics.wait_count);
#endif
	P(se->load.weight);
#ifdef CONFIG_SMP
	P(se->avg.runnable_avg_sum);
	P(se->avg.runnable_avg_period);
	P(se->avg.load_avg_contrib);
	P(se->avg.decay_count);
#endif
#undef PN
#undef P
}
103#endif
104
105#ifdef CONFIG_CGROUP_SCHED
/* Shared scratch buffer for task_group_path(); callers hold sched_debug_lock. */
static char group_path[PATH_MAX];

/* Return a printable cgroup path for @tg (autogroup name if applicable). */
static char *task_group_path(struct task_group *tg)
{
	if (autogroup_path(tg, group_path, PATH_MAX))
		return group_path;

	/*
	 * May be NULL if the underlying cgroup isn't fully-created yet
	 */
	if (!tg->css.cgroup) {
		group_path[0] = '\0';
		return group_path;
	}
	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
	return group_path;
}
123#endif
124
/* Print one runnable task's row ("R" marks the task currently on-cpu). */
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
	if (rq->curr == p)
		SEQ_printf(m, "R");
	else
		SEQ_printf(m, " ");

	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
		p->comm, p->pid,
		SPLIT_NS(p->se.vruntime),
		(long long)(p->nvcsw + p->nivcsw),
		p->prio);
#ifdef CONFIG_SCHEDSTATS
	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
		SPLIT_NS(p->se.vruntime),
		SPLIT_NS(p->se.sum_exec_runtime),
		SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
	/* Without schedstats the three columns are padded with zeroes. */
	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
#ifdef CONFIG_CGROUP_SCHED
	SEQ_printf(m, " %s", task_group_path(task_group(p)));
#endif

	SEQ_printf(m, "\n");
}
153
/* Walk all threads and print those runnable on CPU @rq_cpu. */
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
{
	struct task_struct *g, *p;
	unsigned long flags;

	SEQ_printf(m,
	"\nrunnable tasks:\n"
	"            task   PID         tree-key  switches  prio"
	"     exec-runtime         sum-exec        sum-sleep\n"
	"------------------------------------------------------"
	"----------------------------------------------------\n");

	/* tasklist_lock keeps the thread list stable while we iterate. */
	read_lock_irqsave(&tasklist_lock, flags);

	do_each_thread(g, p) {
		if (!p->on_rq || task_cpu(p) != rq_cpu)
			continue;

		print_task(m, rq, p);
	} while_each_thread(g, p);

	read_unlock_irqrestore(&tasklist_lock, flags);
}
177
/* Dump one cfs_rq: vruntime spread, load and (SMP) load-tracking stats. */
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
		spread, rq0_min_vruntime, spread0;
	struct rq *rq = cpu_rq(cpu);
	struct sched_entity *last;
	unsigned long flags;

#ifdef CONFIG_FAIR_GROUP_SCHED
	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
#else
	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#endif
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
			SPLIT_NS(cfs_rq->exec_clock));

	/* Snapshot the rbtree extremes under rq->lock for consistency. */
	raw_spin_lock_irqsave(&rq->lock, flags);
	if (cfs_rq->rb_leftmost)
		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
	last = __pick_last_entity(cfs_rq);
	if (last)
		max_vruntime = last->vruntime;
	min_vruntime = cfs_rq->min_vruntime;
	rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
	raw_spin_unlock_irqrestore(&rq->lock, flags);
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
			SPLIT_NS(MIN_vruntime));
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
			SPLIT_NS(min_vruntime));
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "max_vruntime",
			SPLIT_NS(max_vruntime));
	spread = max_vruntime - MIN_vruntime;
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread",
			SPLIT_NS(spread));
	/* spread0: this rq's min_vruntime relative to CPU 0's. */
	spread0 = min_vruntime - rq0_min_vruntime;
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
			SPLIT_NS(spread0));
	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
			cfs_rq->nr_spread_over);
	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
			cfs_rq->runnable_load_avg);
	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
			cfs_rq->blocked_load_avg);
	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
			atomic64_read(&cfs_rq->tg->load_avg));
	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
			cfs_rq->tg_load_contrib);
	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
			cfs_rq->tg_runnable_contrib);
	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
			atomic_read(&cfs_rq->tg->runnable_avg));
#endif

	print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
238
/* Dump one rt_rq: running count, throttle state and bandwidth numbers. */
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
#ifdef CONFIG_RT_GROUP_SCHED
	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
#else
	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
#endif

#define P(x) \
	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
#define PN(x) \
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))

	P(rt_nr_running);
	P(rt_throttled);
	PN(rt_time);
	PN(rt_runtime);

#undef PN
#undef P
}
260
261extern __read_mostly int sched_clock_running;
262
/* Dump one CPU's rq counters, then its cfs/rt stats and runnable tasks. */
static void print_cpu(struct seq_file *m, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

#ifdef CONFIG_X86
	{
		unsigned int freq = cpu_khz ? : 1;

		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
			   cpu, freq / 1000, (freq % 1000));
	}
#else
	SEQ_printf(m, "\ncpu#%d\n", cpu);
#endif

/* Pick the print format from the field's storage size. */
#define P(x) \
do { \
	if (sizeof(rq->x) == 4) \
		SEQ_printf(m, "  .%-30s: %ld\n", #x, (long)(rq->x)); \
	else \
		SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)

#define PN(x) \
	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))

	P(nr_running);
	SEQ_printf(m, "  .%-30s: %lu\n", "load",
		   rq->load.weight);
	P(nr_switches);
	P(nr_load_updates);
	P(nr_uninterruptible);
	PN(next_balance);
	P(curr->pid);
	PN(clock);
	P(cpu_load[0]);
	P(cpu_load[1]);
	P(cpu_load[2]);
	P(cpu_load[3]);
	P(cpu_load[4]);
#undef P
#undef PN

#ifdef CONFIG_SCHEDSTATS
#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);

	P(yld_count);

	P(sched_count);
	P(sched_goidle);
#ifdef CONFIG_SMP
	P64(avg_idle);
#endif

	P(ttwu_count);
	P(ttwu_local);

#undef P
#undef P64
#endif
	/* sched_debug_lock also guards the shared group_path[] buffer. */
	spin_lock_irqsave(&sched_debug_lock, flags);
	print_cfs_stats(m, cpu);
	print_rt_stats(m, cpu);

	rcu_read_lock();
	print_rq(m, rq, cpu);
	rcu_read_unlock();
	spin_unlock_irqrestore(&sched_debug_lock, flags);
}
334
/*
 * Human-readable names for sysctl_sched_tunable_scaling, indexed by the
 * SCHED_TUNABLESCALING_* values.  Fix the long-standing misspelling
 * "logaritmic" in the /proc/sched_debug output.
 */
static const char *sched_tunable_scaling_names[] = {
	"none",
	"logarithmic",
	"linear"
};
340
/* seq_file show handler: header, clocks, sysctls, then every online CPU. */
static int sched_debug_show(struct seq_file *m, void *v)
{
	u64 ktime, sched_clk, cpu_clk;
	unsigned long flags;
	int cpu;

	/* Sample all three clocks back-to-back with irqs off. */
	local_irq_save(flags);
	ktime = ktime_to_ns(ktime_get());
	sched_clk = sched_clock();
	cpu_clk = local_clock();
	local_irq_restore(flags);

	SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);

#define P(x) \
	SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
	SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
	PN(ktime);
	PN(sched_clk);
	PN(cpu_clk);
	P(jiffies);
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
	P(sched_clock_stable);
#endif
#undef PN
#undef P

	SEQ_printf(m, "\n");
	SEQ_printf(m, "sysctl_sched\n");

#define P(x) \
	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
	PN(sysctl_sched_latency);
	PN(sysctl_sched_min_granularity);
	PN(sysctl_sched_wakeup_granularity);
	P(sysctl_sched_child_runs_first);
	P(sysctl_sched_features);
#undef PN
#undef P

	SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
		sysctl_sched_tunable_scaling,
		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);

	for_each_online_cpu(cpu)
		print_cpu(m, cpu);

	SEQ_printf(m, "\n");

	return 0;
}
398
/* SysRq entry point: NULL seq_file makes SEQ_printf() fall back to printk. */
void sysrq_sched_debug_show(void)
{
	sched_debug_show(NULL, NULL);
}
403
/* open() handler for /proc/sched_debug. */
static int sched_debug_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_debug_show, NULL);
}
408
/* File operations backing /proc/sched_debug (single-shot seq_file). */
static const struct file_operations sched_debug_fops = {
	.open		= sched_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
415
416static int __init init_sched_debug_procfs(void)
417{
418 struct proc_dir_entry *pe;
419
420 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
421 if (!pe)
422 return -ENOMEM;
423 return 0;
424}
425
426__initcall(init_sched_debug_procfs);
427
/* Back /proc/<pid>/sched: dump one task's scheduler state and stats. */
void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
	unsigned long nr_switches;

	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
						get_nr_threads(p));
	SEQ_printf(m,
		"---------------------------------------------------------\n");
#define __P(F) \
	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
#define P(F) \
	SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
#define __PN(F) \
	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
#define PN(F) \
	SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))

	PN(se.exec_start);
	PN(se.vruntime);
	PN(se.sum_exec_runtime);

	nr_switches = p->nvcsw + p->nivcsw;

#ifdef CONFIG_SCHEDSTATS
	PN(se.statistics.wait_start);
	PN(se.statistics.sleep_start);
	PN(se.statistics.block_start);
	PN(se.statistics.sleep_max);
	PN(se.statistics.block_max);
	PN(se.statistics.exec_max);
	PN(se.statistics.slice_max);
	PN(se.statistics.wait_max);
	PN(se.statistics.wait_sum);
	P(se.statistics.wait_count);
	PN(se.statistics.iowait_sum);
	P(se.statistics.iowait_count);
	P(se.nr_migrations);
	P(se.statistics.nr_migrations_cold);
	P(se.statistics.nr_failed_migrations_affine);
	P(se.statistics.nr_failed_migrations_running);
	P(se.statistics.nr_failed_migrations_hot);
	P(se.statistics.nr_forced_migrations);
	P(se.statistics.nr_wakeups);
	P(se.statistics.nr_wakeups_sync);
	P(se.statistics.nr_wakeups_migrate);
	P(se.statistics.nr_wakeups_local);
	P(se.statistics.nr_wakeups_remote);
	P(se.statistics.nr_wakeups_affine);
	P(se.statistics.nr_wakeups_affine_attempts);
	P(se.statistics.nr_wakeups_passive);
	P(se.statistics.nr_wakeups_idle);

	{
		u64 avg_atom, avg_per_cpu;

		/* Mean runtime per scheduling atom; -1 if never switched. */
		avg_atom = p->se.sum_exec_runtime;
		if (nr_switches)
			do_div(avg_atom, nr_switches);
		else
			avg_atom = -1LL;

		/* Mean runtime per migration; -1 if never migrated. */
		avg_per_cpu = p->se.sum_exec_runtime;
		if (p->se.nr_migrations) {
			avg_per_cpu = div64_u64(avg_per_cpu,
						p->se.nr_migrations);
		} else {
			avg_per_cpu = -1LL;
		}

		__PN(avg_atom);
		__PN(avg_per_cpu);
	}
#endif
	__P(nr_switches);
	SEQ_printf(m, "%-35s:%21Ld\n",
		   "nr_voluntary_switches", (long long)p->nvcsw);
	SEQ_printf(m, "%-35s:%21Ld\n",
		   "nr_involuntary_switches", (long long)p->nivcsw);

	P(se.load.weight);
	P(policy);
	P(prio);
#undef PN
#undef __PN
#undef P
#undef __P

	{
		unsigned int this_cpu = raw_smp_processor_id();
		u64 t0, t1;

		/* Two back-to-back reads expose the clock's own overhead. */
		t0 = cpu_clock(this_cpu);
		t1 = cpu_clock(this_cpu);
		SEQ_printf(m, "%-35s:%21Ld\n",
			   "clock-delta", (long long)(t1-t0));
	}
}
525
/* Reset @p's schedstats (write handler for /proc/<pid>/sched). */
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
deleted file mode 100644
index 5eea8707234..00000000000
--- a/kernel/sched/fair.c
+++ /dev/null
@@ -1,6174 +0,0 @@
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */
22
23#include <linux/latencytop.h>
24#include <linux/sched.h>
25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
32
33#include <trace/events/sched.h>
34
35#include "sched.h"
36
37/*
38 * Targeted preemption latency for CPU-bound tasks:
39 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
40 *
41 * NOTE: this latency value is not the same as the concept of
42 * 'timeslice length' - timeslices in CFS are of variable length
43 * and have no persistent notion like in traditional, time-slice
44 * based scheduling concepts.
45 *
46 * (to see the precise effective timeslice length of your workload,
47 * run vmstat and monitor the context-switches (cs) field)
48 */
49unsigned int sysctl_sched_latency = 6000000ULL;
50unsigned int normalized_sysctl_sched_latency = 6000000ULL;
51
52/*
53 * The initial- and re-scaling of tunables is configurable
54 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
55 *
56 * Options are:
57 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
59 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
60 */
61enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 = SCHED_TUNABLESCALING_LOG;
63
64/*
65 * Minimal preemption granularity for CPU-bound tasks:
66 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 */
68unsigned int sysctl_sched_min_granularity = 750000ULL;
69unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
70
71/*
72 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73 */
74static unsigned int sched_nr_latency = 8;
75
76/*
77 * After fork, child runs first. If set to 0 (default) then
78 * parent will (try to) run first.
79 */
80unsigned int sysctl_sched_child_runs_first __read_mostly;
81
82/*
83 * SCHED_OTHER wake-up granularity.
84 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
85 *
86 * This option delays the preemption effects of decoupled workloads
87 * and reduces their over-scheduling. Synchronous workloads will still
88 * have immediate wakeup/sleep latencies.
89 */
90unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
92
93const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94
95/*
96 * The exponential sliding window over which load is averaged for shares
97 * distribution.
98 * (default: 10msec)
99 */
100unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101
102#ifdef CONFIG_CFS_BANDWIDTH
103/*
104 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105 * each time a cfs_rq requests quota.
106 *
107 * Note: in the case that the slice exceeds the runtime remaining (either due
108 * to consumption or the quota being specified to be smaller than the slice)
109 * we will always only issue the remaining available time.
110 *
111 * default: 5 msec, units: microseconds
112 */
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif
115
116/*
117 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible
119 * to users decreases. But the relationship is not linear,
120 * so pick a second-best guess by going with the log2 of the
121 * number of CPUs.
122 *
123 * This idea comes from the SD scheduler of Con Kolivas:
124 */
125static int get_update_sysctl_factor(void)
126{
127 unsigned int cpus = min_t(int, num_online_cpus(), 8);
128 unsigned int factor;
129
130 switch (sysctl_sched_tunable_scaling) {
131 case SCHED_TUNABLESCALING_NONE:
132 factor = 1;
133 break;
134 case SCHED_TUNABLESCALING_LINEAR:
135 factor = cpus;
136 break;
137 case SCHED_TUNABLESCALING_LOG:
138 default:
139 factor = 1 + ilog2(cpus);
140 break;
141 }
142
143 return factor;
144}
145
/*
 * Re-derive the effective scheduler tunables from their normalized
 * (single-CPU) values, scaled by the current CPU-count factor.
 */
static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

	/* sysctl_<name> = factor * normalized_sysctl_<name> */
#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}
157
/* Initialize the granularity tunables by scaling them for the CPU count. */
void sched_init_granularity(void)
{
	update_sysctl();
}
162
163#if BITS_PER_LONG == 32
164# define WMULT_CONST (~0UL)
165#else
166# define WMULT_CONST (1UL << 32)
167#endif
168
169#define WMULT_SHIFT 32
170
171/*
172 * Shift right and round:
173 */
174#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
175
176/*
177 * delta *= weight / lw
178 */
/*
 * Scale @delta_exec by weight/lw, i.e. delta *= weight / lw->weight,
 * using 32.32 fixed-point arithmetic (WMULT_SHIFT) with a lazily
 * computed, cached inverse of lw->weight.
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	/* Lazily (re)compute the cached inverse; cleared when lw changes. */
	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 * if so, shift in two halves to stay within 64 bits.
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	/* Saturate at LONG_MAX rather than truncating on overflow. */
	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
217
218
219const struct sched_class fair_sched_class;
220
221/**************************************************************
222 * CFS operations on generic schedulable entities:
223 */
224
225#ifdef CONFIG_FAIR_GROUP_SCHED
226
227/* cpu runqueue to which this cfs_rq is attached */
228static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
229{
230 return cfs_rq->rq;
231}
232
233/* An entity is a task if it doesn't "own" a runqueue */
234#define entity_is_task(se) (!se->my_q)
235
236static inline struct task_struct *task_of(struct sched_entity *se)
237{
238#ifdef CONFIG_SCHED_DEBUG
239 WARN_ON_ONCE(!entity_is_task(se));
240#endif
241 return container_of(se, struct task_struct, se);
242}
243
244/* Walk up scheduling entities hierarchy */
245#define for_each_sched_entity(se) \
246 for (; se; se = se->parent)
247
248static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
249{
250 return p->se.cfs_rq;
251}
252
253/* runqueue on which this entity is (to be) queued */
254static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
255{
256 return se->cfs_rq;
257}
258
259/* runqueue "owned" by this group */
260static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
261{
262 return grp->my_q;
263}
264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
/*
 * Link @cfs_rq onto its runqueue's leaf_cfs_rq_list, preserving the
 * child-before-parent ordering the list relies on.  Idempotent: does
 * nothing if already linked (cfs_rq->on_list).
 */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}
291
292static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
293{
294 if (cfs_rq->on_list) {
295 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
296 cfs_rq->on_list = 0;
297 }
298}
299
300/* Iterate thr' all leaf cfs_rq's on a runqueue */
301#define for_each_leaf_cfs_rq(rq, cfs_rq) \
302 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
303
304/* Do the two (enqueued) entities belong to the same group ? */
305static inline int
306is_same_group(struct sched_entity *se, struct sched_entity *pse)
307{
308 if (se->cfs_rq == pse->cfs_rq)
309 return 1;
310
311 return 0;
312}
313
314static inline struct sched_entity *parent_entity(struct sched_entity *se)
315{
316 return se->parent;
317}
318
319/* return depth at which a sched entity is present in the hierarchy */
320static inline int depth_se(struct sched_entity *se)
321{
322 int depth = 0;
323
324 for_each_sched_entity(se)
325 depth++;
326
327 return depth;
328}
329
/*
 * Lift *se and *pse up their scheduling hierarchies until they are
 * siblings on a common cfs_rq, so a preemption decision can be made
 * between them.  Both pointers are updated in place.
 */
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int d1 = depth_se(*se);
	int d2 = depth_se(*pse);

	/* First bring both entities to the same depth ... */
	for (; d1 > d2; d1--)
		*se = parent_entity(*se);

	for (; d2 > d1; d2--)
		*pse = parent_entity(*pse);

	/* ... then climb them in lockstep until they share a cfs_rq. */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
361
362#else /* !CONFIG_FAIR_GROUP_SCHED */
363
364static inline struct task_struct *task_of(struct sched_entity *se)
365{
366 return container_of(se, struct task_struct, se);
367}
368
369static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
370{
371 return container_of(cfs_rq, struct rq, cfs);
372}
373
374#define entity_is_task(se) 1
375
376#define for_each_sched_entity(se) \
377 for (; se; se = NULL)
378
379static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
380{
381 return &task_rq(p)->cfs;
382}
383
384static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
385{
386 struct task_struct *p = task_of(se);
387 struct rq *rq = task_rq(p);
388
389 return &rq->cfs;
390}
391
392/* runqueue "owned" by this group */
393static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
394{
395 return NULL;
396}
397
398static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
399{
400}
401
402static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
403{
404}
405
406#define for_each_leaf_cfs_rq(rq, cfs_rq) \
407 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
408
409static inline int
410is_same_group(struct sched_entity *se, struct sched_entity *pse)
411{
412 return 1;
413}
414
415static inline struct sched_entity *parent_entity(struct sched_entity *se)
416{
417 return NULL;
418}
419
420static inline void
421find_matching_se(struct sched_entity **se, struct sched_entity **pse)
422{
423}
424
425#endif /* CONFIG_FAIR_GROUP_SCHED */
426
427static __always_inline
428void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
429
430/**************************************************************
431 * Scheduling class tree data structure manipulation methods:
432 */
433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
435{
436 s64 delta = (s64)(vruntime - min_vruntime);
437 if (delta > 0)
438 min_vruntime = vruntime;
439
440 return min_vruntime;
441}
442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
444{
445 s64 delta = (s64)(vruntime - min_vruntime);
446 if (delta < 0)
447 min_vruntime = vruntime;
448
449 return min_vruntime;
450}
451
452static inline int entity_before(struct sched_entity *a,
453 struct sched_entity *b)
454{
455 return (s64)(a->vruntime - b->vruntime) < 0;
456}
457
/*
 * Advance cfs_rq->min_vruntime to track the smallest vruntime among the
 * current task and the leftmost queued entity.  min_vruntime only ever
 * moves forward (max_vruntime at the end enforces monotonicity).
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	/* 32-bit readers use min_vruntime_copy to detect torn reads. */
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}
482
483/*
484 * Enqueue an entity into the rb-tree:
485 */
/*
 * Insert @se into the cfs_rq's timeline rb-tree, keyed by vruntime,
 * and keep the cached leftmost node up to date.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;	/* stays 1 only if we never descend right */

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
521
522static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
523{
524 if (cfs_rq->rb_leftmost == &se->run_node) {
525 struct rb_node *next_node;
526
527 next_node = rb_next(&se->run_node);
528 cfs_rq->rb_leftmost = next_node;
529 }
530
531 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
532}
533
534struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
535{
536 struct rb_node *left = cfs_rq->rb_leftmost;
537
538 if (!left)
539 return NULL;
540
541 return rb_entry(left, struct sched_entity, run_node);
542}
543
544static struct sched_entity *__pick_next_entity(struct sched_entity *se)
545{
546 struct rb_node *next = rb_next(&se->run_node);
547
548 if (!next)
549 return NULL;
550
551 return rb_entry(next, struct sched_entity, run_node);
552}
553
554#ifdef CONFIG_SCHED_DEBUG
555struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
556{
557 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
558
559 if (!last)
560 return NULL;
561
562 return rb_entry(last, struct sched_entity, run_node);
563}
564
565/**************************************************************
566 * Scheduling class statistics methods:
567 */
568
/*
 * sysctl handler for the scheduler latency/granularity tunables.  After a
 * successful write it re-derives sched_nr_latency and the normalized
 * (per-CPU-factor) values so future rescaling starts from the new input.
 * Returns 0 on success or the proc_dointvec_minmax() error.
 */
int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	/* reads (or failed writes) need no recomputation */
	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

	/* normalized_sysctl_<name> = sysctl_<name> / factor */
#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
591#endif
592
593/*
594 * delta /= w
595 */
596static inline unsigned long
597calc_delta_fair(unsigned long delta, struct sched_entity *se)
598{
599 if (unlikely(se->load.weight != NICE_0_LOAD))
600 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
601
602 return delta;
603}
604
605/*
606 * The idea is to set a period in which each task runs once.
607 *
608 * When there are too many tasks (sched_nr_latency) we have to stretch
609 * this period because otherwise the slices get too small.
610 *
611 * p = (nr <= nl) ? l : l*nr/nl
612 */
613static u64 __sched_period(unsigned long nr_running)
614{
615 u64 period = sysctl_sched_latency;
616 unsigned long nr_latency = sched_nr_latency;
617
618 if (unlikely(nr_running > nr_latency)) {
619 period = sysctl_sched_min_granularity;
620 period *= nr_running;
621 }
622
623 return period;
624}
625
626/*
627 * We calculate the wall-time slice from the period by taking a part
628 * proportional to the weight.
629 *
630 * s = p*P[w/rw]
631 */
/*
 * Wall-time slice for @se: the period scaled by the entity's share of the
 * load at every level of the hierarchy (s = p*P[w/rw]).  If @se is not yet
 * on the runqueue its weight is added to each level's load first, so the
 * slice reflects the state after enqueueing.
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		/* account our own weight when not enqueued yet */
		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}
653
654/*
655 * We calculate the vruntime slice of a to be inserted task
656 *
657 * vs = s/w
658 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* wall-time slice scaled into vruntime units by @se's weight */
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
663
664/*
665 * Update the current task's runtime statistics. Skip current tasks that
666 * are not in our scheduling class.
667 */
/*
 * Charge @delta_exec nanoseconds of runtime to @curr: bump its total
 * runtime, its weighted vruntime, and the cfs_rq's min_vruntime.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	/* weight-scaled charge: heavier entities accrue vruntime slower */
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}
684
/*
 * Account the runtime the current entity has accumulated since the last
 * update: charge its vruntime, cpuacct and group runtime, and the CFS
 * bandwidth pool.  Safe to call with no current entity (returns early).
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	/* task-level (as opposed to group-level) accounting */
	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}
716
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* stamp when @se started waiting; paired with update_stats_wait_end() */
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
722
723/*
724 * Task is being enqueued - update stats:
725 */
726static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
727{
728 /*
729 * Are we enqueueing a waiting task? (for current tasks
730 * a dequeue/enqueue event is a NOP)
731 */
732 if (se != cfs_rq->curr)
733 update_stats_wait_start(cfs_rq, se);
734}
735
/*
 * Close out a wait period for @se: fold the elapsed wait into the max,
 * count and sum schedstats, emit the tracepoint, and clear wait_start.
 */
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	/* tracepoint only for real tasks, not group entities */
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}
752
753static inline void
754update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
755{
756 /*
757 * Mark the end of the wait period if dequeueing a
758 * waiting task:
759 */
760 if (se != cfs_rq->curr)
761 update_stats_wait_end(cfs_rq, se);
762}
763
764/*
765 * We are picking a new current task - update its stats:
766 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period: stamp with the rq's task clock
	 * so update_curr() can later measure the runtime delta.
	 */
	se->exec_start = rq_of(cfs_rq)->clock_task;
}
775
776/**************************************************
777 * Scheduling class queueing methods:
778 */
779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
/*
 * Got a PROT_NONE fault for a page on @node.  Slows the scan rate when
 * pages turn out to be properly placed already (no migration needed).
 * Note: @node and @pages are currently unused beyond the FIXME below.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
	struct task_struct *p = current;

	if (!sched_feat_numa(NUMA))
		return;

	/* FIXME: Allocate task-specific structure for placement policy here */

	/*
	 * If pages are properly placed (did not migrate) then scan slower.
	 * This is reset periodically in case of phase changes
	 */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
			p->numa_scan_period + jiffies_to_msecs(10));

	task_numa_placement(p);
}
830
/*
 * Restart the PTE scan of @p's address space: bump the scan sequence so
 * threads notice a new pass, and rewind the scan cursor to the start.
 */
static void reset_ptenuma_scan(struct task_struct *p)
{
	ACCESS_ONCE(p->mm->numa_scan_seq)++;
	p->mm->numa_scan_offset = 0;
}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
/*
 * Deferred (task_work) part of NUMA balancing: walk a bounded window of
 * the current task's VMAs and mark their PTEs PROT_NONE so future faults
 * reveal access locality.  Rate-limited via mm->numa_next_scan (cmpxchg
 * ensures only one thread of the mm scans per period).
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * We do not care about task placement until a task runs on a node
	 * other than the first one used by the address space. This is
	 * largely because migrations are driven by what CPU the task
	 * is running on. If it's never scheduled on another node, it'll
	 * not migrate so why bother trapping the fault.
	 */
	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
		mm->first_nid = numa_node_id();
	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
		/* Are we running on a new node yet? */
		if (numa_node_id() == mm->first_nid &&
		    !sched_feat_numa(NUMA_FORCE))
			return;

		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
	}

	/*
	 * Reset the scan period if enough time has gone by. Objective is that
	 * scanning will be reduced if pages are properly placed. As tasks
	 * can enter different phases this needs to be re-examined. Lacking
	 * proper tracking of reference behaviour, this blunt hammer is used.
	 */
	migrate = mm->numa_next_reset;
	if (time_after(now, migrate)) {
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
		xchg(&mm->numa_next_reset, next_scan);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	/* only one thread of the mm wins the right to scan this period */
	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Do not set pte_numa if the current running node is rate-limited.
	 * This loses statistics on the fault but if we are unwilling to
	 * migrate to this node, it is less likely we can do useful work
	 */
	if (migrate_ratelimited(numa_node_id()))
		return;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	if (!pages)
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		/* cursor ran past the last VMA: wrap to the beginning */
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma))
			continue;

		/* Skip small VMAs. They are not likely to be of relevance */
		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			pages -= change_prot_numa(vma, start, end);

			start = end;
			if (pages <= 0)
				goto out;
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few VMAs are
	 * not guaranteed to the vma_migratable. If they are not, we would find the
	 * !migratable VMA on the next scan but not reset the scanner to the start
	 * so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}
963
964/*
965 * Drive the periodic memory faults..
966 */
/*
 * Drive the periodic memory faults: from the scheduler tick, queue
 * task_numa_work() once per numa_scan_period of *runtime* (not walltime).
 * work->next != work means the callback is already pending.
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		curr->node_stamp = now;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
			task_work_add(curr, work, true);
		}
	}
}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
1003
/*
 * Account @se's arrival on @cfs_rq: add its weight to the cfs_rq (and,
 * for top-level entities, the rq), link tasks on the cfs_tasks list,
 * and bump nr_running.
 */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se))
		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
	cfs_rq->nr_running++;
}
1016
/*
 * Undo account_entity_enqueue(): remove @se's weight and list linkage
 * from @cfs_rq and decrement nr_running.
 */
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se))
		list_del_init(&se->group_node);
	cfs_rq->nr_running--;
}
1027
1028#ifdef CONFIG_FAIR_GROUP_SCHED
1029# ifdef CONFIG_SMP
/*
 * Total weight of @tg, with this CPU's contribution replaced by its
 * instantaneous runqueue weight for better accuracy.
 */
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
	long tg_weight;

	/*
	 * Use this CPU's actual weight instead of the last load_contribution
	 * to gain a more accurate current total weight. See
	 * update_cfs_rq_load_contribution().
	 */
	tg_weight = atomic64_read(&tg->load_avg);
	tg_weight -= cfs_rq->tg_load_contrib;
	tg_weight += cfs_rq->load.weight;

	return tg_weight;
}
1045
1046static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1047{
1048 long tg_weight, load, shares;
1049
1050 tg_weight = calc_tg_weight(tg, cfs_rq);
1051 load = cfs_rq->load.weight;
1052
1053 shares = (tg->shares * load);
1054 if (tg_weight)
1055 shares /= tg_weight;
1056
1057 if (shares < MIN_SHARES)
1058 shares = MIN_SHARES;
1059 if (shares > tg->shares)
1060 shares = tg->shares;
1061
1062 return shares;
1063}
1064# else /* CONFIG_SMP */
1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1066{
1067 return tg->shares;
1068}
1069# endif /* CONFIG_SMP */
/*
 * Change @se's load weight.  If it is enqueued, its weight is removed
 * from the accounting first (after settling any outstanding runtime)
 * and re-added afterwards so the cfs_rq totals stay consistent.
 */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
	}

	update_load_set(&se->load, weight);

	if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
}
1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
/*
 * Recompute the group entity weight for @cfs_rq's task group on this CPU
 * and apply it.  Skipped for the root (no group se) and for throttled
 * hierarchies.
 */
static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
	struct task_group *tg;
	struct sched_entity *se;
	long shares;

	tg = cfs_rq->tg;
	se = tg->se[cpu_of(rq_of(cfs_rq))];
	if (!se || throttled_hierarchy(cfs_rq))
		return;
#ifndef CONFIG_SMP
	/* UP: the full shares value is the common, already-set case */
	if (likely(se->load.weight == tg->shares))
		return;
#endif
	shares = calc_cfs_shares(cfs_rq, tg);

	reweight_entity(cfs_rq_of(se), se, shares);
}
1106#else /* CONFIG_FAIR_GROUP_SCHED */
1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1108{
1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
1111
1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
1173}
1174
1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
1201}
1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
1215 * We then designate the fractions u_i as our co-efficients, yielding the
1216 * following representation of historical load:
1217 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
/*
 * Accrue the time since sa->last_runnable_update into the runnable average,
 * decaying older contributions per the geometric series described above.
 * Returns 1 if at least one 1024us period boundary was crossed (i.e. the
 * averages were decayed), 0 otherwise.
 */
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;

	delta = now - sa->last_runnable_update;
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

	/* delta_w is the amount already accumulated against our next period */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		/*
		 * Now that we know we're crossing a period boundary, figure
		 * out how much from delta we need to complete the current
		 * period and accrue it.
		 */
		delta_w = 1024 - delta_w;
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		/* Figure out how many additional periods this update spans */
		periods = delta / 1024;
		delta %= 1024;

		/* age the existing sums: they are now periods+1 periods old */
		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);

		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
		runnable_contrib = __compute_runnable_contrib(periods);
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		sa->runnable_avg_period += runnable_contrib;
	}

	/* Remainder of delta accrued against u_0` */
	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;

	return decayed;
}
1299
1300/* Synchronize an entity's decay with its parenting cfs_rq.*/
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
1314}
1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
/*
 * Compute a group entity's load contribution: its cfs_rq's share of the
 * group's shares, corrected downwards when the group uses less than one cpu.
 */
static inline void __update_group_entity_contrib(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = group_cfs_rq(se);
	struct task_group *tg = cfs_rq->tg;
	int runnable_avg;

	u64 contrib;

	/* +1 below avoids a division by zero when the group has no load */
	contrib = cfs_rq->tg_load_contrib * tg->shares;
	se->avg.load_avg_contrib = div64_u64(contrib,
					     atomic64_read(&tg->load_avg) + 1);

	/*
	 * For group entities we need to compute a correction term in the case
	 * that they are consuming <1 cpu so that we would contribute the same
	 * load as a task of equal weight.
	 *
	 * Explicitly co-ordinating this measurement would be expensive, but
	 * fortunately the sum of each cpus contribution forms a usable
	 * lower-bound on the true value.
	 *
	 * Consider the aggregate of 2 contributions. Either they are disjoint
	 * (and the sum represents true value) or they are not disjoint and we
	 * are understating by the aggregate of their overlap.
	 *
	 * Extending this to N cpus, for a given overlap, the maximum amount we
	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
	 * cpus that overlap for this interval and w_i is the interval width.
	 *
	 * On a small machine; the first term is well-bounded which bounds the
	 * total error since w_i is a subset of the period. Whereas on a
	 * larger machine, while this first term can be larger, if w_i is of
	 * consequential size n_i*w_i is guaranteed to quickly converge to
	 * our upper bound of 1-cpu.
	 */
	runnable_avg = atomic_read(&tg->runnable_avg);
	if (runnable_avg < NICE_0_LOAD) {
		se->avg.load_avg_contrib *= runnable_avg;
		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
	}
}
1394#else
/* !CONFIG_FAIR_GROUP_SCHED: group aggregation compiles away to no-ops. */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						 int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
						  struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
/* Compute the current contribution to load_avg by se, return any delta */
static long __update_entity_load_avg_contrib(struct sched_entity *se)
{
	long old_contrib = se->avg.load_avg_contrib;

	if (entity_is_task(se)) {
		/* tasks: weight scaled by runnable fraction */
		__update_task_entity_contrib(se);
	} else {
		/* group entities: refresh group aggregates first */
		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
		__update_group_entity_contrib(se);
	}

	/* signed delta lets the caller adjust cfs_rq sums incrementally */
	return se->avg.load_avg_contrib - old_contrib;
}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
1471 * their contribution may appropriately discounted when they wake up.
1472 */
/*
 * Decay the load contributed by all blocked children and account this so that
 * their contribution may appropriately discounted when they wake up.
 */
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
	/* >> 20 converts ns to ~1ms units, the granularity of one decay */
	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
	u64 decays;

	decays = now - cfs_rq->last_decay;
	if (!decays && !force_update)
		return;

	/* drain load removed by entities that migrated away (see removed_load) */
	if (atomic64_read(&cfs_rq->removed_load)) {
		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
		subtract_blocked_load_contrib(cfs_rq, removed_load);
	}

	if (decays) {
		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
						      decays);
		/* publish the decay count so sleepers can catch up on wakeup */
		atomic64_add(decays, &cfs_rq->decay_counter);
		cfs_rq->last_decay = now;
	}

	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}
1496
/* Track the rq-wide runnable average and fold it into the root cfs_rq's group stats. */
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509 * We track migrations using entity decay_count <= 0, on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
1545
1546/*
1547 * Remove se's load from this cfs_rq child load-average, if the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
/*
 * Remove se's load from this cfs_rq child load-average, if the entity is
 * transitioning to a blocked state we track its projected decay using
 * blocked_load_avg.
 */
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
						  struct sched_entity *se,
						  int sleep)
{
	/* refresh the contribution before moving it between the sums */
	update_entity_load_avg(se, 1);
	/* we force update consideration on load-balancer moves */
	update_cfs_rq_blocked_load(cfs_rq, !sleep);

	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
	if (sleep) {
		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
		/* snapshot the decay counter so wakeup can replay missed decays */
		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
1565#else
/*
 * !(CONFIG_SMP && CONFIG_FAIR_GROUP_SCHED): per-entity load tracking is
 * compiled out; all hooks become empty inlines.
 */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update) {}
1577#endif
1578
/*
 * Account the time se spent sleeping (TASK_INTERRUPTIBLE) or blocked
 * (TASK_UNINTERRUPTIBLE) into schedstats, latency tracking and tracepoints.
 * No-op unless CONFIG_SCHEDSTATS.
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	/* group entities have no task; stats below degrade gracefully */
	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

		/* clock may jump backwards across cpus; clamp to zero */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}
1640
/*
 * Debug statistic: count entities whose vruntime has drifted more than
 * three latency periods away from the cfs_rq's min_vruntime.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 spread = se->vruntime - cfs_rq->min_vruntime;

	if (spread < 0)
		spread = -spread;

	if (spread > 3 * sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
1653
1654static void
1655place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1656{
1657 u64 vruntime = cfs_rq->min_vruntime;
1658
1659 /*
1660 * The 'current' period is already promised to the current tasks,
1661 * however the extra weight of the new task will slow them down a
1662 * little, place the new task so that it fits in the slot that
1663 * stays open at the end.
1664 */
1665 if (initial && sched_feat(START_DEBIT))
1666 vruntime += sched_vslice(cfs_rq, se);
1667
1668 /* sleeps up to a single latency don't count. */
1669 if (!initial) {
1670 unsigned long thresh = sysctl_sched_latency;
1671
1672 /*
1673 * Halve their sleep time's effect, to allow
1674 * for a gentler effect of sleepers:
1675 */
1676 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1677 thresh >>= 1;
1678
1679 vruntime -= thresh;
1680 }
1681
1682 /* ensure we never gain time by being placed backwards. */
1683 vruntime = max_vruntime(se->vruntime, vruntime);
1684
1685 se->vruntime = vruntime;
1686}
1687
1688static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1689
/*
 * Enqueue se on its cfs_rq: account load, re-place woken sleepers, insert
 * into the rb-tree and start throttling bookkeeping when the queue goes
 * from empty to non-empty.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	if (flags & ENQUEUE_WAKEUP) {
		/* re-place around min_vruntime with sleeper credit */
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	/* 'current' is never kept in the rb-tree */
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}
1724
1725static void __clear_buddies_last(struct sched_entity *se)
1726{
1727 for_each_sched_entity(se) {
1728 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1729 if (cfs_rq->last == se)
1730 cfs_rq->last = NULL;
1731 else
1732 break;
1733 }
1734}
1735
1736static void __clear_buddies_next(struct sched_entity *se)
1737{
1738 for_each_sched_entity(se) {
1739 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1740 if (cfs_rq->next == se)
1741 cfs_rq->next = NULL;
1742 else
1743 break;
1744 }
1745}
1746
1747static void __clear_buddies_skip(struct sched_entity *se)
1748{
1749 for_each_sched_entity(se) {
1750 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1751 if (cfs_rq->skip == se)
1752 cfs_rq->skip = NULL;
1753 else
1754 break;
1755 }
1756}
1757
/*
 * Drop any last/next/skip buddy hints referencing se, walking up the
 * hierarchy as needed.  The cheap top-level check avoids the walk when
 * se is not a buddy at this level.
 */
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->last == se)
		__clear_buddies_last(se);

	if (cfs_rq->next == se)
		__clear_buddies_next(se);

	if (cfs_rq->skip == se)
		__clear_buddies_skip(se);
}
1769
1770static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1771
/*
 * Dequeue se from its cfs_rq: update stats and load averages, record sleep
 * timestamps, remove from the rb-tree and de-normalize vruntime for
 * non-sleep dequeues (e.g. migration).
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		/* record when the sleep/block began for enqueue_sleeper() */
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	/* 'current' is not kept in the rb-tree */
	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}
1816
1817/*
1818 * Preempt the current task with a newly woken task if needed:
1819 */
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	/* wall-clock runtime consumed this slice vs. the slice we owe curr */
	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	/* compare against the leftmost (most entitled) queued entity */
	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	/* note: vruntime delta compared against wall-clock ideal_runtime */
	if (delta > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}
1856
/*
 * Make se the cfs_rq's running entity: take it out of the rb-tree, start
 * its runtime accounting and snapshot sum_exec_runtime for slice tracking.
 */
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it get to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. dont track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->statistics.slice_max = max(se->statistics.slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	/* baseline for check_preempt_tick()'s delta_exec */
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
1886
1887static int
1888wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1889
1890/*
1891 * Pick the next process, keeping these things in mind, in this order:
1892 * 1) keep things fair between processes/task groups
1893 * 2) pick the "next" process, since someone really wants that to run
1894 * 3) pick the "last" process, for cache locality
1895 * 4) do not run the "skip" process, if something else is available
1896 */
/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	/* 'left' stays the fairness baseline all candidates are checked against */
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *left = se;

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);
		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	/* buddy hints are single-shot: consume them for the chosen entity */
	clear_buddies(cfs_rq, se);

	return se;
}
1928
1929static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1930
/*
 * Stop treating prev as the running entity: finish its accounting, apply
 * bandwidth throttling and put it back into the rb-tree if still runnable.
 */
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	/* throttle cfs_rqs exceeding runtime */
	check_cfs_rq_runtime(cfs_rq);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
		/* in !on_rq case, update occurred at dequeue */
		update_entity_load_avg(prev, 1);
	}
	cfs_rq->curr = NULL;
}
1953
/*
 * Per-tick work for the running entity: refresh runtime accounting and
 * load averages, then check whether curr should be preempted.
 */
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_entity_load_avg(curr, 1);
	update_cfs_rq_blocked_load(cfs_rq, 1);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	/* preemption only matters when there is someone else to run */
	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}
1988
1989
1990/**************************************************
1991 * CFS bandwidth control machinery
1992 */
1993
1994#ifdef CONFIG_CFS_BANDWIDTH
1995
1996#ifdef HAVE_JUMP_LABEL
1997static struct static_key __cfs_bandwidth_used;
1998
/* Branch-patched check: stays false (nearly free) until some group enables quota. */
static inline bool cfs_bandwidth_used(void)
{
	return static_key_false(&__cfs_bandwidth_used);
}
2003
2004void account_cfs_bandwidth_used(int enabled, int was_enabled)
2005{
2006 /* only need to count groups transitioning between enabled/!enabled */
2007 if (enabled && !was_enabled)
2008 static_key_slow_inc(&__cfs_bandwidth_used);
2009 else if (!enabled && was_enabled)
2010 static_key_slow_dec(&__cfs_bandwidth_used);
2011}
2012#else /* HAVE_JUMP_LABEL */
/* !HAVE_JUMP_LABEL: no static keys, bandwidth checks are unconditional. */
static bool cfs_bandwidth_used(void)
{
	return true;
}

void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2019#endif /* HAVE_JUMP_LABEL */
2020
2021/*
2022 * default period for cfs group bandwidth.
2023 * default: 0.1s, units: nanoseconds
2024 */
static inline u64 default_cfs_period(void)
{
	return 100000000ULL; /* 0.1s expressed in nanoseconds */
}
2029
/* Runtime (ns) a cfs_rq pulls from the global quota pool in one grab. */
static inline u64 sched_cfs_bandwidth_slice(void)
{
	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
2034
2035/*
2036 * Replenish runtime according to assigned quota and update expiration time.
2037 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2038 * additional synchronization around rq->lock.
2039 *
2040 * requires cfs_b->lock
2041 */
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
	u64 now;

	/* unlimited groups never run out; nothing to refill */
	if (cfs_b->quota == RUNTIME_INF)
		return;

	now = sched_clock_cpu(smp_processor_id());
	cfs_b->runtime = cfs_b->quota;
	/* runtime handed out from this pool is valid for one period */
	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}
2053
/* Accessor for the bandwidth state embedded in a task group. */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return &tg->cfs_bandwidth;
}
2058
/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
	/* while throttled, time stands still at the throttle timestamp */
	if (unlikely(cfs_rq->throttle_count))
		return cfs_rq->throttled_clock_task;

	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
}
2067
2068/* returns 0 on failure to allocate runtime */
2069static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2070{
2071 struct task_group *tg = cfs_rq->tg;
2072 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2073 u64 amount = 0, min_amount, expires;
2074
2075 /* note: this is a positive sum as runtime_remaining <= 0 */
2076 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2077
2078 raw_spin_lock(&cfs_b->lock);
2079 if (cfs_b->quota == RUNTIME_INF)
2080 amount = min_amount;
2081 else {
2082 /*
2083 * If the bandwidth pool has become inactive, then at least one
2084 * period must have elapsed since the last consumption.
2085 * Refresh the global state and ensure bandwidth timer becomes
2086 * active.
2087 */
2088 if (!cfs_b->timer_active) {
2089 __refill_cfs_bandwidth_runtime(cfs_b);
2090 __start_cfs_bandwidth(cfs_b);
2091 }
2092
2093 if (cfs_b->runtime > 0) {
2094 amount = min(cfs_b->runtime, min_amount);
2095 cfs_b->runtime -= amount;
2096 cfs_b->idle = 0;
2097 }
2098 }
2099 expires = cfs_b->runtime_expires;
2100 raw_spin_unlock(&cfs_b->lock);
2101
2102 cfs_rq->runtime_remaining += amount;
2103 /*
2104 * we may have advanced our local expiration to account for allowed
2105 * spread between our sched_clock and the one on which runtime was
2106 * issued.
2107 */
2108 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2109 cfs_rq->runtime_expires = expires;
2110
2111 return cfs_rq->runtime_remaining > 0;
2112}
2113
2114/*
2115 * Note: This depends on the synchronization provided by sched_clock and the
2116 * fact that rq->clock snapshots this value.
2117 */
/*
 * Invalidate this cfs_rq's locally cached runtime once its expiration
 * deadline has truly passed (accounting for sched_clock drift between cpus).
 */
static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct rq *rq = rq_of(cfs_rq);

	/* if the deadline is ahead of our clock, nothing to do */
	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
		return;

	/* already exhausted (and possibly throttled); nothing to expire */
	if (cfs_rq->runtime_remaining < 0)
		return;

	/*
	 * If the local deadline has passed we have to consider the
	 * possibility that our sched_clock is 'fast' and the global deadline
	 * has not truly expired.
	 *
	 * Fortunately we can determine whether this is the case by checking
	 * whether the global deadline has advanced.
	 */

	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
		/* extend local deadline, drift is bounded above by 2 ticks */
		cfs_rq->runtime_expires += TICK_NSEC;
	} else {
		/* global deadline is ahead, expiration has passed */
		cfs_rq->runtime_remaining = 0;
	}
}
2147
/*
 * Charge delta_exec against this cfs_rq's local runtime; when exhausted,
 * try to refill from the global pool and reschedule if that fails so the
 * hierarchy can be throttled.
 */
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				     unsigned long delta_exec)
{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;
	expire_cfs_rq_runtime(cfs_rq);

	if (likely(cfs_rq->runtime_remaining > 0))
		return;

	/*
	 * if we're unable to extend our runtime we resched so that the active
	 * hierarchy can be throttled
	 */
	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
		resched_task(rq_of(cfs_rq)->curr);
}
2165
2166static __always_inline
2167void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2168{
2169 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2170 return;
2171
2172 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2173}
2174
2175static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2176{
2177 return cfs_bandwidth_used() && cfs_rq->throttled;
2178}
2179
2180/* check whether cfs_rq, or any parent, is throttled */
2181static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2182{
2183 return cfs_bandwidth_used() && cfs_rq->throttle_count;
2184}
2185
2186/*
2187 * Ensure that neither of the group entities corresponding to src_cpu or
2188 * dest_cpu are members of a throttled hierarchy when performing group
2189 * load-balance operations.
2190 */
2191static inline int throttled_lb_pair(struct task_group *tg,
2192 int src_cpu, int dest_cpu)
2193{
2194 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2195
2196 src_cfs_rq = tg->cfs_rq[src_cpu];
2197 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2198
2199 return throttled_hierarchy(src_cfs_rq) ||
2200 throttled_hierarchy(dest_cfs_rq);
2201}
2202
/* updated child weight may affect parent so we have to do this bottom up */
static int tg_unthrottle_up(struct task_group *tg, void *data)
{
	struct rq *rq = data;
	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];

	cfs_rq->throttle_count--;
#ifdef CONFIG_SMP
	if (!cfs_rq->throttle_count) {
		/*
		 * adjust cfs_rq_clock_task(): accumulate the wall time spent
		 * throttled so task clock does not advance over that span
		 */
		cfs_rq->throttled_clock_task_time += rq->clock_task -
					     cfs_rq->throttled_clock_task;
	}
#endif

	return 0;
}
2220
2221static int tg_throttle_down(struct task_group *tg, void *data)
2222{
2223 struct rq *rq = data;
2224 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2225
2226 /* group is entering throttled state, stop time */
2227 if (!cfs_rq->throttle_count)
2228 cfs_rq->throttled_clock_task = rq->clock_task;
2229 cfs_rq->throttle_count++;
2230
2231 return 0;
2232}
2233
/*
 * Throttle cfs_rq: dequeue its hierarchy from the runqueue and place it on
 * the bandwidth pool's throttled list.  Called with rq->lock held.
 */
static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct sched_entity *se;
	long task_delta, dequeue = 1;

	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

	/* freeze hierarchy runnable averages while throttled */
	rcu_read_lock();
	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
	rcu_read_unlock();

	task_delta = cfs_rq->h_nr_running;
	for_each_sched_entity(se) {
		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
		/* throttled entity or throttle-on-deactivate */
		if (!se->on_rq)
			break;

		/* stop dequeueing once an ancestor retains other load */
		if (dequeue)
			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
		qcfs_rq->h_nr_running -= task_delta;

		if (qcfs_rq->load.weight)
			dequeue = 0;
	}

	/* se == NULL: the whole hierarchy left the rq; fix up nr_running */
	if (!se)
		rq->nr_running -= task_delta;

	cfs_rq->throttled = 1;
	cfs_rq->throttled_clock = rq->clock;
	raw_spin_lock(&cfs_b->lock);
	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
	raw_spin_unlock(&cfs_b->lock);
}
2272
/*
 * Undo throttle_cfs_rq(): remove cfs_rq from the throttled list, account
 * the throttled time, and re-enqueue its hierarchy.  Called with rq->lock
 * held (possibly from a remote CPU, see distribute_cfs_runtime()).
 */
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	struct sched_entity *se;
	int enqueue = 1;
	long task_delta;

	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];

	cfs_rq->throttled = 0;
	raw_spin_lock(&cfs_b->lock);
	/* charge the full throttled span to the bandwidth pool's stats */
	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
	list_del_rcu(&cfs_rq->throttled_list);
	raw_spin_unlock(&cfs_b->lock);

	update_rq_clock(rq);
	/* update hierarchical throttle state */
	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);

	/* nothing runnable under us; leave the hierarchy dequeued */
	if (!cfs_rq->load.weight)
		return;

	task_delta = cfs_rq->h_nr_running;
	for_each_sched_entity(se) {
		if (se->on_rq)
			enqueue = 0;

		cfs_rq = cfs_rq_of(se);
		if (enqueue)
			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
		cfs_rq->h_nr_running += task_delta;

		/* an ancestor may itself still be throttled */
		if (cfs_rq_throttled(cfs_rq))
			break;
	}

	/* se == NULL: the whole hierarchy was enqueued; fix up nr_running */
	if (!se)
		rq->nr_running += task_delta;

	/* determine whether we need to wake up potentially idle cpu */
	if (rq->curr == rq->idle && rq->cfs.nr_running)
		resched_task(rq->curr);
}
2317
/*
 * Hand out up to @remaining runtime (with deadline @expires) to throttled
 * cfs_rqs, unthrottling each one that receives enough to go positive.
 * Called WITHOUT cfs_b->lock held; takes each rq->lock in turn.
 *
 * Returns the amount of runtime left undistributed.
 */
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
		u64 remaining, u64 expires)
{
	struct cfs_rq *cfs_rq;
	u64 runtime = remaining;

	rcu_read_lock();
	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				throttled_list) {
		struct rq *rq = rq_of(cfs_rq);

		raw_spin_lock(&rq->lock);
		if (!cfs_rq_throttled(cfs_rq))
			goto next;

		/* just enough to bring runtime_remaining to +1 */
		runtime = -cfs_rq->runtime_remaining + 1;
		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;

		cfs_rq->runtime_remaining += runtime;
		cfs_rq->runtime_expires = expires;

		/* we check whether we're throttled above */
		if (cfs_rq->runtime_remaining > 0)
			unthrottle_cfs_rq(cfs_rq);

next:
		raw_spin_unlock(&rq->lock);

		if (!remaining)
			break;
	}
	rcu_read_unlock();

	return remaining;
}
2355
2356/*
2357 * Responsible for refilling a task_group's bandwidth and unthrottling its
2358 * cfs_rqs as appropriate. If there has been no activity within the last
2359 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2360 * used to track this state.
2361 */
2362static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2363{
2364 u64 runtime, runtime_expires;
2365 int idle = 1, throttled;
2366
2367 raw_spin_lock(&cfs_b->lock);
2368 /* no need to continue the timer with no bandwidth constraint */
2369 if (cfs_b->quota == RUNTIME_INF)
2370 goto out_unlock;
2371
2372 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2373 /* idle depends on !throttled (for the case of a large deficit) */
2374 idle = cfs_b->idle && !throttled;
2375 cfs_b->nr_periods += overrun;
2376
2377 /* if we're going inactive then everything else can be deferred */
2378 if (idle)
2379 goto out_unlock;
2380
2381 __refill_cfs_bandwidth_runtime(cfs_b);
2382
2383 if (!throttled) {
2384 /* mark as potentially idle for the upcoming period */
2385 cfs_b->idle = 1;
2386 goto out_unlock;
2387 }
2388
2389 /* account preceding periods in which throttling occurred */
2390 cfs_b->nr_throttled += overrun;
2391
2392 /*
2393 * There are throttled entities so we must first use the new bandwidth
2394 * to unthrottle them before making it generally available. This
2395 * ensures that all existing debts will be paid before a new cfs_rq is
2396 * allowed to run.
2397 */
2398 runtime = cfs_b->runtime;
2399 runtime_expires = cfs_b->runtime_expires;
2400 cfs_b->runtime = 0;
2401
2402 /*
2403 * This check is repeated as we are holding onto the new bandwidth
2404 * while we unthrottle. This can potentially race with an unthrottled
2405 * group trying to acquire new bandwidth from the global pool.
2406 */
2407 while (throttled && runtime > 0) {
2408 raw_spin_unlock(&cfs_b->lock);
2409 /* we can't nest cfs_b->lock while distributing bandwidth */
2410 runtime = distribute_cfs_runtime(cfs_b, runtime,
2411 runtime_expires);
2412 raw_spin_lock(&cfs_b->lock);
2413
2414 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2415 }
2416
2417 /* return (any) remaining runtime */
2418 cfs_b->runtime = runtime;
2419 /*
2420 * While we are ensured activity in the period following an
2421 * unthrottle, this also covers the case in which the new bandwidth is
2422 * insufficient to cover the existing bandwidth deficit. (Forcing the
2423 * timer to remain active while there are any throttled entities.)
2424 */
2425 cfs_b->idle = 0;
2426out_unlock:
2427 if (idle)
2428 cfs_b->timer_active = 0;
2429 raw_spin_unlock(&cfs_b->lock);
2430
2431 return idle;
2432}
2433
/* a cfs_rq won't donate quota below this amount */
static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
/* minimum remaining period time to redistribute slack quota */
static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2440
2441/* are we near the end of the current quota period? */
2442static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2443{
2444 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2445 u64 remaining;
2446
2447 /* if the call-back is running a quota refresh is already occurring */
2448 if (hrtimer_callback_running(refresh_timer))
2449 return 1;
2450
2451 /* is a quota refresh about to occur? */
2452 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2453 if (remaining < min_expire)
2454 return 1;
2455
2456 return 0;
2457}
2458
2459static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2460{
2461 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2462
2463 /* if there's a quota refresh soon don't bother with slack */
2464 if (runtime_refresh_within(cfs_b, min_left))
2465 return;
2466
2467 start_bandwidth_timer(&cfs_b->slack_timer,
2468 ns_to_ktime(cfs_bandwidth_slack_period));
2469}
2470
/* we know any runtime found here is valid as update_curr() precedes return */
static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
	/* keep min_cfs_rq_runtime for ourselves, donate only the excess */
	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;

	if (slack_runtime <= 0)
		return;

	raw_spin_lock(&cfs_b->lock);
	/* only return runtime that is still valid for the current period */
	if (cfs_b->quota != RUNTIME_INF &&
	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
		cfs_b->runtime += slack_runtime;

		/* we are under rq->lock, defer unthrottling using a timer */
		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
		    !list_empty(&cfs_b->throttled_cfs_rq))
			start_cfs_slack_bandwidth(cfs_b);
	}
	raw_spin_unlock(&cfs_b->lock);

	/* even if it's not valid for return we don't want to try again */
	cfs_rq->runtime_remaining -= slack_runtime;
}
2495
2496static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2497{
2498 if (!cfs_bandwidth_used())
2499 return;
2500
2501 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2502 return;
2503
2504 __return_cfs_rq_runtime(cfs_rq);
2505}
2506
2507/*
2508 * This is done with a timer (instead of inline with bandwidth return) since
2509 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2510 */
2511static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2512{
2513 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2514 u64 expires;
2515
2516 /* confirm we're still not at a refresh boundary */
2517 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2518 return;
2519
2520 raw_spin_lock(&cfs_b->lock);
2521 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2522 runtime = cfs_b->runtime;
2523 cfs_b->runtime = 0;
2524 }
2525 expires = cfs_b->runtime_expires;
2526 raw_spin_unlock(&cfs_b->lock);
2527
2528 if (!runtime)
2529 return;
2530
2531 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2532
2533 raw_spin_lock(&cfs_b->lock);
2534 if (expires == cfs_b->runtime_expires)
2535 cfs_b->runtime = runtime;
2536 raw_spin_unlock(&cfs_b->lock);
2537}
2538
2539/*
2540 * When a group wakes up we want to make sure that its quota is not already
2541 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2542 * runtime as update_curr() throttling can not not trigger until it's on-rq.
2543 */
2544static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2545{
2546 if (!cfs_bandwidth_used())
2547 return;
2548
2549 /* an active group must be handled by the update_curr()->put() path */
2550 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2551 return;
2552
2553 /* ensure the group is not already throttled */
2554 if (cfs_rq_throttled(cfs_rq))
2555 return;
2556
2557 /* update runtime allocation */
2558 account_cfs_rq_runtime(cfs_rq, 0);
2559 if (cfs_rq->runtime_remaining <= 0)
2560 throttle_cfs_rq(cfs_rq);
2561}
2562
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
	if (!cfs_bandwidth_used())
		return;

	/* nothing to do while local runtime remains (or no constraint) */
	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
		return;

	/*
	 * it's possible for a throttled entity to be forced into a running
	 * state (e.g. set_curr_task), in this case we're finished.
	 */
	if (cfs_rq_throttled(cfs_rq))
		return;

	throttle_cfs_rq(cfs_rq);
}
2581
/* forward declarations for the hrtimer handlers below */
static inline u64 default_cfs_period(void);
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);

/* hrtimer callback for the slack timer; one-shot, never restarted here */
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, slack_timer);
	do_sched_cfs_slack_timer(cfs_b);

	return HRTIMER_NORESTART;
}
2594
/*
 * hrtimer callback for the period timer: forward the timer over any missed
 * periods and refresh bandwidth for each overrun.  Restarts itself unless
 * the bandwidth pool went idle.
 */
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
	struct cfs_bandwidth *cfs_b =
		container_of(timer, struct cfs_bandwidth, period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		/* advance expiry; overrun is the number of periods skipped */
		overrun = hrtimer_forward(timer, now, cfs_b->period);

		if (!overrun)
			break;

		idle = do_sched_cfs_period_timer(cfs_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
2615
2616void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2617{
2618 raw_spin_lock_init(&cfs_b->lock);
2619 cfs_b->runtime = 0;
2620 cfs_b->quota = RUNTIME_INF;
2621 cfs_b->period = ns_to_ktime(default_cfs_period());
2622
2623 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2624 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2625 cfs_b->period_timer.function = sched_cfs_period_timer;
2626 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2627 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2628}
2629
2630static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2631{
2632 cfs_rq->runtime_enabled = 0;
2633 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2634}
2635
/* requires cfs_b->lock, may release to reprogram timer */
void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	/*
	 * The timer may be active because we're trying to set a new bandwidth
	 * period or because we're racing with the tear-down path
	 * (timer_active==0 becomes visible before the hrtimer call-back
	 * terminates). In either case we ensure that it's re-programmed
	 */
	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
		/* drop the lock so hrtimer_cancel() cannot deadlock against
		 * a callback that itself takes cfs_b->lock */
		raw_spin_unlock(&cfs_b->lock);
		/* ensure cfs_b->lock is available while we wait */
		hrtimer_cancel(&cfs_b->period_timer);

		raw_spin_lock(&cfs_b->lock);
		/* if someone else restarted the timer then we're done */
		if (cfs_b->timer_active)
			return;
	}

	cfs_b->timer_active = 1;
	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
}
2659
/* tear-down: synchronously cancel both bandwidth hrtimers */
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
	hrtimer_cancel(&cfs_b->period_timer);
	hrtimer_cancel(&cfs_b->slack_timer);
}
2665
/*
 * On CPU offline, release every throttled cfs_rq on this rq so no tasks
 * remain stranded on a dead CPU waiting for a quota refresh.
 */
static void unthrottle_offline_cfs_rqs(struct rq *rq)
{
	struct cfs_rq *cfs_rq;

	for_each_leaf_cfs_rq(rq, cfs_rq) {
		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);

		if (!cfs_rq->runtime_enabled)
			continue;

		/*
		 * clock_task is not advancing so we just need to make sure
		 * there's some valid quota amount
		 */
		cfs_rq->runtime_remaining = cfs_b->quota;
		if (cfs_rq_throttled(cfs_rq))
			unthrottle_cfs_rq(cfs_rq);
	}
}
2685
#else /* CONFIG_CFS_BANDWIDTH */
/*
 * !CONFIG_CFS_BANDWIDTH: no-op/identity stubs so callers need no
 * conditional compilation of their own.
 */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
	/* without throttling, task clock is just the rq's task clock */
	return rq_of(cfs_rq)->clock_task;
}

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				   unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

/* nothing can ever be throttled */
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return 0;
}

static inline int throttled_lb_pair(struct task_group *tg,
				    int src_cpu, int dest_cpu)
{
	return 0;
}

void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif

static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}

#endif /* CONFIG_CFS_BANDWIDTH */
2728
2729/**************************************************
2730 * CFS operations on tasks:
2731 */
2732
#ifdef CONFIG_SCHED_HRTICK
/*
 * Program the high-resolution preemption tick for @p: fire when p's current
 * slice is exhausted, or resched immediately if it already is.
 */
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	WARN_ON(task_rq(p) != rq);

	/* with a single task there's nothing to preempt for */
	if (cfs_rq->nr_running > 1) {
		u64 slice = sched_slice(cfs_rq, se);
		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
		s64 delta = slice - ran;

		/* slice already exhausted: preempt now instead of arming */
		if (delta < 0) {
			if (rq->curr == p)
				resched_task(p);
			return;
		}

		/*
		 * Don't schedule slices shorter than 10000ns, that just
		 * doesn't make sense. Rely on vruntime for fairness.
		 */
		if (rq->curr != p)
			delta = max_t(s64, 10000LL, delta);

		hrtick_start(rq, delta);
	}
}

/*
 * called from enqueue/dequeue and updates the hrtick when the
 * current task is from our class and nr_running is low enough
 * to matter.
 */
static void hrtick_update(struct rq *rq)
{
	struct task_struct *curr = rq->curr;

	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
		return;

	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
		hrtick_start_fair(rq, curr);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
#endif
2788
2789/*
2790 * The enqueue_task method is called before nr_running is
2791 * increased. Here we update the fair scheduling stats and
2792 * then put the task into the rbtree:
2793 */
2794static void
2795enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2796{
2797 struct cfs_rq *cfs_rq;
2798 struct sched_entity *se = &p->se;
2799
2800 for_each_sched_entity(se) {
2801 if (se->on_rq)
2802 break;
2803 cfs_rq = cfs_rq_of(se);
2804 enqueue_entity(cfs_rq, se, flags);
2805
2806 /*
2807 * end evaluation on encountering a throttled cfs_rq
2808 *
2809 * note: in the case of encountering a throttled cfs_rq we will
2810 * post the final h_nr_running increment below.
2811 */
2812 if (cfs_rq_throttled(cfs_rq))
2813 break;
2814 cfs_rq->h_nr_running++;
2815
2816 flags = ENQUEUE_WAKEUP;
2817 }
2818
2819 for_each_sched_entity(se) {
2820 cfs_rq = cfs_rq_of(se);
2821 cfs_rq->h_nr_running++;
2822
2823 if (cfs_rq_throttled(cfs_rq))
2824 break;
2825
2826 update_cfs_shares(cfs_rq);
2827 update_entity_load_avg(se, 1);
2828 }
2829
2830 if (!se) {
2831 update_rq_runnable_avg(rq, rq->nr_running);
2832 inc_nr_running(rq);
2833 }
2834 hrtick_update(rq);
2835}
2836
static void set_next_buddy(struct sched_entity *se);

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int task_sleep = flags & DEQUEUE_SLEEP;

	/* first pass: dequeue levels that become empty */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && parent_entity(se))
				set_next_buddy(parent_entity(se));

			/* avoid re-evaluating load for this entity */
			se = parent_entity(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	/* second pass: fix counts/shares on the still-queued ancestors */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_shares(cfs_rq);
		update_entity_load_avg(se, 1);
	}

	/* se == NULL: the full hierarchy was walked without throttling */
	if (!se) {
		dec_nr_running(rq);
		update_rq_runnable_avg(rq, 1);
	}
	hrtick_update(rq);
}
2897
2898#ifdef CONFIG_SMP
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->load.weight;
}
2904
2905/*
2906 * Return a low guess at the load of a migration-source cpu weighted
2907 * according to the scheduling class and "nice" value.
2908 *
2909 * We want to under-estimate the load of migration sources, to
2910 * balance conservatively.
2911 */
2912static unsigned long source_load(int cpu, int type)
2913{
2914 struct rq *rq = cpu_rq(cpu);
2915 unsigned long total = weighted_cpuload(cpu);
2916
2917 if (type == 0 || !sched_feat(LB_BIAS))
2918 return total;
2919
2920 return min(rq->cpu_load[type-1], total);
2921}
2922
2923/*
2924 * Return a high guess at the load of a migration-target cpu weighted
2925 * according to the scheduling class and "nice" value.
2926 */
2927static unsigned long target_load(int cpu, int type)
2928{
2929 struct rq *rq = cpu_rq(cpu);
2930 unsigned long total = weighted_cpuload(cpu);
2931
2932 if (type == 0 || !sched_feat(LB_BIAS))
2933 return total;
2934
2935 return max(rq->cpu_load[type-1], total);
2936}
2937
/* compute capacity ("cpu_power") of @cpu as tracked on its runqueue */
static unsigned long power_of(int cpu)
{
	return cpu_rq(cpu)->cpu_power;
}
2942
2943static unsigned long cpu_avg_load_per_task(int cpu)
2944{
2945 struct rq *rq = cpu_rq(cpu);
2946 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2947
2948 if (nr_running)
2949 return rq->load.weight / nr_running;
2950
2951 return 0;
2952}
2953
2954
/*
 * Normalise a waking task's vruntime against its old cfs_rq's min_vruntime
 * (it will be re-normalised against the destination cfs_rq on enqueue).
 */
static void task_waking_fair(struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	u64 min_vruntime;

#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;

	/*
	 * 32-bit: a 64-bit read can tear, so retry until the copy written
	 * by the updater matches the primary value (pairs with the writer's
	 * barrier via smp_rmb()).
	 */
	do {
		min_vruntime_copy = cfs_rq->min_vruntime_copy;
		smp_rmb();
		min_vruntime = cfs_rq->min_vruntime;
	} while (min_vruntime != min_vruntime_copy);
#else
	min_vruntime = cfs_rq->min_vruntime;
#endif

	se->vruntime -= min_vruntime;
}
2975
2976#ifdef CONFIG_FAIR_GROUP_SCHED
2977/*
2978 * effective_load() calculates the load change as seen from the root_task_group
2979 *
2980 * Adding load to a group doesn't make a group heavier, but can cause movement
2981 * of group shares between cpus. Assuming the shares were perfectly aligned one
2982 * can calculate the shift in shares.
2983 *
2984 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2985 * on this @cpu and results in a total addition (subtraction) of @wg to the
2986 * total group weight.
2987 *
2988 * Given a runqueue weight distribution (rw_i) we can compute a shares
2989 * distribution (s_i) using:
2990 *
2991 * s_i = rw_i / \Sum rw_j (1)
2992 *
2993 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2994 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2995 * shares distribution (s_i):
2996 *
2997 * rw_i = { 2, 4, 1, 0 }
2998 * s_i = { 2/7, 4/7, 1/7, 0 }
2999 *
3000 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3001 * task used to run on and the CPU the waker is running on), we need to
3002 * compute the effect of waking a task on either CPU and, in case of a sync
3003 * wakeup, compute the effect of the current task going to sleep.
3004 *
3005 * So for a change of @wl to the local @cpu with an overall group weight change
3006 * of @wl we can compute the new shares distribution (s'_i) using:
3007 *
3008 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3009 *
3010 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3011 * differences in waking a task to CPU 0. The additional task changes the
3012 * weight and shares distributions like:
3013 *
3014 * rw'_i = { 3, 4, 1, 0 }
3015 * s'_i = { 3/8, 4/8, 1/8, 0 }
3016 *
3017 * We can then compute the difference in effective weight by using:
3018 *
3019 * dw_i = S * (s'_i - s_i) (3)
3020 *
3021 * Where 'S' is the group weight as seen by its parent.
3022 *
3023 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3024 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3025 * 4/7) times the weight of the group.
3026 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];

	if (!tg->parent)	/* the trivial, non-cgroup case */
		return wl;

	/* propagate the share shift level by level up to the root */
	for_each_sched_entity(se) {
		long w, W;

		tg = se->my_q->tg;

		/*
		 * W = @wg + \Sum rw_j
		 */
		W = wg + calc_tg_weight(tg, se->my_q);

		/*
		 * w = rw_i + @wl
		 */
		w = se->my_q->load.weight + wl;

		/*
		 * wl = S * s'_i; see (2)
		 */
		if (W > 0 && w < W)
			wl = (w * tg->shares) / W;
		else
			wl = tg->shares;

		/*
		 * Per the above, wl is the new se->load.weight value; since
		 * those are clipped to [MIN_SHARES, ...) do so now. See
		 * calc_cfs_shares().
		 */
		if (wl < MIN_SHARES)
			wl = MIN_SHARES;

		/*
		 * wl = dw_i = S * (s'_i - s_i); see (3)
		 */
		wl -= se->load.weight;

		/*
		 * Recursively apply this logic to all parent groups to compute
		 * the final effective load change on the root group. Since
		 * only the @tg group gets extra weight, all parent groups can
		 * only redistribute existing shares. @wl is the shift in shares
		 * resulting from this level per the above.
		 */
		wg = 0;
	}

	return wl;
}
#else

/* !CONFIG_FAIR_GROUP_SCHED: no group hierarchy, load change is just @wl */
static inline unsigned long effective_load(struct task_group *tg, int cpu,
		unsigned long wl, unsigned long wg)
{
	return wl;
}

#endif
3091
/*
 * Decide whether waking @p on the current CPU (instead of its previous CPU)
 * keeps the domain balanced.  Returns 1 when an affine wakeup is acceptable,
 * 0 otherwise.
 */
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
	s64 this_load, load;
	int idx, this_cpu, prev_cpu;
	unsigned long tl_per_task;
	struct task_group *tg;
	unsigned long weight;
	int balanced;

	idx	  = sd->wake_idx;
	this_cpu  = smp_processor_id();
	prev_cpu  = task_cpu(p);
	load	  = source_load(prev_cpu, idx);
	this_load = target_load(this_cpu, idx);

	/*
	 * If sync wakeup then subtract the (maximum possible)
	 * effect of the currently running task from the load
	 * of the current CPU:
	 */
	if (sync) {
		tg = task_group(current);
		weight = current->se.load.weight;

		this_load += effective_load(tg, this_cpu, -weight, -weight);
		load += effective_load(tg, prev_cpu, 0, -weight);
	}

	tg = task_group(p);
	weight = p->se.load.weight;

	/*
	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
	 * due to the sync cause above having dropped this_load to 0, we'll
	 * always have an imbalance, but there's really nothing you can do
	 * about that, so that's good too.
	 *
	 * Otherwise check if either cpus are near enough in load to allow this
	 * task to be woken on this_cpu.
	 */
	if (this_load > 0) {
		s64 this_eff_load, prev_eff_load;

		/* scale each side by the other CPU's power plus imbalance_pct */
		this_eff_load = 100;
		this_eff_load *= power_of(prev_cpu);
		this_eff_load *= this_load +
			effective_load(tg, this_cpu, weight, weight);

		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
		prev_eff_load *= power_of(this_cpu);
		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);

		balanced = this_eff_load <= prev_eff_load;
	} else
		balanced = true;

	/*
	 * If the currently running task will sleep within
	 * a reasonable amount of time then attract this newly
	 * woken task:
	 */
	if (sync && balanced)
		return 1;

	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
	tl_per_task = cpu_avg_load_per_task(this_cpu);

	if (balanced ||
	    (this_load <= load &&
	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
		/*
		 * This domain has SD_WAKE_AFFINE and
		 * p is cache cold in this domain, and
		 * there is no bad imbalance.
		 */
		schedstat_inc(sd, ttwu_move_affine);
		schedstat_inc(p, se.statistics.nr_wakeups_affine);

		return 1;
	}
	return 0;
}
3174
3175/*
3176 * find_idlest_group finds and returns the least busy CPU group within the
3177 * domain.
3178 */
3179static struct sched_group *
3180find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3181 int this_cpu, int load_idx)
3182{
3183 struct sched_group *idlest = NULL, *group = sd->groups;
3184 unsigned long min_load = ULONG_MAX, this_load = 0;
3185 int imbalance = 100 + (sd->imbalance_pct-100)/2;
3186
3187 do {
3188 unsigned long load, avg_load;
3189 int local_group;
3190 int i;
3191
3192 /* Skip over this group if it has no CPUs allowed */
3193 if (!cpumask_intersects(sched_group_cpus(group),
3194 tsk_cpus_allowed(p)))
3195 continue;
3196
3197 local_group = cpumask_test_cpu(this_cpu,
3198 sched_group_cpus(group));
3199
3200 /* Tally up the load of all CPUs in the group */
3201 avg_load = 0;
3202
3203 for_each_cpu(i, sched_group_cpus(group)) {
3204 /* Bias balancing toward cpus of our domain */
3205 if (local_group)
3206 load = source_load(i, load_idx);
3207 else
3208 load = target_load(i, load_idx);
3209
3210 avg_load += load;
3211 }
3212
3213 /* Adjust by relative CPU power of the group */
3214 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3215
3216 if (local_group) {
3217 this_load = avg_load;
3218 } else if (avg_load < min_load) {
3219 min_load = avg_load;
3220 idlest = group;
3221 }
3222 } while (group = group->next, group != sd->groups);
3223
3224 if (!idlest || 100*this_load < imbalance*min_load)
3225 return NULL;
3226 return idlest;
3227}
3228
3229/*
3230 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3231 */
3232static int
3233find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3234{
3235 unsigned long load, min_load = ULONG_MAX;
3236 int idlest = -1;
3237 int i;
3238
3239 /* Traverse only the allowed CPUs */
3240 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3241 load = weighted_cpuload(i);
3242
3243 if (load < min_load || (load == min_load && i == this_cpu)) {
3244 min_load = load;
3245 idlest = i;
3246 }
3247 }
3248
3249 return idlest;
3250}
3251
3252/*
3253 * Try and locate an idle CPU in the sched_domain.
3254 */
3255static int select_idle_sibling(struct task_struct *p, int target)
3256{
3257 int cpu = smp_processor_id();
3258 int prev_cpu = task_cpu(p);
3259 struct sched_domain *sd;
3260 struct sched_group *sg;
3261 int i;
3262
3263 /*
3264 * If the task is going to be woken-up on this cpu and if it is
3265 * already idle, then it is the right target.
3266 */
3267 if (target == cpu && idle_cpu(cpu))
3268 return cpu;
3269
3270 /*
3271 * If the task is going to be woken-up on the cpu where it previously
3272 * ran and if it is currently idle, then it the right target.
3273 */
3274 if (target == prev_cpu && idle_cpu(prev_cpu))
3275 return prev_cpu;
3276
3277 /*
3278 * Otherwise, iterate the domains and find an elegible idle cpu.
3279 */
3280 sd = rcu_dereference(per_cpu(sd_llc, target));
3281 for_each_lower_domain(sd) {
3282 sg = sd->groups;
3283 do {
3284 if (!cpumask_intersects(sched_group_cpus(sg),
3285 tsk_cpus_allowed(p)))
3286 goto next;
3287
3288 for_each_cpu(i, sched_group_cpus(sg)) {
3289 if (!idle_cpu(i))
3290 goto next;
3291 }
3292
3293 target = cpumask_first_and(sched_group_cpus(sg),
3294 tsk_cpus_allowed(p));
3295 goto done;
3296next:
3297 sg = sg->next;
3298 } while (sg != sd->groups);
3299 }
3300done:
3301 return target;
3302}
3303
3304/*
3305 * sched_balance_self: balance the current task (running on cpu) in domains
3306 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
3307 * SD_BALANCE_EXEC.
3308 *
3309 * Balance, ie. select the least loaded group.
3310 *
3311 * Returns the target CPU number, or the same CPU if no balancing is needed.
3312 *
3313 * preempt must be disabled.
3314 */
3315static int
3316select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3317{
3318 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3319 int cpu = smp_processor_id();
3320 int prev_cpu = task_cpu(p);
3321 int new_cpu = cpu;
3322 int want_affine = 0;
3323 int sync = wake_flags & WF_SYNC;
3324
3325 if (p->nr_cpus_allowed == 1)
3326 return prev_cpu;
3327
3328 if (sd_flag & SD_BALANCE_WAKE) {
3329 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3330 want_affine = 1;
3331 new_cpu = prev_cpu;
3332 }
3333
3334 rcu_read_lock();
3335 for_each_domain(cpu, tmp) {
3336 if (!(tmp->flags & SD_LOAD_BALANCE))
3337 continue;
3338
3339 /*
3340 * If both cpu and prev_cpu are part of this domain,
3341 * cpu is a valid SD_WAKE_AFFINE target.
3342 */
3343 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3344 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3345 affine_sd = tmp;
3346 break;
3347 }
3348
3349 if (tmp->flags & sd_flag)
3350 sd = tmp;
3351 }
3352
3353 if (affine_sd) {
3354 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3355 prev_cpu = cpu;
3356
3357 new_cpu = select_idle_sibling(p, prev_cpu);
3358 goto unlock;
3359 }
3360
3361 while (sd) {
3362 int load_idx = sd->forkexec_idx;
3363 struct sched_group *group;
3364 int weight;
3365
3366 if (!(sd->flags & sd_flag)) {
3367 sd = sd->child;
3368 continue;
3369 }
3370
3371 if (sd_flag & SD_BALANCE_WAKE)
3372 load_idx = sd->wake_idx;
3373
3374 group = find_idlest_group(sd, p, cpu, load_idx);
3375 if (!group) {
3376 sd = sd->child;
3377 continue;
3378 }
3379
3380 new_cpu = find_idlest_cpu(group, p, cpu);
3381 if (new_cpu == -1 || new_cpu == cpu) {
3382 /* Now try balancing at a lower domain level of cpu */
3383 sd = sd->child;
3384 continue;
3385 }
3386
3387 /* Now try balancing at a lower domain level of new_cpu */
3388 cpu = new_cpu;
3389 weight = sd->span_weight;
3390 sd = NULL;
3391 for_each_domain(cpu, tmp) {
3392 if (weight <= tmp->span_weight)
3393 break;
3394 if (tmp->flags & sd_flag)
3395 sd = tmp;
3396 }
3397 /* while loop will break here if sd == NULL */
3398 }
3399unlock:
3400 rcu_read_unlock();
3401
3402 return new_cpu;
3403}
3404
3405/*
3406 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3407 * removed when useful for applications beyond shares distribution (e.g.
3408 * load-balance).
3409 */
3410#ifdef CONFIG_FAIR_GROUP_SCHED
3411/*
3412 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3413 * cfs_rq_of(p) references at time of call are still valid and identify the
3414 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3415 * other assumptions, including the state of rq->lock, should be made.
3416 */
3417static void
3418migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3419{
3420 struct sched_entity *se = &p->se;
3421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3422
3423 /*
3424 * Load tracking: accumulate removed load so that it can be processed
3425 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3426 * to blocked load iff they have a positive decay-count. It can never
3427 * be negative here since on-rq tasks have decay-count == 0.
3428 */
3429 if (se->avg.decay_count) {
3430 se->avg.decay_count = -__synchronize_entity_decay(se);
3431 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3432 }
3433}
3434#endif
3435#endif /* CONFIG_SMP */
3436
3437static unsigned long
3438wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3439{
3440 unsigned long gran = sysctl_sched_wakeup_granularity;
3441
3442 /*
3443 * Since its curr running now, convert the gran from real-time
3444 * to virtual-time in his units.
3445 *
3446 * By using 'se' instead of 'curr' we penalize light tasks, so
3447 * they get preempted easier. That is, if 'se' < 'curr' then
3448 * the resulting gran will be larger, therefore penalizing the
3449 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3450 * be smaller, again penalizing the lighter task.
3451 *
3452 * This is especially important for buddies when the leftmost
3453 * task is higher priority than the buddy.
3454 */
3455 return calc_delta_fair(gran, se);
3456}
3457
3458/*
3459 * Should 'se' preempt 'curr'.
3460 *
3461 * |s1
3462 * |s2
3463 * |s3
3464 * g
3465 * |<--->|c
3466 *
3467 * w(c, s1) = -1
3468 * w(c, s2) = 0
3469 * w(c, s3) = 1
3470 *
3471 */
3472static int
3473wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3474{
3475 s64 gran, vdiff = curr->vruntime - se->vruntime;
3476
3477 if (vdiff <= 0)
3478 return -1;
3479
3480 gran = wakeup_gran(curr, se);
3481 if (vdiff > gran)
3482 return 1;
3483
3484 return 0;
3485}
3486
3487static void set_last_buddy(struct sched_entity *se)
3488{
3489 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3490 return;
3491
3492 for_each_sched_entity(se)
3493 cfs_rq_of(se)->last = se;
3494}
3495
3496static void set_next_buddy(struct sched_entity *se)
3497{
3498 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3499 return;
3500
3501 for_each_sched_entity(se)
3502 cfs_rq_of(se)->next = se;
3503}
3504
/*
 * Mark 'se' (through its whole hierarchy) as the entity to avoid on the
 * next pick; used by yield_task_fair().  The ->skip buddy is honoured in
 * pick_next_entity().
 */
static void set_skip_buddy(struct sched_entity *se)
{
	for_each_sched_entity(se)
		cfs_rq_of(se)->skip = se;
}
3510
3511/*
3512 * Preempt the current task with a newly woken task if needed:
3513 */
3514static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3515{
3516 struct task_struct *curr = rq->curr;
3517 struct sched_entity *se = &curr->se, *pse = &p->se;
3518 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3519 int scale = cfs_rq->nr_running >= sched_nr_latency;
3520 int next_buddy_marked = 0;
3521
3522 if (unlikely(se == pse))
3523 return;
3524
3525 /*
3526 * This is possible from callers such as move_task(), in which we
3527 * unconditionally check_prempt_curr() after an enqueue (which may have
3528 * lead to a throttle). This both saves work and prevents false
3529 * next-buddy nomination below.
3530 */
3531 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3532 return;
3533
3534 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3535 set_next_buddy(pse);
3536 next_buddy_marked = 1;
3537 }
3538
3539 /*
3540 * We can come here with TIF_NEED_RESCHED already set from new task
3541 * wake up path.
3542 *
3543 * Note: this also catches the edge-case of curr being in a throttled
3544 * group (e.g. via set_curr_task), since update_curr() (in the
3545 * enqueue of curr) will have resulted in resched being set. This
3546 * prevents us from potentially nominating it as a false LAST_BUDDY
3547 * below.
3548 */
3549 if (test_tsk_need_resched(curr))
3550 return;
3551
3552 /* Idle tasks are by definition preempted by non-idle tasks. */
3553 if (unlikely(curr->policy == SCHED_IDLE) &&
3554 likely(p->policy != SCHED_IDLE))
3555 goto preempt;
3556
3557 /*
3558 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3559 * is driven by the tick):
3560 */
3561 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3562 return;
3563
3564 find_matching_se(&se, &pse);
3565 update_curr(cfs_rq_of(se));
3566 BUG_ON(!pse);
3567 if (wakeup_preempt_entity(se, pse) == 1) {
3568 /*
3569 * Bias pick_next to pick the sched entity that is
3570 * triggering this preemption.
3571 */
3572 if (!next_buddy_marked)
3573 set_next_buddy(pse);
3574 goto preempt;
3575 }
3576
3577 return;
3578
3579preempt:
3580 resched_task(curr);
3581 /*
3582 * Only set the backward buddy when the current task is still
3583 * on the rq. This can happen when a wakeup gets interleaved
3584 * with schedule on the ->pre_schedule() or idle_balance()
3585 * point, either of which can * drop the rq lock.
3586 *
3587 * Also, during early boot the idle thread is in the fair class,
3588 * for obvious reasons its a bad idea to schedule back to it.
3589 */
3590 if (unlikely(!se->on_rq || curr == rq->idle))
3591 return;
3592
3593 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3594 set_last_buddy(se);
3595}
3596
/*
 * Pick the next fair task to run on @rq, or NULL if no fair task is queued.
 */
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	/* No CFS tasks on this runqueue */
	if (!cfs_rq->nr_running)
		return NULL;

	/*
	 * Descend the group hierarchy: pick the best entity at each level,
	 * make it current there, and recurse into its group runqueue until
	 * we reach a real task (group_cfs_rq() returns NULL for tasks).
	 */
	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;
}
3618
3619/*
3620 * Account for a descheduled task:
3621 */
3622static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3623{
3624 struct sched_entity *se = &prev->se;
3625 struct cfs_rq *cfs_rq;
3626
3627 for_each_sched_entity(se) {
3628 cfs_rq = cfs_rq_of(se);
3629 put_prev_entity(cfs_rq, se);
3630 }
3631}
3632
3633/*
3634 * sched_yield() is very simple
3635 *
3636 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3637 */
3638static void yield_task_fair(struct rq *rq)
3639{
3640 struct task_struct *curr = rq->curr;
3641 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3642 struct sched_entity *se = &curr->se;
3643
3644 /*
3645 * Are we the only task in the tree?
3646 */
3647 if (unlikely(rq->nr_running == 1))
3648 return;
3649
3650 clear_buddies(cfs_rq, se);
3651
3652 if (curr->policy != SCHED_BATCH) {
3653 update_rq_clock(rq);
3654 /*
3655 * Update run-time statistics of the 'current'.
3656 */
3657 update_curr(cfs_rq);
3658 /*
3659 * Tell update_rq_clock() that we've just updated,
3660 * so we don't do microscopic update in schedule()
3661 * and double the fastpath cost.
3662 */
3663 rq->skip_clock_update = 1;
3664 }
3665
3666 set_skip_buddy(se);
3667}
3668
3669static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3670{
3671 struct sched_entity *se = &p->se;
3672
3673 /* throttled hierarchies are not runnable */
3674 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3675 return false;
3676
3677 /* Tell the scheduler that we'd really like pse to run next. */
3678 set_next_buddy(se);
3679
3680 yield_task_fair(rq);
3681
3682 return true;
3683}
3684
3685#ifdef CONFIG_SMP
3686/**************************************************
3687 * Fair scheduling class load-balancing methods.
3688 *
3689 * BASICS
3690 *
3691 * The purpose of load-balancing is to achieve the same basic fairness the
3692 * per-cpu scheduler provides, namely provide a proportional amount of compute
3693 * time to each task. This is expressed in the following equation:
3694 *
3695 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3696 *
3697 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3698 * W_i,0 is defined as:
3699 *
3700 * W_i,0 = \Sum_j w_i,j (2)
3701 *
3702 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3703 * is derived from the nice value as per prio_to_weight[].
3704 *
3705 * The weight average is an exponential decay average of the instantaneous
3706 * weight:
3707 *
3708 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3709 *
3710 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3711 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3712 * can also include other factors [XXX].
3713 *
3714 * To achieve this balance we define a measure of imbalance which follows
3715 * directly from (1):
3716 *
3717 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3718 *
 * We then move tasks around to minimize the imbalance. In the continuous
3720 * function space it is obvious this converges, in the discrete case we get
3721 * a few fun cases generally called infeasible weight scenarios.
3722 *
3723 * [XXX expand on:
3724 * - infeasible weights;
3725 * - local vs global optima in the discrete case. ]
3726 *
3727 *
3728 * SCHED DOMAINS
3729 *
3730 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3731 * for all i,j solution, we create a tree of cpus that follows the hardware
3732 * topology where each level pairs two lower groups (or better). This results
3733 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3734 * tree to only the first of the previous level and we decrease the frequency
3735 * of load-balance at each level inv. proportional to the number of cpus in
3736 * the groups.
3737 *
3738 * This yields:
3739 *
3740 * log_2 n 1 n
3741 * \Sum { --- * --- * 2^i } = O(n) (5)
3742 * i = 0 2^i 2^i
3743 * `- size of each group
3744 * | | `- number of cpus doing load-balance
3745 * | `- freq
3746 * `- sum over all levels
3747 *
3748 * Coupled with a limit on how many tasks we can migrate every balance pass,
3749 * this makes (5) the runtime complexity of the balancer.
3750 *
3751 * An important property here is that each CPU is still (indirectly) connected
3752 * to every other cpu in at most O(log n) steps:
3753 *
3754 * The adjacency matrix of the resulting graph is given by:
3755 *
3756 * log_2 n
3757 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3758 * k = 0
3759 *
3760 * And you'll find that:
3761 *
3762 * A^(log_2 n)_i,j != 0 for all i,j (7)
3763 *
3764 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3765 * The task movement gives a factor of O(m), giving a convergence complexity
3766 * of:
3767 *
3768 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3769 *
3770 *
3771 * WORK CONSERVING
3772 *
3773 * In order to avoid CPUs going idle while there's still work to do, new idle
3774 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3775 * tree itself instead of relying on other CPUs to bring it work.
3776 *
3777 * This adds some complexity to both (5) and (8) but it reduces the total idle
3778 * time.
3779 *
3780 * [XXX more?]
3781 *
3782 *
3783 * CGROUPS
3784 *
3785 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3786 *
3787 * s_k,i
3788 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3789 * S_k
3790 *
3791 * Where
3792 *
3793 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3794 *
3795 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3796 *
3797 * The big problem is S_k, its a global sum needed to compute a local (W_i)
3798 * property.
3799 *
3800 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3801 * rewrite all of this once again.]
3802 */
3803
3804static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3805
3806#define LBF_ALL_PINNED 0x01
3807#define LBF_NEED_BREAK 0x02
3808#define LBF_SOME_PINNED 0x04
3809
/* Working state for one load-balance attempt */
struct lb_env {
	struct sched_domain *sd;	/* domain being balanced */

	struct rq *src_rq;		/* pull tasks from here ... */
	int src_cpu;

	int dst_cpu;			/* ... onto this CPU */
	struct rq *dst_rq;

	struct cpumask *dst_grpmask;	/* CPUs of dst's sched_group */
	int new_dst_cpu;		/* fallback dst for affinity-pinned tasks */
	enum cpu_idle_type idle;
	long imbalance;			/* weighted load still to be moved */
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask *cpus;

	unsigned int flags;		/* LBF_* bits */

	unsigned int loop;		/* tasks examined so far */
	unsigned int loop_break;	/* examine this many, then take a breather */
	unsigned int loop_max;		/* hard cap on tasks examined */
};
3832
3833/*
3834 * move_task - move a task from one runqueue to another runqueue.
3835 * Both runqueues must be locked.
3836 */
3837static void move_task(struct task_struct *p, struct lb_env *env)
3838{
3839 deactivate_task(env->src_rq, p, 0);
3840 set_task_cpu(p, env->dst_cpu);
3841 activate_task(env->dst_rq, p, 0);
3842 check_preempt_curr(env->dst_rq, p, 0);
3843}
3844
3845/*
3846 * Is this task likely cache-hot:
3847 */
3848static int
3849task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3850{
3851 s64 delta;
3852
3853 if (p->sched_class != &fair_sched_class)
3854 return 0;
3855
3856 if (unlikely(p->policy == SCHED_IDLE))
3857 return 0;
3858
3859 /*
3860 * Buddy candidates are cache hot:
3861 */
3862 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3863 (&p->se == cfs_rq_of(&p->se)->next ||
3864 &p->se == cfs_rq_of(&p->se)->last))
3865 return 1;
3866
3867 if (sysctl_sched_migration_cost == -1)
3868 return 1;
3869 if (sysctl_sched_migration_cost == 0)
3870 return 0;
3871
3872 delta = now - p->se.exec_start;
3873
3874 return delta < (s64)sysctl_sched_migration_cost;
3875}
3876
3877/*
3878 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3879 */
3880static
3881int can_migrate_task(struct task_struct *p, struct lb_env *env)
3882{
3883 int tsk_cache_hot = 0;
3884 /*
3885 * We do not migrate tasks that are:
3886 * 1) running (obviously), or
3887 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3888 * 3) are cache-hot on their current CPU.
3889 */
3890 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3891 int new_dst_cpu;
3892
3893 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3894
3895 /*
3896 * Remember if this task can be migrated to any other cpu in
3897 * our sched_group. We may want to revisit it if we couldn't
3898 * meet load balance goals by pulling other tasks on src_cpu.
3899 *
3900 * Also avoid computing new_dst_cpu if we have already computed
3901 * one in current iteration.
3902 */
3903 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3904 return 0;
3905
3906 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3907 tsk_cpus_allowed(p));
3908 if (new_dst_cpu < nr_cpu_ids) {
3909 env->flags |= LBF_SOME_PINNED;
3910 env->new_dst_cpu = new_dst_cpu;
3911 }
3912 return 0;
3913 }
3914
3915 /* Record that we found atleast one task that could run on dst_cpu */
3916 env->flags &= ~LBF_ALL_PINNED;
3917
3918 if (task_running(env->src_rq, p)) {
3919 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3920 return 0;
3921 }
3922
3923 /*
3924 * Aggressive migration if:
3925 * 1) task is cache cold, or
3926 * 2) too many balance attempts have failed.
3927 */
3928
3929 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3930 if (!tsk_cache_hot ||
3931 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3932#ifdef CONFIG_SCHEDSTATS
3933 if (tsk_cache_hot) {
3934 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3935 schedstat_inc(p, se.statistics.nr_forced_migrations);
3936 }
3937#endif
3938 return 1;
3939 }
3940
3941 if (tsk_cache_hot) {
3942 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3943 return 0;
3944 }
3945 return 1;
3946}
3947
3948/*
3949 * move_one_task tries to move exactly one task from busiest to this_rq, as
3950 * part of active balancing operations within "domain".
3951 * Returns 1 if successful and 0 otherwise.
3952 *
3953 * Called with both runqueues locked.
3954 */
3955static int move_one_task(struct lb_env *env)
3956{
3957 struct task_struct *p, *n;
3958
3959 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3960 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3961 continue;
3962
3963 if (!can_migrate_task(p, env))
3964 continue;
3965
3966 move_task(p, env);
3967 /*
3968 * Right now, this is only the second place move_task()
3969 * is called, so we can safely collect move_task()
3970 * stats here rather than inside move_task().
3971 */
3972 schedstat_inc(env->sd, lb_gained[env->idle]);
3973 return 1;
3974 }
3975 return 0;
3976}
3977
3978static unsigned long task_h_load(struct task_struct *p);
3979
3980static const unsigned int sched_nr_migrate_break = 32;
3981
3982/*
3983 * move_tasks tries to move up to imbalance weighted load from busiest to
3984 * this_rq, as part of a balancing operation within domain "sd".
3985 * Returns 1 if successful and 0 otherwise.
3986 *
3987 * Called with both runqueues locked.
3988 */
3989static int move_tasks(struct lb_env *env)
3990{
3991 struct list_head *tasks = &env->src_rq->cfs_tasks;
3992 struct task_struct *p;
3993 unsigned long load;
3994 int pulled = 0;
3995
3996 if (env->imbalance <= 0)
3997 return 0;
3998
3999 while (!list_empty(tasks)) {
4000 p = list_first_entry(tasks, struct task_struct, se.group_node);
4001
4002 env->loop++;
4003 /* We've more or less seen every task there is, call it quits */
4004 if (env->loop > env->loop_max)
4005 break;
4006
4007 /* take a breather every nr_migrate tasks */
4008 if (env->loop > env->loop_break) {
4009 env->loop_break += sched_nr_migrate_break;
4010 env->flags |= LBF_NEED_BREAK;
4011 break;
4012 }
4013
4014 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4015 goto next;
4016
4017 load = task_h_load(p);
4018
4019 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4020 goto next;
4021
4022 if ((load / 2) > env->imbalance)
4023 goto next;
4024
4025 if (!can_migrate_task(p, env))
4026 goto next;
4027
4028 move_task(p, env);
4029 pulled++;
4030 env->imbalance -= load;
4031
4032#ifdef CONFIG_PREEMPT
4033 /*
4034 * NEWIDLE balancing is a source of latency, so preemptible
4035 * kernels will stop after the first task is pulled to minimize
4036 * the critical section.
4037 */
4038 if (env->idle == CPU_NEWLY_IDLE)
4039 break;
4040#endif
4041
4042 /*
4043 * We only want to steal up to the prescribed amount of
4044 * weighted load.
4045 */
4046 if (env->imbalance <= 0)
4047 break;
4048
4049 continue;
4050next:
4051 list_move_tail(&p->se.group_node, tasks);
4052 }
4053
4054 /*
4055 * Right now, this is one of only two places move_task() is called,
4056 * so we can safely collect move_task() stats here rather than
4057 * inside move_task().
4058 */
4059 schedstat_add(env->sd, lb_gained[env->idle], pulled);
4060
4061 return pulled;
4062}
4063
4064#ifdef CONFIG_FAIR_GROUP_SCHED
4065/*
4066 * update tg->load_weight by folding this cpu's load_avg
4067 */
4068static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4069{
4070 struct sched_entity *se = tg->se[cpu];
4071 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4072
4073 /* throttled entities do not contribute to load */
4074 if (throttled_hierarchy(cfs_rq))
4075 return;
4076
4077 update_cfs_rq_blocked_load(cfs_rq, 1);
4078
4079 if (se) {
4080 update_entity_load_avg(se, 1);
4081 /*
4082 * We pivot on our runnable average having decayed to zero for
4083 * list removal. This generally implies that all our children
4084 * have also been removed (modulo rounding error or bandwidth
4085 * control); however, such cases are rare and we can fix these
4086 * at enqueue.
4087 *
4088 * TODO: fix up out-of-order children on enqueue.
4089 */
4090 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4091 list_del_leaf_cfs_rq(cfs_rq);
4092 } else {
4093 struct rq *rq = rq_of(cfs_rq);
4094 update_rq_runnable_avg(rq, rq->nr_running);
4095 }
4096}
4097
/*
 * Fold the blocked load of every leaf cfs_rq on @cpu into its task group,
 * under rq->lock with interrupts disabled.
 */
static void update_blocked_averages(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct cfs_rq *cfs_rq;
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	update_rq_clock(rq);
	/*
	 * Iterates the task_group tree in a bottom up fashion, see
	 * list_add_leaf_cfs_rq() for details.
	 */
	for_each_leaf_cfs_rq(rq, cfs_rq) {
		/*
		 * Note: We may want to consider periodically releasing
		 * rq->lock about these updates so that creating many task
		 * groups does not result in continually extending hold time.
		 */
		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
	}

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
4121
4122/*
4123 * Compute the cpu's hierarchical load factor for each task group.
4124 * This needs to be done in a top-down fashion because the load of a child
4125 * group is a fraction of its parents load.
4126 */
4127static int tg_load_down(struct task_group *tg, void *data)
4128{
4129 unsigned long load;
4130 long cpu = (long)data;
4131
4132 if (!tg->parent) {
4133 load = cpu_rq(cpu)->load.weight;
4134 } else {
4135 load = tg->parent->cfs_rq[cpu]->h_load;
4136 load *= tg->se[cpu]->load.weight;
4137 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4138 }
4139
4140 tg->cfs_rq[cpu]->h_load = load;
4141
4142 return 0;
4143}
4144
4145static void update_h_load(long cpu)
4146{
4147 struct rq *rq = cpu_rq(cpu);
4148 unsigned long now = jiffies;
4149
4150 if (rq->h_load_throttle == now)
4151 return;
4152
4153 rq->h_load_throttle = now;
4154
4155 rcu_read_lock();
4156 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4157 rcu_read_unlock();
4158}
4159
4160static unsigned long task_h_load(struct task_struct *p)
4161{
4162 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4163 unsigned long load;
4164
4165 load = p->se.load.weight;
4166 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
4167
4168 return load;
4169}
4170#else
/* !CONFIG_FAIR_GROUP_SCHED: no per-group blocked load to fold */
static inline void update_blocked_averages(int cpu)
{
}
4174
/* !CONFIG_FAIR_GROUP_SCHED: no hierarchy, nothing to recompute */
static inline void update_h_load(long cpu)
{
}
4178
/* !CONFIG_FAIR_GROUP_SCHED: hierarchical load is just the task's weight */
static unsigned long task_h_load(struct task_struct *p)
{
	return p->se.load.weight;
}
4183#endif
4184
4185/********** Helpers for find_busiest_group ************************/
4186/*
4187 * sd_lb_stats - Structure to store the statistics of a sched_domain
4188 * during load balancing.
4189 */
4190struct sd_lb_stats {
4191 struct sched_group *busiest; /* Busiest group in this sd */
4192 struct sched_group *this; /* Local group in this sd */
4193 unsigned long total_load; /* Total load of all groups in sd */
4194 unsigned long total_pwr; /* Total power of all groups in sd */
4195 unsigned long avg_load; /* Average load across all groups in sd */
4196
4197 /** Statistics of this group */
4198 unsigned long this_load;
4199 unsigned long this_load_per_task;
4200 unsigned long this_nr_running;
4201 unsigned long this_has_capacity;
4202 unsigned int this_idle_cpus;
4203
4204 /* Statistics of the busiest group */
4205 unsigned int busiest_idle_cpus;
4206 unsigned long max_load;
4207 unsigned long busiest_load_per_task;
4208 unsigned long busiest_nr_running;
4209 unsigned long busiest_group_capacity;
4210 unsigned long busiest_has_capacity;
4211 unsigned int busiest_group_weight;
4212
4213 int group_imb; /* Is there imbalance in this sd */
4214};
4215
4216/*
4217 * sg_lb_stats - stats of a sched_group required for load_balancing
4218 */
4219struct sg_lb_stats {
4220 unsigned long avg_load; /*Avg load across the CPUs of the group */
4221 unsigned long group_load; /* Total load over the CPUs of the group */
4222 unsigned long sum_nr_running; /* Nr tasks running in the group */
4223 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4224 unsigned long group_capacity;
4225 unsigned long idle_cpus;
4226 unsigned long group_weight;
4227 int group_imb; /* Is there an imbalance in the group ? */
4228 int group_has_capacity; /* Is there extra capacity in the group? */
4229};
4230
4231/**
4232 * get_sd_load_idx - Obtain the load index for a given sched domain.
4233 * @sd: The sched_domain whose load_idx is to be obtained.
4234 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4235 */
4236static inline int get_sd_load_idx(struct sched_domain *sd,
4237 enum cpu_idle_type idle)
4238{
4239 int load_idx;
4240
4241 switch (idle) {
4242 case CPU_NOT_IDLE:
4243 load_idx = sd->busy_idx;
4244 break;
4245
4246 case CPU_NEWLY_IDLE:
4247 load_idx = sd->newidle_idx;
4248 break;
4249 default:
4250 load_idx = sd->idle_idx;
4251 break;
4252 }
4253
4254 return load_idx;
4255}
4256
/* Default frequency scaling: every cpu contributes full SCHED_POWER_SCALE */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_POWER_SCALE;
}
4261
/* Weak hook: architectures may override per-cpu frequency power scaling */
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}
4266
4267unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4268{
4269 unsigned long weight = sd->span_weight;
4270 unsigned long smt_gain = sd->smt_gain;
4271
4272 smt_gain /= weight;
4273
4274 return smt_gain;
4275}
4276
/* Weak hook: architectures may override SMT sibling power scaling */
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}
4281
/*
 * Scale factor for @cpu's remaining capacity after rq->rt_avg activity:
 * (total - rt_avg) / (total >> SCHED_POWER_SHIFT), where total is the
 * averaging window.  Used by update_cpu_power() to shrink cpu_power.
 */
unsigned long scale_rt_power(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 total, available, age_stamp, avg;

	/*
	 * Since we're reading these variables without serialization make sure
	 * we read them once before doing sanity checks on them.
	 */
	age_stamp = ACCESS_ONCE(rq->age_stamp);
	avg = ACCESS_ONCE(rq->rt_avg);

	/* Window over which rt_avg has been accumulated */
	total = sched_avg_period() + (rq->clock - age_stamp);

	if (unlikely(total < avg)) {
		/* Ensures that power won't end up being negative */
		available = 0;
	} else {
		available = total - avg;
	}

	/* Clamp so the divisor below never becomes zero */
	if (unlikely((s64)total < SCHED_POWER_SCALE))
		total = SCHED_POWER_SCALE;

	total >>= SCHED_POWER_SHIFT;

	return div_u64(available, total);
}
4310
/*
 * Recompute the single-cpu power for @cpu and publish it to both the
 * runqueue and @sd's group: the base SCHED_POWER_SCALE is scaled down by
 * SMT sharing, frequency, and RT activity in turn.
 */
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
	unsigned long weight = sd->span_weight;
	unsigned long power = SCHED_POWER_SCALE;
	struct sched_group *sdg = sd->groups;

	/* Siblings sharing a core split its power between them */
	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
		if (sched_feat(ARCH_POWER))
			power *= arch_scale_smt_power(sd, cpu);
		else
			power *= default_scale_smt_power(sd, cpu);

		power >>= SCHED_POWER_SHIFT;
	}

	/* power_orig: power before frequency/RT scaling, kept for reference */
	sdg->sgp->power_orig = power;

	if (sched_feat(ARCH_POWER))
		power *= arch_scale_freq_power(sd, cpu);
	else
		power *= default_scale_freq_power(sd, cpu);

	power >>= SCHED_POWER_SHIFT;

	/* Discount time consumed by RT activity */
	power *= scale_rt_power(cpu);
	power >>= SCHED_POWER_SHIFT;

	/* Never publish zero power: it is used as a divisor elsewhere */
	if (!power)
		power = 1;

	cpu_rq(cpu)->cpu_power = power;
	sdg->sgp->power = power;
}
4344
/*
 * Recompute sdg->sgp->power for @sd's group: at the lowest level this is
 * the single-cpu power, otherwise the sum over the child groups.  Also
 * schedules the next refresh via sgp->next_update.
 */
void update_group_power(struct sched_domain *sd, int cpu)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group, *sdg = sd->groups;
	unsigned long power;
	unsigned long interval;

	interval = msecs_to_jiffies(sd->balance_interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);
	sdg->sgp->next_update = jiffies + interval;

	if (!child) {
		update_cpu_power(sd, cpu);
		return;
	}

	power = 0;

	if (child->flags & SD_OVERLAP) {
		/*
		 * SD_OVERLAP domains cannot assume that child groups
		 * span the current group.
		 */

		/* NB: 'cpu' is reused as the loop cursor; not needed below */
		for_each_cpu(cpu, sched_group_cpus(sdg))
			power += power_of(cpu);
	} else {
		/*
		 * !SD_OVERLAP domains can assume that child groups
		 * span the current group.
		 */

		group = child->groups;
		do {
			power += group->sgp->power;
			group = group->next;
		} while (group != child->groups);
	}

	sdg->sgp->power_orig = sdg->sgp->power = power;
}
4386
4387/*
4388 * Try and fix up capacity for tiny siblings, this is needed when
4389 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4390 * which on its own isn't powerful enough.
4391 *
4392 * See update_sd_pick_busiest() and check_asym_packing().
4393 */
4394static inline int
4395fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4396{
4397 /*
4398 * Only siblings can have significantly less than SCHED_POWER_SCALE
4399 */
4400 if (!(sd->flags & SD_SHARE_CPUPOWER))
4401 return 0;
4402
4403 /*
4404 * If ~90% of the cpu_power is still there, we're good.
4405 */
4406 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4407 return 1;
4408
4409 return 0;
4410}
4411
/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @local_group: Does group contain this_cpu.
 * @balance: Should we balance; cleared (and stats left partial) when
 *           this cpu is not the designated balancer for @group.
 * @sgs: variable to hold the statistics for this group.
 */
static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
			int local_group, int *balance, struct sg_lb_stats *sgs)
{
	unsigned long nr_running, max_nr_running, min_nr_running;
	unsigned long load, max_cpu_load, min_cpu_load;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long avg_load_per_task = 0;
	int i;

	if (local_group)
		balance_cpu = group_balance_cpu(group);

	/* Tally up the load of all CPUs in the group */
	max_cpu_load = 0;
	min_cpu_load = ~0UL;
	max_nr_running = 0;
	min_nr_running = ~0UL;

	/* Only cpus still eligible in this balance pass are considered. */
	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		nr_running = rq->nr_running;

		/* Bias balancing toward cpus of our domain */
		if (local_group) {
			/*
			 * Prefer the first idle cpu within the group's
			 * balance mask as the balancer for this domain.
			 */
			if (idle_cpu(i) && !first_idle_cpu &&
					cpumask_test_cpu(i, sched_group_mask(group))) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			load = target_load(i, load_idx);
		} else {
			load = source_load(i, load_idx);
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;

			if (nr_running > max_nr_running)
				max_nr_running = nr_running;
			if (min_nr_running > nr_running)
				min_nr_running = nr_running;
		}

		sgs->group_load += load;
		sgs->sum_nr_running += nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);
		if (idle_cpu(i))
			sgs->idle_cpus++;
	}

	/*
	 * First idle cpu or the first cpu(busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the cpu's
	 * to do the newly idle load balance.
	 */
	if (local_group) {
		if (env->idle != CPU_NEWLY_IDLE) {
			if (balance_cpu != env->dst_cpu) {
				*balance = 0;
				return;
			}
			update_group_power(env->sd, env->dst_cpu);
		} else if (time_after_eq(jiffies, group->sgp->next_update))
			update_group_power(env->sd, env->dst_cpu);
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;

	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of a task.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 * might not be a suitable number - should we keep a
	 * normalized nr_running number somewhere that negates
	 * the hierarchy?
	 */
	if (sgs->sum_nr_running)
		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
	    (max_nr_running - min_nr_running) > 1)
		sgs->group_imb = 1;

	/* Capacity in whole tasks; may be forced to 1 for tiny siblings. */
	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
						SCHED_POWER_SCALE);
	if (!sgs->group_capacity)
		sgs->group_capacity = fix_small_capacity(env->sd, group);
	sgs->group_weight = group->group_weight;

	if (sgs->group_capacity > sgs->sum_nr_running)
		sgs->group_has_capacity = 1;
}
4519
4520/**
4521 * update_sd_pick_busiest - return 1 on busiest group
4522 * @env: The load balancing environment.
4523 * @sds: sched_domain statistics
4524 * @sg: sched_group candidate to be checked for being the busiest
4525 * @sgs: sched_group statistics
4526 *
4527 * Determine if @sg is a busier group than the previously selected
4528 * busiest group.
4529 */
4530static bool update_sd_pick_busiest(struct lb_env *env,
4531 struct sd_lb_stats *sds,
4532 struct sched_group *sg,
4533 struct sg_lb_stats *sgs)
4534{
4535 if (sgs->avg_load <= sds->max_load)
4536 return false;
4537
4538 if (sgs->sum_nr_running > sgs->group_capacity)
4539 return true;
4540
4541 if (sgs->group_imb)
4542 return true;
4543
4544 /*
4545 * ASYM_PACKING needs to move all the work to the lowest
4546 * numbered CPUs in the group, therefore mark all groups
4547 * higher than ourself as busy.
4548 */
4549 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4550 env->dst_cpu < group_first_cpu(sg)) {
4551 if (!sds->busiest)
4552 return true;
4553
4554 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4555 return true;
4556 }
4557
4558 return false;
4559}
4560
/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @balance: Should we balance; may be cleared by update_sg_lb_stats(),
 *           in which case we bail out with partial stats.
 * @sds: variable to hold the statistics for this sched_domain.
 */
static inline void update_sd_lb_stats(struct lb_env *env,
					int *balance, struct sd_lb_stats *sds)
{
	struct sched_domain *child = env->sd->child;
	struct sched_group *sg = env->sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	load_idx = get_sd_load_idx(env->sd, env->idle);

	/* Walk the circular group list once, classifying each group. */
	do {
		int local_group;

		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
		memset(&sgs, 0, sizeof(sgs));
		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);

		/* Not the designated balancer for this domain: give up. */
		if (local_group && !(*balance))
			return;

		sds->total_load += sgs.group_load;
		sds->total_pwr += sg->sgp->power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the sg capacity to one so that we'll try
		 * and move all the excess tasks away. We lower the capacity
		 * of a group only if the local group has the capacity to fit
		 * these excess tasks, i.e. nr_running < group_capacity. The
		 * extra check prevents the case where you always pull from the
		 * heaviest group when it is already under-utilized (possible
		 * with a large weight task outweighs the tasks on the system).
		 */
		if (prefer_sibling && !local_group && sds->this_has_capacity)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);

		if (local_group) {
			sds->this_load = sgs.avg_load;
			sds->this = sg;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
			sds->this_has_capacity = sgs.group_has_capacity;
			sds->this_idle_cpus = sgs.idle_cpus;
		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
			sds->max_load = sgs.avg_load;
			sds->busiest = sg;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_idle_cpus = sgs.idle_cpus;
			sds->busiest_group_capacity = sgs.group_capacity;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->busiest_has_capacity = sgs.group_has_capacity;
			sds->busiest_group_weight = sgs.group_weight;
			sds->group_imb = sgs.group_imb;
		}

		sg = sg->next;
	} while (sg != env->sd->groups);
}
4628
/**
 * check_asym_packing - Check to see if the group is packed into the
 *			sched domain.
 *
 * This is primarily intended to be used at the sibling level. Some
 * cores like POWER7 prefer to use lower numbered SMT threads. In the
 * case of POWER7, it can move to lower SMT modes only when higher
 * threads are idle. When in lower SMT modes, the threads will
 * perform better since they share less core resources. Hence when we
 * have idle threads, we want them to be the higher ones.
 *
 * This packing function is run on idle threads. It checks to see if
 * the busiest CPU in this domain (core in the P7 case) has a higher
 * CPU number than the packing function is being run on. Here we are
 * assuming lower CPU number will be equivalent to lower a SMT thread
 * number.
 *
 * Returns 1 when packing is required and a task should be moved to
 * this CPU. The amount of the imbalance is returned in *imbalance.
 *
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain which is to be packed
 */
static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
	int busiest_cpu;

	if (!(env->sd->flags & SD_ASYM_PACKING))
		return 0;

	if (!sds->busiest)
		return 0;

	/* Only pack toward lower-numbered cpus: never pull "upward". */
	busiest_cpu = group_first_cpu(sds->busiest);
	if (env->dst_cpu > busiest_cpu)
		return 0;

	/* Pull the busiest group's whole load, scaled back to task units. */
	env->imbalance = DIV_ROUND_CLOSEST(
		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);

	return 1;
}
4671
/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *			amongst the groups of a sched_domain, during
 *			load balancing.
 * @env: The load balancing environment.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 *
 * Called when the computed imbalance is too small to guarantee that a
 * task moves; decides whether moving one busiest-group task is still a
 * net win and, if so, sets env->imbalance to one task's load.
 */
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	unsigned long tmp, pwr_now = 0, pwr_move = 0;
	unsigned int imbn = 2;
	unsigned long scaled_busy_load_per_task;

	if (sds->this_nr_running) {
		sds->this_load_per_task /= sds->this_nr_running;
		if (sds->busiest_load_per_task >
				sds->this_load_per_task)
			imbn = 1;
	} else {
		/* No local tasks: estimate from the destination cpu. */
		sds->this_load_per_task =
			cpu_avg_load_per_task(env->dst_cpu);
	}

	/* Normalize one busiest task's load by the busiest group's power. */
	scaled_busy_load_per_task = sds->busiest_load_per_task
					 * SCHED_POWER_SCALE;
	scaled_busy_load_per_task /= sds->busiest->sgp->power;

	/* Gap of at least imbn tasks: moving one task is clearly safe. */
	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
			(scaled_busy_load_per_task * imbn)) {
		env->imbalance = sds->busiest_load_per_task;
		return;
	}

	/*
	 * OK, we don't have enough imbalance to justify moving tasks,
	 * however we may be able to increase total CPU power used by
	 * moving them.
	 */

	/* Effective throughput with the current task placement. */
	pwr_now += sds->busiest->sgp->power *
			min(sds->busiest_load_per_task, sds->max_load);
	pwr_now += sds->this->sgp->power *
			min(sds->this_load_per_task, sds->this_load);
	pwr_now /= SCHED_POWER_SCALE;

	/* Amount of load we'd subtract */
	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
		sds->busiest->sgp->power;
	if (sds->max_load > tmp)
		pwr_move += sds->busiest->sgp->power *
			min(sds->busiest_load_per_task, sds->max_load - tmp);

	/* Amount of load we'd add */
	if (sds->max_load * sds->busiest->sgp->power <
		sds->busiest_load_per_task * SCHED_POWER_SCALE)
		tmp = (sds->max_load * sds->busiest->sgp->power) /
			sds->this->sgp->power;
	else
		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
			sds->this->sgp->power;
	pwr_move += sds->this->sgp->power *
			min(sds->this_load_per_task, sds->this_load + tmp);
	pwr_move /= SCHED_POWER_SCALE;

	/* Move if we gain throughput */
	if (pwr_move > pwr_now)
		env->imbalance = sds->busiest_load_per_task;
}
4741
/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 *
 * Sets env->imbalance to the weighted load that should move from the
 * busiest group to this cpu.  Callers guarantee busiest_nr_running > 0
 * (see find_busiest_group()), so the division below is safe.
 */
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	unsigned long max_pull, load_above_capacity = ~0UL;

	sds->busiest_load_per_task /= sds->busiest_nr_running;
	if (sds->group_imb) {
		/* Affinity-skewed groups: don't over-estimate task size. */
		sds->busiest_load_per_task =
			min(sds->busiest_load_per_task, sds->avg_load);
	}

	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load(as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	if (sds->max_load < sds->avg_load) {
		env->imbalance = 0;
		return fix_small_imbalance(env, sds);
	}

	if (!sds->group_imb) {
		/*
		 * Don't want to pull so many tasks that a group would go idle.
		 */
		load_above_capacity = (sds->busiest_nr_running -
						sds->busiest_group_capacity);

		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);

		load_above_capacity /= sds->busiest->sgp->power;
	}

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load. At the same time,
	 * we also don't want to reduce the group load below the group capacity
	 * (so that we can implement power-savings policies etc). Thus we look
	 * for the minimum possible imbalance.
	 * Be careful of negative numbers as they'll appear as very large values
	 * with unsigned longs.
	 */
	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);

	/* How much load to actually move to equalise the imbalance */
	env->imbalance = min(max_pull * sds->busiest->sgp->power,
		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
			/ SCHED_POWER_SCALE;

	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
	if (env->imbalance < sds->busiest_load_per_task)
		return fix_small_imbalance(env, sds);

}
4807
4808/******* find_busiest_group() helpers end here *********************/
4809
/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this_level.
 *
 * Returns:	- the busiest group if imbalance exists.
 *		- If no imbalance and user has opted for power-savings balance,
 *		   return the least loaded group whose CPUs can be
 *		   put to idle by rebalancing its tasks onto our group.
 */
static struct sched_group *
find_busiest_group(struct lb_env *env, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relavent for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(env, balance, &sds);

	/*
	 * this_cpu is not the appropriate cpu to perform load balancing at
	 * this level.
	 */
	if (!(*balance))
		goto ret;

	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
	    check_asym_packing(env, &sds))
		return sds.busiest;

	/* There is no busy sibling group to pull tasks from */
	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;

	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assumes all things are equal, which typically
	 * isn't true due to cpus_allowed constraints and the like.
	 */
	if (sds.group_imb)
		goto force_balance;

	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
			!sds.busiest_has_capacity)
		goto force_balance;

	/*
	 * If the local group is more busy than the selected busiest group
	 * don't try and pull any tasks.
	 */
	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	/*
	 * Don't pull any tasks if this group is already above the domain
	 * average load.
	 */
	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (env->idle == CPU_IDLE) {
		/*
		 * This cpu is idle. If the busiest group load doesn't
		 * have more tasks than the number of available cpu's and
		 * there is no imbalance between this and busiest group
		 * wrt to idle cpu's, it is balanced.
		 */
		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
		    sds.busiest_nr_running <= sds.busiest_group_weight)
			goto out_balanced;
	} else {
		/*
		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
		 * imbalance_pct to be conservative.
		 */
		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
			goto out_balanced;
	}

force_balance:
	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(env, &sds);
	return sds.busiest;

out_balanced:
ret:
	/* No balancing needed (or not our job): report zero imbalance. */
	env->imbalance = 0;
	return NULL;
}
4915
4916/*
4917 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4918 */
4919static struct rq *find_busiest_queue(struct lb_env *env,
4920 struct sched_group *group)
4921{
4922 struct rq *busiest = NULL, *rq;
4923 unsigned long max_load = 0;
4924 int i;
4925
4926 for_each_cpu(i, sched_group_cpus(group)) {
4927 unsigned long power = power_of(i);
4928 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4929 SCHED_POWER_SCALE);
4930 unsigned long wl;
4931
4932 if (!capacity)
4933 capacity = fix_small_capacity(env->sd, group);
4934
4935 if (!cpumask_test_cpu(i, env->cpus))
4936 continue;
4937
4938 rq = cpu_rq(i);
4939 wl = weighted_cpuload(i);
4940
4941 /*
4942 * When comparing with imbalance, use weighted_cpuload()
4943 * which is not scaled with the cpu power.
4944 */
4945 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4946 continue;
4947
4948 /*
4949 * For the load comparisons with the other cpu's, consider
4950 * the weighted_cpuload() scaled with the cpu power, so that
4951 * the load can be moved away from the cpu that is potentially
4952 * running at a lower capacity.
4953 */
4954 wl = (wl * SCHED_POWER_SCALE) / power;
4955
4956 if (wl > max_load) {
4957 max_load = wl;
4958 busiest = rq;
4959 }
4960 }
4961
4962 return busiest;
4963}
4964
/*
 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
 * so long as it is large enough.  Caps sd->balance_interval growth in
 * load_balance()'s out_one_pinned path.
 */
#define MAX_PINNED_INTERVAL 512

/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4973
4974static int need_active_balance(struct lb_env *env)
4975{
4976 struct sched_domain *sd = env->sd;
4977
4978 if (env->idle == CPU_NEWLY_IDLE) {
4979
4980 /*
4981 * ASYM_PACKING needs to force migrate tasks from busy but
4982 * higher numbered CPUs in order to pack all tasks in the
4983 * lowest numbered CPUs.
4984 */
4985 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4986 return 1;
4987 }
4988
4989 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4990}
4991
4992static int active_load_balance_cpu_stop(void *data);
4993
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Returns the amount of weighted load moved (0 when nothing was pulled).
 * Called with this_rq's lock NOT held; takes busiest/dst rq locks
 * pairwise with irqs disabled while moving tasks.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	int lb_iterations, max_lb_iterations;
	struct sched_group *group;
	struct rq *busiest;
	unsigned long flags;
	/* Per-cpu scratch mask of cpus still eligible as sources. */
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	struct lb_env env = {
		.sd = sd,
		.dst_cpu = this_cpu,
		.dst_rq = this_rq,
		.dst_grpmask = sched_group_cpus(sd->groups),
		.idle = idle,
		.loop_break = sched_nr_migrate_break,
		.cpus = cpus,
	};

	cpumask_copy(cpus, cpu_active_mask);
	/* Bound the retries of the LBF_SOME_PINNED alternate-dst path. */
	max_lb_iterations = cpumask_weight(env.dst_grpmask);

	schedstat_inc(sd, lb_count[idle]);

redo:
	/* Re-pick busiest group/queue (also after dropping pinned cpus). */
	group = find_busiest_group(&env, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == env.dst_rq);

	schedstat_add(sd, lb_imbalance[idle], env.imbalance);

	ld_moved = 0;
	lb_iterations = 1;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		env.src_cpu = busiest->cpu;
		env.src_rq = busiest;
		env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

		update_h_load(env.src_cpu);
more_balance:
		local_irq_save(flags);
		double_rq_lock(env.dst_rq, busiest);

		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved - cumulative load moved across iterations
		 */
		cur_ld_moved = move_tasks(&env);
		ld_moved += cur_ld_moved;
		double_rq_unlock(env.dst_rq, busiest);
		local_irq_restore(flags);

		/* move_tasks() hit loop_break: resume where it left off. */
		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * some other cpu did the load balance for us.
		 */
		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
			resched_cpu(env.dst_cpu);

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of cpus in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at _same_ time to move some load to
		 * given_cpu) causing exceess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
				lb_iterations++ < max_lb_iterations) {

			env.dst_rq = cpu_rq(env.new_dst_cpu);
			env.dst_cpu = env.new_dst_cpu;
			env.flags &= ~LBF_SOME_PINNED;
			env.loop = 0;
			env.loop_break = sched_nr_migrate_break;
			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus)) {
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			sd->nr_balance_failed++;

		if (need_active_balance(&env)) {
			raw_spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest cpu can't be
			 * moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					tsk_cpus_allowed(busiest->curr))) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							    flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work. Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if (((env.flags & LBF_ALL_PINNED) &&
			sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	ld_moved = 0;
out:
	return ld_moved;
}
5215
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 *
 * Called with this_rq->lock held; the lock is dropped while walking
 * the domains (IRQs/preemption stay disabled) and retaken before
 * returning.
 */
void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;

	this_rq->idle_stamp = this_rq->clock;

	/* Idle periods shorter than the migration cost aren't worth it. */
	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	update_rq_runnable_avg(this_rq, 1);

	/*
	 * Drop the rq->lock, but keep IRQ/preempt disabled.
	 */
	raw_spin_unlock(&this_rq->lock);

	update_blocked_averages(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		unsigned long interval;
		int balance = 1;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE, &balance);
		}

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task) {
			this_rq->idle_stamp = 0;
			break;
		}
	}
	rcu_read_unlock();

	raw_spin_lock(&this_rq->lock);

	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}
5273
/*
 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data is the busiest runqueue (set up by load_balance() before it
 * queued this stop-work).  Always returns 0; clears ->active_balance
 * on exit so a new active balance can be kicked.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;

	raw_spin_lock_irq(&busiest_rq->lock);

	/* make sure the requested cpu hasn't gone down in the meantime */
	if (unlikely(busiest_cpu != smp_processor_id() ||
			!busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				break;
	}

	if (likely(sd)) {
		struct lb_env env = {
			.sd = sd,
			.dst_cpu = target_cpu,
			.dst_rq = target_rq,
			.src_cpu = busiest_rq->cpu,
			.src_rq = busiest_rq,
			.idle = CPU_IDLE,
		};

		schedstat_inc(sd, alb_count);

		if (move_one_task(&env))
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	rcu_read_unlock();
	double_unlock_balance(busiest_rq, target_rq);
out_unlock:
	busiest_rq->active_balance = 0;
	raw_spin_unlock_irq(&busiest_rq->lock);
	return 0;
}
5341
#ifdef CONFIG_NO_HZ
/*
 * idle load balancing details
 * - When one of the busy CPUs notice that there may be an idle rebalancing
 *   needed, they will kick the idle load balancer, which then does idle
 *   load balancing for all the idle CPUs.
 */
static struct {
	cpumask_var_t idle_cpus_mask;	/* cpus that entered nohz idle */
	atomic_t nr_cpus;		/* weight of idle_cpus_mask */
	unsigned long next_balance;	/* in jiffy units */
} nohz ____cacheline_aligned;
5354
5355static inline int find_new_ilb(int call_cpu)
5356{
5357 int ilb = cpumask_first(nohz.idle_cpus_mask);
5358
5359 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5360 return ilb;
5361
5362 return nr_cpu_ids;
5363}
5364
5365/*
5366 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5367 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
5368 * CPU (if there is one).
5369 */
5370static void nohz_balancer_kick(int cpu)
5371{
5372 int ilb_cpu;
5373
5374 nohz.next_balance++;
5375
5376 ilb_cpu = find_new_ilb(cpu);
5377
5378 if (ilb_cpu >= nr_cpu_ids)
5379 return;
5380
5381 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5382 return;
5383 /*
5384 * Use smp_send_reschedule() instead of resched_cpu().
5385 * This way we generate a sched IPI on the target cpu which
5386 * is idle. And the softirq performing nohz idle load balance
5387 * will be run before returning from the IPI.
5388 */
5389 smp_send_reschedule(ilb_cpu);
5390 return;
5391}
5392
/*
 * Undo nohz_balance_enter_idle(): take @cpu out of the tickless-idle
 * bookkeeping once it resumes taking ticks (or is going offline).
 * No-op unless NOHZ_TICK_STOPPED was set for this cpu.
 */
static inline void nohz_balance_exit_idle(int cpu)
{
	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
		cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
		atomic_dec(&nohz.nr_cpus);
		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
	}
}
5401
/*
 * Mark the current CPU busy in its sched-domain hierarchy: bump
 * nr_busy_cpus of every group this CPU belongs to.  Only does work on
 * the first busy tick after an idle period (guarded by NOHZ_IDLE).
 */
static inline void set_cpu_sd_state_busy(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
		return;
	clear_bit(NOHZ_IDLE, nohz_flags(cpu));

	/* Domain tree is RCU protected. */
	rcu_read_lock();
	for_each_domain(cpu, sd)
		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
	rcu_read_unlock();
}
5416
/*
 * Counterpart of set_cpu_sd_state_busy(): mark the current CPU idle in
 * its sched-domain hierarchy by decrementing every group's nr_busy_cpus.
 * Idempotent via the NOHZ_IDLE flag.
 */
void set_cpu_sd_state_idle(void)
{
	struct sched_domain *sd;
	int cpu = smp_processor_id();

	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
		return;
	set_bit(NOHZ_IDLE, nohz_flags(cpu));

	rcu_read_lock();
	for_each_domain(cpu, sd)
		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
	rcu_read_unlock();
}
5431
/*
 * This routine will record that the cpu is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 */
void nohz_balance_enter_idle(int cpu)
{
	/*
	 * If this cpu is going down, then nothing needs to be done.
	 */
	if (!cpu_active(cpu))
		return;

	/* Already recorded as tickless idle: nothing to do. */
	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
		return;

	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
	atomic_inc(&nohz.nr_cpus);
	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
5451
/*
 * CPU hotplug notifier: a CPU that is dying must be removed from the
 * nohz idle-balance bookkeeping so it is never chosen as the balancer.
 */
static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DYING:
		/* CPU_DYING runs on the dying CPU itself. */
		nohz_balance_exit_idle(smp_processor_id());
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
5463#endif
5464
/* Serializes balancing of sched domains that have SD_SERIALIZE set. */
static DEFINE_SPINLOCK(balancing);

/*
 * Scale the max load_balance interval with the number of CPUs in the system.
 * This trades load-balance latency on larger machines for less cross talk.
 */
void update_max_interval(void)
{
	max_load_balance_interval = HZ*num_online_cpus()/10;
}
5475
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	update_blocked_averages(cpu);

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/* Busy CPUs balance less frequently than idle ones. */
		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		interval = clamp(interval, 1UL, max_load_balance_interval);

		need_serialize = sd->flags & SD_SERIALIZE;

		/*
		 * SD_SERIALIZE domains must not be balanced concurrently;
		 * if the lock is contended, skip this domain for now.
		 */
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
5551
#ifdef CONFIG_NO_HZ
/*
 * In CONFIG_NO_HZ case, the idle balance kickee will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 */
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
{
	struct rq *this_rq = cpu_rq(this_cpu);
	struct rq *rq;
	int balance_cpu;

	/* Only run when we are idle and were explicitly kicked. */
	if (idle != CPU_IDLE ||
	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
		goto end;

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/*
		 * If this cpu gets work to do, stop the load balancing
		 * work being done for other cpus. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched())
			break;

		rq = cpu_rq(balance_cpu);

		/* Refresh the idle cpu's clock and load before balancing. */
		raw_spin_lock_irq(&rq->lock);
		update_rq_clock(rq);
		update_idle_cpu_load(rq);
		raw_spin_unlock_irq(&rq->lock);

		rebalance_domains(balance_cpu, CPU_IDLE);

		if (time_after(this_rq->next_balance, rq->next_balance))
			this_rq->next_balance = rq->next_balance;
	}
	nohz.next_balance = this_rq->next_balance;
end:
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
5595
/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle cpu is the system.
 *   - This rq has more than one task.
 *   - At any scheduler domain level, this cpu's scheduler group has multiple
 *     busy cpu's exceeding the group's power.
 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
 *     domain span are idle.
 *
 * Returns 1 when a kick is needed, 0 otherwise.
 */
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
	unsigned long now = jiffies;
	struct sched_domain *sd;

	/* An idle cpu never needs to kick the balancer on its own behalf. */
	if (unlikely(idle_cpu(cpu)))
		return 0;

       /*
	* We may be recently in ticked or tickless idle mode. At the first
	* busy tick after returning from idle, we will update the busy stats.
	*/
	set_cpu_sd_state_busy();
	nohz_balance_exit_idle(cpu);

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing.
	 */
	if (likely(!atomic_read(&nohz.nr_cpus)))
		return 0;

	if (time_before(now, nohz.next_balance))
		return 0;

	if (rq->nr_running >= 2)
		goto need_kick;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		struct sched_group *sg = sd->groups;
		struct sched_group_power *sgp = sg->sgp;
		int nr_busy = atomic_read(&sgp->nr_busy_cpus);

		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
			goto need_kick_unlock;

		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
		    && (cpumask_first_and(nohz.idle_cpus_mask,
					  sched_domain_span(sd)) < cpu))
			goto need_kick_unlock;

		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
			break;
	}
	rcu_read_unlock();
	return 0;

need_kick_unlock:
	rcu_read_unlock();
need_kick:
	return 1;
}
#else
/* !CONFIG_NO_HZ: no tickless CPUs, so balancing on their behalf is a no-op. */
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
5661
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	/* First balance our own domains ... */
	rebalance_domains(this_cpu, idle);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	nohz_idle_balance(this_cpu, idle);
}
5682
5683static inline int on_null_domain(int cpu)
5684{
5685 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5686}
5687
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 * Called from the scheduler tick.
 */
void trigger_load_balance(struct rq *rq, int cpu)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
	/* Possibly kick an idle CPU to balance on behalf of tickless CPUs. */
	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
		nohz_balancer_kick(cpu);
#endif
}
5702
/* Runqueue came online: refresh the scheduler tunables. */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}
5707
/* Runqueue is going offline: refresh tunables and release throttled groups. */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}
5715
5716#endif /* CONFIG_SMP */
5717
/*
 * scheduler tick hitting a task of our scheduling class:
 *
 * Ticks every entity on curr's hierarchy (bottom-up), then runs the NUMA
 * tick work if the NUMA feature is enabled, and updates the rq-wide
 * runnable average.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (sched_feat_numa(NUMA))
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}
5736
/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/* Make sure the child is accounted on this CPU before placement. */
	if (unlikely(task_cpu(p) != this_cpu)) {
		rcu_read_lock();
		__set_task_cpu(p, this_cpu);
		rcu_read_unlock();
	}

	update_curr(cfs_rq);

	/* Child starts from the parent's vruntime, then gets placed. */
	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	/*
	 * Normalize: the child may be enqueued on a different cfs_rq with a
	 * different min_vruntime; enqueue_entity() re-adds the base there.
	 */
	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
5782
5783/*
5784 * Priority of the task has changed. Check to see if we preempt
5785 * the current task.
5786 */
5787static void
5788prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5789{
5790 if (!p->se.on_rq)
5791 return;
5792
5793 /*
5794 * Reschedule if we are currently running on this runqueue and
5795 * our priority decreased, or if we are not currently running on
5796 * this runqueue and our priority is higher than the current's
5797 */
5798 if (rq->curr == p) {
5799 if (p->prio > oldprio)
5800 resched_task(rq->curr);
5801 } else
5802 check_preempt_curr(rq, p, 0);
5803}
5804
/*
 * Task is leaving the fair class (e.g. becoming RT).  Normalize its
 * vruntime and drop its load contribution so a later return to fair
 * scheduling starts from a clean state.
 */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	struct sched_entity *se = &p->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * Ensure the task's vruntime is normalized, so that when its
	 * switched back to the fair class the enqueue_entity(.flags=0) will
	 * do the right thing.
	 *
	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
	 * have normalized the vruntime, if it was !on_rq, then only when
	 * the task is sleeping will it still have non-normalized vruntime.
	 */
	if (!se->on_rq && p->state != TASK_RUNNING) {
		/*
		 * Fix up our vruntime so that the current sleep doesn't
		 * cause 'unlimited' sleep bonus.
		 */
		place_entity(cfs_rq, se, 0);
		se->vruntime -= cfs_rq->min_vruntime;
	}

#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
	/*
	* Remove our load from contribution when we leave sched_fair
	* and ensure we don't carry in an old decay_count if we
	* switch back.
	*/
	if (p->se.avg.decay_count) {
		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
		__synchronize_entity_decay(&p->se);
		subtract_blocked_load_contrib(cfs_rq,
				p->se.avg.load_avg_contrib);
	}
#endif
}
5842
5843/*
5844 * We switched to the sched_fair class.
5845 */
5846static void switched_to_fair(struct rq *rq, struct task_struct *p)
5847{
5848 if (!p->se.on_rq)
5849 return;
5850
5851 /*
5852 * We were most likely switched from sched_rt, so
5853 * kick off the schedule if running, otherwise just see
5854 * if we can still preempt the current task.
5855 */
5856 if (rq->curr == p)
5857 resched_task(rq->curr);
5858 else
5859 check_preempt_curr(rq, p, 0);
5860}
5861
/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	/* Walk bottom-up and mark the entity current on each level. */
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = cfs_rq_of(se);

		set_next_entity(cfs_rq, se);
		/* ensure bandwidth has been allocated on our new cfs_rq */
		account_cfs_rq_runtime(cfs_rq, 0);
	}
}
5879
/*
 * Initialize a cfs runqueue: empty rbtree, a min_vruntime chosen so that
 * unsigned wraparound arithmetic behaves from the start, and (with group
 * scheduling on SMP) the blocked-load decay bookkeeping.
 */
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT;
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	/* 32-bit readers use the copy for a lockless consistency check. */
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
	atomic64_set(&cfs_rq->decay_counter, 1);
	atomic64_set(&cfs_rq->removed_load, 0);
#endif
}
5892
5893#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Move @p to its (already re-parented) task group's cfs_rq, converting
 * its vruntime between the old and new queue's min_vruntime bases when
 * it is not currently enqueued.
 */
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
	struct cfs_rq *cfs_rq;
	/*
	 * If the task was not on the rq at the time of this cgroup movement
	 * it must have been asleep, sleeping tasks keep their ->vruntime
	 * absolute on their old rq until wakeup (needed for the fair sleeper
	 * bonus in place_entity()).
	 *
	 * If it was on the rq, we've just 'preempted' it, which does convert
	 * ->vruntime to a relative base.
	 *
	 * Make sure both cases convert their relative position when migrating
	 * to another cgroup's rq. This does somewhat interfere with the
	 * fair sleeper stuff for the first placement, but who cares.
	 */
	/*
	 * When !on_rq, vruntime of the task has usually NOT been normalized.
	 * But there are some cases where it has already been normalized:
	 *
	 * - Moving a forked child which is waiting for being woken up by
	 *   wake_up_new_task().
	 * - Moving a task which has been woken up by try_to_wake_up() and
	 *   waiting for actually being woken up by sched_ttwu_pending().
	 *
	 * To prevent boost or penalty in the new cfs_rq caused by delta
	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
	 */
	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
		on_rq = 1;

	if (!on_rq)
		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
	set_task_rq(p, task_cpu(p));
	if (!on_rq) {
		cfs_rq = cfs_rq_of(&p->se);
		p->se.vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
		/*
		 * migrate_task_rq_fair() will have removed our previous
		 * contribution, but we must synchronize for ongoing future
		 * decay.
		 */
		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
#endif
	}
}
5942
/*
 * Free all per-cpu cfs_rq and sched_entity structures of @tg along with
 * its bandwidth state.  Safe when allocation failed part-way: the
 * per-cpu arrays may be NULL and kfree(NULL) is a no-op.
 */
void free_fair_sched_group(struct task_group *tg)
{
	int i;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		if (tg->cfs_rq)
			kfree(tg->cfs_rq[i]);
		if (tg->se)
			kfree(tg->se[i]);
	}

	kfree(tg->cfs_rq);
	kfree(tg->se);
}
5959
/*
 * Allocate the per-cpu cfs_rq/sched_entity pairs for a new task group and
 * wire them under @parent.  Returns 1 on success, 0 on failure; on failure
 * the caller is expected to clean up via free_fair_sched_group().
 */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se;
	int i;

	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->cfs_rq)
		goto err;
	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->se)
		goto err;

	tg->shares = NICE_0_LOAD;

	init_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(i) {
		/* Allocate on the cpu's own node for locality. */
		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

		se = kzalloc_node(sizeof(struct sched_entity),
				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err_free_rq;

		init_cfs_rq(cfs_rq);
		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
	}

	return 1;

err_free_rq:
	kfree(cfs_rq);
err:
	return 0;
}
5999
/*
 * Detach @tg's cfs_rq on @cpu from the leaf list under the rq lock.
 */
void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	/*
	* Only empty task groups can be destroyed; so we can speculatively
	* check on_list without danger of it being re-added.
	*/
	if (!tg->cfs_rq[cpu]->on_list)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
6016
/*
 * Link a freshly allocated (cfs_rq, se) pair into task group @tg for @cpu,
 * parenting the entity under @parent's queue (or the root cfs_rq).
 */
void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
			struct sched_entity *se, int cpu,
			struct sched_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	cfs_rq->tg = tg;
	cfs_rq->rq = rq;
	init_cfs_rq_runtime(cfs_rq);

	tg->cfs_rq[cpu] = cfs_rq;
	tg->se[cpu] = se;

	/* se could be NULL for root_task_group */
	if (!se)
		return;

	if (!parent)
		se->cfs_rq = &rq->cfs;
	else
		se->cfs_rq = parent->my_q;

	se->my_q = cfs_rq;
	/* Weight starts at zero; updated when shares are recomputed. */
	update_load_set(&se->load, 0);
	se->parent = parent;
}
6043
/* Serializes concurrent writers of tg->shares. */
static DEFINE_MUTEX(shares_mutex);

/*
 * Set the CPU-time share (weight) of task group @tg and propagate the new
 * weight up each cpu's entity hierarchy.  Returns 0 on success or -EINVAL
 * for the root group, whose weight is fixed.
 */
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se;

		se = tg->se[i];
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);
		for_each_sched_entity(se)
			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
#else /* CONFIG_FAIR_GROUP_SCHED */

/* Without group scheduling there is nothing to free. */
void free_fair_sched_group(struct task_group *tg) { }

/* Without group scheduling, group allocation trivially succeeds. */
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

/* Without group scheduling there is nothing to unregister. */
void unregister_fair_sched_group(struct task_group *tg, int cpu) { }

#endif /* CONFIG_FAIR_GROUP_SCHED */
6092
6093
/*
 * Report an effective round-robin interval for a SCHED_OTHER task:
 * its current CFS slice converted to jiffies, or 0 on an otherwise
 * empty runqueue.
 */
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
	struct sched_entity *se = &task->se;
	unsigned int rr_interval = 0;

	/*
	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
	 * idle runqueue:
	 */
	if (rq->cfs.load.weight)
		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));

	return rr_interval;
}
6108
/*
 * All the scheduling class methods:
 *
 * The method table that plugs CFS into the core scheduler; ordering of
 * classes is established by .next (idle class follows fair).
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
	.migrate_task_rq	= migrate_task_rq_fair,
#endif
	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
6149
#ifdef CONFIG_SCHED_DEBUG
/*
 * Dump every leaf cfs_rq of @cpu into the debug seq_file (/proc/sched_debug).
 */
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	/* The leaf cfs_rq list is RCU protected. */
	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif
6161
/*
 * Boot-time initialization of the fair class: register the load-balance
 * softirq and, with NO_HZ, the idle-cpu bookkeeping and hotplug notifier.
 */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
deleted file mode 100644
index 1ad1d2b5395..00000000000
--- a/kernel/sched/features.h
+++ /dev/null
@@ -1,79 +0,0 @@
/*
 * Only give sleepers 50% of their service deficit. This allows
 * them to run sooner, but does not allow tons of sleepers to
 * rip the spread apart.
 */
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)

/*
 * Place new tasks ahead so that they do not starve already running
 * tasks
 */
SCHED_FEAT(START_DEBIT, true)

/*
 * Prefer to schedule the task we woke last (assuming it failed
 * wakeup-preemption), since its likely going to consume data we
 * touched, increases cache locality.
 */
SCHED_FEAT(NEXT_BUDDY, false)

/*
 * Prefer to schedule the task that ran last (when we did
 * wake-preempt) as that likely will touch the same data, increases
 * cache locality.
 */
SCHED_FEAT(LAST_BUDDY, true)

/*
 * Consider buddies to be cache hot, decreases the likelyness of a
 * cache buddy being migrated away, increases cache locality.
 */
SCHED_FEAT(CACHE_HOT_BUDDY, true)

/*
 * Allow wakeup-time preemption of the current task:
 */
SCHED_FEAT(WAKEUP_PREEMPTION, true)

/*
 * Use arch dependent cpu power functions
 */
SCHED_FEAT(ARCH_POWER, true)

/* High-resolution preemption tick / extra tick / load-balance bias. */
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)

/*
 * Spin-wait on mutex acquisition when the mutex owner is running on
 * another cpu -- assumes that when the owner is running, it will soon
 * release the lock. Decreases scheduling overhead.
 */
SCHED_FEAT(OWNER_SPIN, true)

/*
 * Decrement CPU power based on time not spent running tasks
 */
SCHED_FEAT(NONTASK_POWER, true)

/*
 * Queue remote wakeups on the target CPU and process them
 * using the scheduler IPI. Reduces rq->lock contention/bounces.
 */
SCHED_FEAT(TTWU_QUEUE, true)

SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)

/*
 * Apply the automatic NUMA scheduling policy. Enabled automatically
 * at runtime if running on a NUMA machine. Can be controlled via
 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
 * for debugging the core machinery.
 */
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA,	false)
SCHED_FEAT(NUMA_FORCE,	false)
#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index b6baf370cae..00000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,98 +0,0 @@
1#include "sched.h"
2
3/*
4 * idle-task scheduling class.
5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched/fair.c)
8 */
9
#ifdef CONFIG_SMP
/*
 * Wakeup placement for an idle task: it is bound to the CPU whose
 * runqueue it idles, so always answer its current CPU.
 */
static int
select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks are never migrated */
}
#endif /* CONFIG_SMP */
/*
 * Idle tasks are unconditionally rescheduled:
 * any waking task preempts the idle task.
 */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_task(rq->idle);
}
24
/*
 * The idle class always has exactly one runnable "task": the per-cpu
 * idle thread.  Count the event in schedstats and return it.
 */
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
	schedstat_inc(rq, sched_goidle);
	return rq->idle;
}
30
/*
 * It is not legal to sleep in the idle task - print a warning
 * message if some code attempts to do it:
 */
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
	/* Drop the rq lock only for the diagnostics, then retake it. */
	raw_spin_unlock_irq(&rq->lock);
	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
	dump_stack();
	raw_spin_lock_irq(&rq->lock);
}
43
/* Nothing to account when the idle task is switched out. */
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
47
/* The idle task has no timeslice or stats to maintain on a tick. */
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
51
/* No per-class state to refresh when the idle task becomes current. */
static void set_curr_task_idle(struct rq *rq)
{
}
55
/* Regular tasks must never be switched into the idle class. */
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
	BUG();
}
60
/* The idle task's priority must never change. */
static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}
66
/* The idle task has no timeslice: report a zero round-robin interval. */
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
	return 0;
}
71
/*
 * Simple, special scheduling class for the per-CPU idle tasks:
 *
 * Lowest-priority class; .next is NULL, terminating the class chain.
 */
const struct sched_class idle_sched_class = {
	/* .next is NULL */
	/* no enqueue/yield_task for idle tasks */

	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,

	.check_preempt_curr	= check_preempt_curr_idle,

	.pick_next_task		= pick_next_task_idle,
	.put_prev_task		= put_prev_task_idle,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_idle,
#endif

	.set_curr_task          = set_curr_task_idle,
	.task_tick		= task_tick_idle,

	.get_rr_interval	= get_rr_interval_idle,

	.prio_changed		= prio_changed_idle,
	.switched_to		= switched_to_idle,
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
deleted file mode 100644
index 418feb01344..00000000000
--- a/kernel/sched/rt.c
+++ /dev/null
@@ -1,2094 +0,0 @@
1/*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6#include "sched.h"
7
8#include <linux/slab.h>
9
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

/* System-wide default RT bandwidth pool. */
struct rt_bandwidth def_rt_bandwidth;

/*
 * Periodic hrtimer that replenishes RT runtime.  Forwards the timer by
 * whole periods and refills via do_sched_rt_period_timer(); stops
 * rearming once the replenishment reports the bandwidth pool idle.
 */
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		/* Timer is now ahead of 'now': done catching up. */
		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
34
/*
 * Initialize an RT bandwidth pool with the given replenishment @period
 * and @runtime budget (both in nanoseconds) and set up its timer.
 */
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}
46
/*
 * Arm the replenishment timer for @rt_b if bandwidth control is active
 * and the timer is not already running.
 */
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	/* RUNTIME_INF means unthrottled: no timer needed. */
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}
59
/*
 * Initialize an RT runqueue: empty priority array, SMP push/pull state,
 * and zeroed throttling accounting.
 */
void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}
86
#ifdef CONFIG_RT_GROUP_SCHED
/* Tear down a bandwidth pool: make sure its timer cannot fire again. */
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
92
/* With group scheduling, an entity without a my_q is an actual task. */
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

/*
 * Map an RT scheduling entity back to its task.  Only valid for
 * task entities (asserted under SCHED_DEBUG).
 */
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}
102
/* Group-scheduling variant: the owning rq is stored on the rt_rq. */
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}
107
/* Group-scheduling variant: the entity caches the rt_rq it is queued on. */
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}
112
/*
 * Free all per-cpu rt_rq and rt_se structures of @tg and its bandwidth
 * state.  Tolerates partially allocated groups (NULL arrays).
 */
void free_rt_sched_group(struct task_group *tg)
{
	int i;

	/* Bandwidth was only initialized if the rt_se array exists. */
	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}
130
/*
 * Link a freshly allocated (rt_rq, rt_se) pair into task group @tg for
 * @cpu, parenting the entity under @parent's queue (or the root rt_rq).
 */
void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	/* rt_se may be NULL for the root task group. */
	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}
157
/*
 * Allocate the per-cpu rt_rq/rt_se pairs for a new task group and wire
 * them under @parent.  Returns 1 on success, 0 on failure; the caller
 * cleans up via free_rt_sched_group() on failure.
 */
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	/* New groups inherit the default period but start with 0 runtime. */
	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq, cpu_rq(i));
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}
197
#else /* CONFIG_RT_GROUP_SCHED */

/* Without group scheduling every RT entity is a task. */
#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

/* The only rt_rq is the one embedded in the rq itself. */
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);
	struct rq *rq = task_rq(p);

	return &rq->rt;
}

/* No per-group state exists without RT group scheduling. */
void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */
227
#ifdef CONFIG_SMP

/* Number of CPUs in this root domain with more than one runnable RT task. */
static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}
234
/*
 * Mark @rq as RT-overloaded in its root domain so other cpus can pull
 * tasks from it.  Skipped while the rq is offline.
 */
static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rq->rd->rto_count);
}
251
/* Undo rt_set_overload(): drop @rq from the root domain's overload set. */
static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
261
262static void update_rt_migration(struct rt_rq *rt_rq)
263{
264 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
265 if (!rt_rq->overloaded) {
266 rt_set_overload(rq_of_rt_rq(rt_rq));
267 rt_rq->overloaded = 1;
268 }
269 } else if (rt_rq->overloaded) {
270 rt_clear_overload(rq_of_rt_rq(rt_rq));
271 rt_rq->overloaded = 0;
272 }
273}
274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{
277 struct task_struct *p;
278
279 if (!rt_entity_is_task(rt_se))
280 return;
281
282 p = rt_task_of(rt_se);
283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
284
285 rt_rq->rt_nr_total++;
286 if (p->nr_cpus_allowed > 1)
287 rt_rq->rt_nr_migratory++;
288
289 update_rt_migration(rt_rq);
290}
291
292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
293{
294 struct task_struct *p;
295
296 if (!rt_entity_is_task(rt_se))
297 return;
298
299 p = rt_task_of(rt_se);
300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
301
302 rt_rq->rt_nr_total--;
303 if (p->nr_cpus_allowed > 1)
304 rt_rq->rt_nr_migratory--;
305
306 update_rt_migration(rt_rq);
307}
308
/* Does this rq hold any task that could be pushed to another cpu? */
static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}
313
/*
 * Add @p to the rq's priority-sorted plist of pushable tasks.  The
 * del/init/add sequence implements a priority-refreshing (re)insert.
 */
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task (lower value == higher prio) */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}
324
/* Remove @p from the pushable list and recompute highest_prio.next. */
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}
337
338#else
339
/* !CONFIG_SMP: no task pushing, no migration accounting needed. */
static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
357
358#endif /* CONFIG_SMP */
359
/* An entity is queued iff its run_list is linked into a priority array. */
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}
364
365#ifdef CONFIG_RT_GROUP_SCHED
366
/* Runtime budget of this rt_rq; RUNTIME_INF when it has no group. */
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

/* Replenishment period of the owning group, in nanoseconds. */
static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
379
/* Group-sched iterator type: walk every task group's rt_rq on one cpu. */
typedef struct task_group *rt_rq_iter_t;

/*
 * Advance to the next task group on the RCU-protected task_groups list,
 * skipping autogroups.  Returns NULL once the list head is reached,
 * which terminates a for_each_rt_rq() walk.
 */
static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq) \
	for (iter = container_of(&task_groups, typeof(*iter), list); \
		(iter = next_task_group(iter)) && \
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
399
/* Link this rt_rq onto the rq's RCU list of leaf queues. */
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_add_rcu(&rt_rq->leaf_rt_rq_list,
			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
}

/* Remove this rt_rq from the rq's leaf list (when it becomes empty). */
static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_del_rcu(&rt_rq->leaf_rt_rq_list);
}
410
/* Iterate every rt_rq currently on the rq's leaf list (RCU-safe). */
#define for_each_leaf_rt_rq(rt_rq, rq) \
	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

/* Walk an entity and all of its ancestors up the group hierarchy. */
#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

/* The rt_rq owned by a group entity; NULL when rt_se is a task. */
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}
421
422static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
423static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
424
/*
 * (Re)enqueue the entity representing @rt_rq on its parent queue (e.g.
 * after unthrottling) and preempt the rq's current task if the queue
 * now holds a higher-priority (numerically lower) task.
 */
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (rt_se && !on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);
		if (rt_rq->highest_prio.curr < curr->prio)
			resched_task(curr);
	}
}
441
/* Take the entity representing @rt_rq off its parent, e.g. on throttle. */
static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_se && on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}
452
/*
 * A throttled rt_rq stays runnable while it still contains PI-boosted
 * tasks; only fully unboosted throttled queues are treated as off-rq.
 */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

/*
 * Is this entity priority-boosted?  A group entity is boosted when it
 * contains any boosted task; a task is boosted when its effective prio
 * differs from its normal prio (priority inheritance).
 */
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}
469
470#ifdef CONFIG_SMP
/* Cpus whose rt_rq's are replenished together: the root domain's span. */
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_rq(smp_processor_id())->rd->span;
}
#else
/* !CONFIG_SMP: replenish every online cpu. */
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

/* Map a group's bandwidth object back to its rt_rq on @cpu. */
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

/* Bandwidth control for this rt_rq comes from its owning group. */
static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}
492
493#else /* !CONFIG_RT_GROUP_SCHED */
494
/* No group scheduling: one rt_rq per cpu, budget from def_rt_bandwidth. */
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

/* Exactly one rt_rq per rq: the loop body runs once. */
#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

/* Entities have no parent hierarchy: only rt_se itself is visited. */
#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

/* Entities are always tasks here, never groups. */
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

/* No group entity to enqueue; just preempt if something is runnable. */
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_running)
		resched_task(rq_of_rt_rq(rt_rq)->curr);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

/* No PI-boost bookkeeping without groups. */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}
559
560#endif /* CONFIG_RT_GROUP_SCHED */
561
562#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 *
 * Called with this rt_rq's rt_runtime_lock dropped (see balance_runtime());
 * takes the bandwidth lock and each donor's rt_runtime_lock in turn.
 * Returns 1 if any runtime was transferred, 0 otherwise.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate its been disabled and disalow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}
617
/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours
 * (used on cpu-down), then mark its queues RUNTIME_INF so the borrow
 * logic ignores them.  Caller holds rq->lock.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we current have, that's the amount of runtime
		 * we lend and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}
699
/* Locked wrapper around __disable_runtime(). */
static void disable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__disable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
708
/*
 * Reset every rt_rq on this rq to its group's configured runtime,
 * clearing accrued time and throttle state — the undo of
 * __disable_runtime().  Caller holds rq->lock.
 */
static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}
732
/* Locked wrapper around __enable_runtime(). */
static void enable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__enable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
741
/*
 * Cpu-hotplug notifier: withdraw a cpu's runqueues from runtime sharing
 * before it goes down, and restore default bandwidth when it comes
 * (back) online or the down was aborted.
 */
int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		disable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		enable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	default:
		return NOTIFY_DONE;
	}
}
763
/*
 * Top up rt_rq->rt_runtime from neighbours when we have overrun it.
 * Called (and returns) with rt_rq->rt_runtime_lock held; the lock is
 * dropped across do_balance_runtime() to respect lock ordering.
 */
static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
/* UP: there is nobody to borrow runtime from. */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
784#endif /* CONFIG_SMP */
785
/*
 * Periodic replenishment from the rt_bandwidth timer: refresh rt_time
 * for every rt_rq in the period mask, unthrottle queues that are back
 * under budget, and requeue them.  @overrun is the number of whole
 * periods the timer overran.
 *
 * Returns 1 ("idle" — the timer may stop) when no queue needs further
 * service, 0 otherwise.
 */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled.  If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway.  Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * Force a clock update if the CPU was idle,
				 * lest wakeup -> unthrottle time accumulate.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq->skip_clock_update = -1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}
851
/*
 * Effective priority of an entity: the top priority within a group
 * entity's queue, or the task's own prio.
 */
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}
863
/*
 * Charge check after rt_time grew: throttle @rt_rq when it exceeds its
 * runtime budget (after trying to borrow more).  Returns 1 when the
 * queue got (or remains) throttled and must be dequeued, 0 otherwise.
 * Caller holds rt_rq->rt_runtime_lock.
 */
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	/* A budget at or above the full period can never be exceeded. */
	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			static bool once = false;

			rt_rq->rt_throttled = 1;

			/* Warn only on the first throttle system-wide. */
			if (!once) {
				once = true;
				printk_sched("sched: RT throttling activated\n");
			}
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}
912
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	u64 delta_exec;

	if (curr->sched_class != &rt_sched_class)
		return;

	/* Clamp apparent negative deltas (clock vs exec_start skew) to 0. */
	delta_exec = rq->clock_task - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq->clock_task;
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	/* Charge the exec time to every rt_rq up the group hierarchy. */
	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_task(curr);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}
957
958#if defined CONFIG_SMP
959
/* Publish a raised top priority on this rq into the cpupri structure. */
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

/* After a dequeue, republish the (possibly lowered) top priority. */
static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
984
985#endif /* CONFIG_SMP */
986
987#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
/* An entity of @prio was queued; raise the tracked top priority if needed. */
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

/* An entity of @prio was removed; recompute the queue's top priority. */
static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
1024
1025#else
1026
/* Priority tracking is only needed for SMP push/pull or group throttling. */
static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1029
1030#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1031
1032#ifdef CONFIG_RT_GROUP_SCHED
1033
/* Group accounting on enqueue: boost count plus bandwidth-timer kick. */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

/* Group accounting on dequeue: drop the boost count. */
static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

/* No groups: just make sure the global bandwidth timer is running. */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1063
1064#endif /* CONFIG_RT_GROUP_SCHED */
1065
/* Account a newly enqueued entity on @rt_rq (count, prio, SMP, group). */
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running++;

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

/* Reverse of inc_rt_tasks() for a dequeued entity. */
static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running--;

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}
1090
/* Link rt_se into its rt_rq's priority array, at head or tail. */
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if its throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * get throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	/* First entity on this queue: expose it on the rq's leaf list. */
	if (!rt_rq->rt_nr_running)
		list_add_leaf_rt_rq(rt_rq);

	if (head)
		list_add(&rt_se->run_list, queue);
	else
		list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}
1118
/* Unlink rt_se from its priority array; drop the leaf list when empty. */
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	/* Clear the bitmap bit when the last entity at this prio leaves. */
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
	if (!rt_rq->rt_nr_running)
		list_del_leaf_rt_rq(rt_rq);
}
1132
/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	/* First pass: reverse-link the hierarchy via ->back pointers. */
	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	/* Second pass: dequeue from the topmost ancestor downwards. */
	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}
1151
/* Enqueue rt_se and each of its parents after clearing the whole stack. */
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, head);
}

/*
 * Dequeue rt_se; ancestors that still have other runnable children are
 * re-enqueued so the rest of the hierarchy stays runnable.
 */
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, false);
	}
}
1170
/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	/* A wakeup resets rt_se->timeout (watchdog accounting lives elsewhere). */
	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

	/* Migratable, non-running tasks are candidates for pushing. */
	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);

	inc_nr_running(rq);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	/* Fold in runtime consumed so far before taking p off the rq. */
	update_curr_rt(rq);
	dequeue_rt_entity(rt_se);

	dequeue_pushable_task(rq, p);

	dec_nr_running(rq);
}
1201
/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

/* Requeue @p (and its group entities) at head or tail of each level. */
static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

/* sched_class::yield_task: drop to the tail of our priority list. */
static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}
1235
1236#ifdef CONFIG_SMP
1237static int find_lowest_rq(struct task_struct *task);
1238
/*
 * sched_class::select_task_rq for RT: choose a cpu for @p on wakeup or
 * fork.  Prefers a cpu running a lower-priority task over stacking RT
 * tasks on one runqueue.
 */
static int
select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;
	int cpu;

	cpu = task_cpu(p);

	if (p->nr_cpus_allowed == 1)
		goto out;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = ACCESS_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 */
	if (curr && unlikely(rt_task(curr)) &&
	    (curr->nr_cpus_allowed < 2 ||
	     curr->prio <= p->prio) &&
	    (p->nr_cpus_allowed > 1)) {
		int target = find_lowest_rq(p);

		if (target != -1)
			cpu = target;
	}
	rcu_read_unlock();

out:
	return cpu;
}
1296
/*
 * @p woke at the same priority as current.  If current can run on some
 * other cpu but @p cannot, requeue @p at the head and reschedule so the
 * push logic can move current away.
 */
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	/* Current is pinned here; nothing we can do. */
	if (rq->curr->nr_cpus_allowed == 1)
		return;

	/* @p itself has somewhere else to go; let the push logic handle it. */
	if (p->nr_cpus_allowed != 1
	    && cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/* No cpu would take current either; nothing to gain. */
	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * There appears to be other cpus that can accept
	 * current and none to run 'p', so lets reschedule
	 * to try and push current away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_task(rq->curr);
}
1317
1318#endif /* CONFIG_SMP */
1319
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	/* Strictly higher priority (numerically lower) always preempts. */
	if (p->prio < rq->curr->prio) {
		resched_task(rq->curr);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu.  If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}
1347
/*
 * Return the first entity on the highest-priority non-empty list of
 * @rt_rq.  Only valid while the rt_rq has queued entities (the bitmap
 * must have a set bit — BUG otherwise).  @rq is currently unused.
 */
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}
1364
/*
 * Core dispatch: walk the group hierarchy from the root rt_rq down to a
 * leaf and return its first task.  Returns NULL when no RT task is
 * runnable or the root queue is throttled.
 */
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct task_struct *p;
	struct rt_rq *rt_rq;

	rt_rq = &rq->rt;

	if (!rt_rq->rt_nr_running)
		return NULL;

	if (rt_rq_throttled(rt_rq))
		return NULL;

	/* Descend through group entities until we reach a task. */
	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	p = rt_task_of(rt_se);
	p->se.exec_start = rq->clock_task;

	return p;
}

/* sched_class::pick_next_task for SCHED_FIFO/SCHED_RR. */
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct task_struct *p = _pick_next_task_rt(rq);

	/* The running task is never eligible for pushing */
	if (p)
		dequeue_pushable_task(rq, p);

#ifdef CONFIG_SMP
	/*
	 * We detect this state here so that we can avoid taking the RQ
	 * lock again later if there is no need to push
	 */
	rq->post_schedule = has_pushable_tasks(rq);
#endif

	return p;
}
1409
/* sched_class::put_prev_task: account runtime, re-add to the push list. */
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}
1421
1422#ifdef CONFIG_SMP
1423
1424/* Only try algorithms three times */
1425#define RT_MAX_TRIES 3
1426
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{
1429 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1431 (p->nr_cpus_allowed > 1))
1432 return 1;
1433 return 0;
1434}
1435
/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
	struct task_struct *next = NULL;
	struct sched_rt_entity *rt_se;
	struct rt_prio_array *array;
	struct rt_rq *rt_rq;
	int idx;

	/*
	 * Scan every leaf rt_rq; within each, try priority levels from the
	 * top until a pushable candidate is found.  Levels no better than
	 * an already-found candidate are skipped.
	 */
	for_each_leaf_rt_rq(rt_rq, rq) {
		array = &rt_rq->active;
		idx = sched_find_first_bit(array->bitmap);
next_idx:
		if (idx >= MAX_RT_PRIO)
			continue;
		if (next && next->prio <= idx)
			continue;
		list_for_each_entry(rt_se, array->queue + idx, run_list) {
			struct task_struct *p;

			/* Group entities cannot be migrated themselves. */
			if (!rt_entity_is_task(rt_se))
				continue;

			p = rt_task_of(rt_se);
			if (pick_rt_task(rq, p, cpu)) {
				next = p;
				break;
			}
		}
		if (!next) {
			/* Nothing at this level; fall to the next set bit. */
			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
			goto next_idx;
		}
	}

	return next;
}
1473
1474static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1475
/*
 * Pick the best target cpu for pushing @task: one of the cpus whose
 * current priority is lower than @task's (per cpupri), preferring in
 * order the task's last cpu, this_cpu, topologically close cpus via
 * the sched domains, and finally any qualifying cpu.
 * Returns a cpu id, or -1 when no suitable target exists.
 */
static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	/* per-cpu scratch mask allocated by init_sched_rt_class() */
	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	/* RCU protects the domain tree walk */
	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
	return -1;
}
1549
/* Will lock the rq it finds */
/*
 * Find a lower-priority runqueue for @task and return it locked
 * (together with @rq, via double_lock_balance).  Returns NULL after
 * RT_MAX_TRIES failed attempts or when @task can no longer be moved.
 * Caller holds rq->lock; beware that double_lock_balance() may drop
 * and retake it, which is why the revalidation below is needed.
 */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       tsk_cpus_allowed(task)) ||
				     task_running(rq, task) ||
				     !task->on_rq)) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}
1596
/*
 * Return the highest-priority task queued on @rq's pushable list, or
 * NULL when the list is empty.  The BUG_ONs assert the pushable-list
 * invariants: the task lives on this rq, is not the one running, is
 * migratable, queued, and an RT task.
 * NOTE(review): accesses rq->rt.pushable_tasks, so this presumably
 * relies on the caller holding rq->lock — confirm against callers.
 */
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!p->on_rq);
	BUG_ON(!rt_task(p));

	return p;
}
1616
/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 *
 * Returns 1 when a task was pushed, 0 otherwise.  Caller holds
 * rq->lock; find_lock_lowest_rq() may temporarily drop it, hence the
 * get/put_task_struct pinning and the revalidation on failure.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	/* the running task is never on the pushable list */
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to. Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	/* actually migrate: dequeue here, switch cpu, enqueue there */
	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	ret = 1;

	resched_task(lowest_rq->curr);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return ret;
}
1703
/*
 * Drain the pushable list: keep pushing RT tasks to other cpus until
 * push_rt_task() reports that nothing more could be moved.
 */
static void push_rt_tasks(struct rq *rq)
{
	for (;;) {
		if (!push_rt_task(rq))
			break;
	}
}
1710
/*
 * Pull higher-priority RT tasks from other overloaded runqueues onto
 * @this_rq.  Returns 1 when at least one task was pulled, 0 otherwise.
 * Caller holds this_rq->lock; double_lock_balance() may drop it while
 * acquiring both locks in order.
 */
static int pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;

	/* rd->rto_mask only has candidates when someone is overloaded */
	if (likely(!rt_overloaded(this_rq)))
		return 0;

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * Are there still pullable RT tasks?
		 */
		if (src_rq->rt.rt_nr_running <= 1)
			goto skip;

		p = pick_next_highest_task_rt(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->on_rq);

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}
1789
1790static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1791{
1792 /* Try to pull RT tasks here if we lower this rq's prio */
1793 if (rq->rt.highest_prio.curr > prev->prio)
1794 pull_rt_task(rq);
1795}
1796
/*
 * sched_class::post_schedule hook, run after the context switch when
 * pick_next_task_rt() set rq->post_schedule: push surplus RT tasks
 * away now that rq->lock has been dropped and retaken.
 */
static void post_schedule_rt(struct rq *rq)
{
	push_rt_tasks(rq);
}
1801
/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * Push only when all of the following hold: @p isn't running here,
	 * the current task won't reschedule anyway, there is something
	 * pushable, @p itself can migrate, and current is an RT task that
	 * either can't move (nr_cpus_allowed < 2) or outranks @p.
	 */
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    rt_task(rq->curr) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}
1817
/*
 * sched_class::set_cpus_allowed hook: keep the rt_nr_migratory count
 * and the pushable list consistent when a queued RT task's affinity
 * changes its migratability (1 cpu <-> several cpus).
 */
static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	struct rq *rq;
	int weight;

	BUG_ON(!rt_task(p));

	/* accounting below only applies to queued tasks */
	if (!p->on_rq)
		return;

	weight = cpumask_weight(new_mask);

	/*
	 * Only update if the process changes its state from whether it
	 * can migrate or not.
	 */
	if ((p->nr_cpus_allowed > 1) == (weight > 1))
		return;

	rq = task_rq(p);

	/*
	 * The process used to be able to migrate OR it can now migrate
	 */
	if (weight <= 1) {
		/* the running task is never on the pushable list */
		if (!task_current(rq, p))
			dequeue_pushable_task(rq, p);
		BUG_ON(!rq->rt.rt_nr_migratory);
		rq->rt.rt_nr_migratory--;
	} else {
		if (!task_current(rq, p))
			enqueue_pushable_task(rq, p);
		rq->rt.rt_nr_migratory++;
	}

	update_rt_migration(&rq->rt);
}
1856
/* Assumes rq->lock is held */
/*
 * Attach this rq's RT state to its root domain when the cpu comes
 * online: republish overload state, re-enable RT runtime accounting,
 * and advertise the rq's highest RT prio to cpupri.
 */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}
1867
/* Assumes rq->lock is held */
/*
 * Mirror of rq_online_rt() for cpu offlining: clear the overload bit,
 * give back borrowed RT runtime, and mark the cpu invalid in cpupri.
 */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}
1878
1879/*
1880 * When switch from the rt queue, we bring ourselves to a position
1881 * that we might want to pull RT tasks from other runqueues.
1882 */
1883static void switched_from_rt(struct rq *rq, struct task_struct *p)
1884{
1885 /*
1886 * If there are other RT tasks then we will reschedule
1887 * and the scheduling of the other RT tasks will handle
1888 * the balancing. But if we are the last RT task
1889 * we may need to handle the pulling of RT tasks
1890 * now.
1891 */
1892 if (p->on_rq && !rq->rt.rt_nr_running)
1893 pull_rt_task(rq);
1894}
1895
/*
 * Boot-time setup: allocate each cpu's local_cpu_mask scratch cpumask
 * (used by find_lowest_rq()) on that cpu's own NUMA node.
 */
void init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
1905#endif /* CONFIG_SMP */
1906
/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	int check_resched = 1;

	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
		if (rq->rt.overloaded && push_rt_task(rq) &&
		    /* Don't resched if we changed runqueues */
		    rq != task_rq(p))
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}
1934
/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 * Note: higher numeric prio == lower scheduling priority.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	/* de-queued tasks get rebalanced when they are enqueued again */
	if (!p->on_rq)
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			pull_rt_task(rq);
		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule. Note, the above pull_rt_task
		 * can release the rq lock and p could migrate.
		 * Only reschedule if p is still on the same runqueue.
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_task(p);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_task(p);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if it is
		 * greater than the current running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}
1976
/*
 * RLIMIT_RTTIME enforcement, called once per tick from task_tick_rt():
 * count ticks spent in the RT class and, past the soft limit, set
 * cputime_expires.sched_exp — presumably so the posix cpu-timer path
 * delivers the limit signal (TODO confirm against the timer code).
 */
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		p->rt.timeout++;
		/* convert the usec limit into a tick count */
		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}
1994
/*
 * sched_class::task_tick for the RT class: charge runtime, run the
 * RLIMIT_RTTIME watchdog, and handle SCHED_RR round-robin timeslices.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	/* slice exhausted: refill and rotate to the queue tail */
	p->rt.time_slice = RR_TIMESLICE;

	/*
	 * Requeue to the end of queue if we (and all of our ancestors) are the
	 * only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		/* run_list.prev != .next means more than one entity queued */
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			set_tsk_need_resched(p);
			return;
		}
	}
}
2027
/*
 * sched_class::set_curr_task: rq->curr just (re)entered the RT class;
 * restart its exec-time window and take it off the pushable list.
 */
static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq->clock_task;

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}
2037
2038static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2039{
2040 /*
2041 * Time slice is 0 for SCHED_FIFO tasks
2042 */
2043 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE;
2045 else
2046 return 0;
2047}
2048
/*
 * The RT scheduling class callback table; .next chains to the fair
 * class, giving the class-iteration order used by the core scheduler.
 */
const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed       = set_cpus_allowed_rt,
	.rq_online              = rq_online_rt,
	.rq_offline             = rq_offline_rt,
	.pre_schedule		= pre_schedule_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task          = set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,
};
2080
2081#ifdef CONFIG_SCHED_DEBUG
2082extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2083
/*
 * Dump every rt_rq reachable from @cpu's runqueue into seq_file @m
 * (debugfs /proc sched_debug path).  RCU protects the rt_rq walk.
 */
void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
2094#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
deleted file mode 100644
index fc886441436..00000000000
--- a/kernel/sched/sched.h
+++ /dev/null
@@ -1,1241 +0,0 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 */
40
41/*
42 * single value that denotes runtime == period, ie unlimited time.
43 */
44#define RUNTIME_INF ((u64)~0ULL)
45
46static inline int rt_policy(int policy)
47{
48 if (policy == SCHED_FIFO || policy == SCHED_RR)
49 return 1;
50 return 0;
51}
52
/* Nonzero when @p currently runs under SCHED_FIFO or SCHED_RR. */
static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}
57
/*
 * This is the priority-queue data structure of the RT scheduling class:
 * one FIFO list per priority level plus a bitmap of non-empty levels
 * for O(1) lookup of the highest queued priority.
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};
65
/*
 * RT bandwidth control: rt_runtime of execution allowed per rt_period,
 * replenished by rt_period_timer.
 */
struct rt_bandwidth {
	/* nests inside the rq lock: */
	raw_spinlock_t		rt_runtime_lock;
	ktime_t			rt_period;
	u64			rt_runtime;
	struct hrtimer		rt_period_timer;
};
73
74extern struct mutex sched_domains_mutex;
75
76#ifdef CONFIG_CGROUP_SCHED
77
78#include <linux/cgroup.h>
79
80struct cfs_rq;
81struct rt_rq;
82
83extern struct list_head task_groups;
84
/* Per-task-group CFS quota state; empty unless CONFIG_CFS_BANDWIDTH. */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
	raw_spinlock_t lock;
	ktime_t period;
	u64 quota, runtime;	/* quota per period / remaining runtime */
	s64 hierarchal_quota;
	u64 runtime_expires;

	int idle, timer_active;
	/* period_timer refills quota; slack_timer returns unused slices */
	struct hrtimer period_timer, slack_timer;
	struct list_head throttled_cfs_rq;

	/* statistics */
	int nr_periods, nr_throttled;
	u64 throttled_time;
#endif
};
102
/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;

	atomic_t load_weight;
	atomic64_t load_avg;
	atomic_t runnable_avg;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* per-cpu RT entities/runqueues, mirroring the CFS pair above */
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	/* group hierarchy: parent link plus sibling/children lists */
	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup *autogroup;
#endif

	struct cfs_bandwidth cfs_bandwidth;
};
139
140#ifdef CONFIG_FAIR_GROUP_SCHED
141#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
142
143/*
144 * A weight of 0 or 1 can cause arithmetics problems.
145 * A weight of a cfs_rq is the sum of weights of which entities
146 * are queued on this cfs_rq, so a weight of a entity should not be
147 * too large, so as the shares value of a task group.
148 * (The default weight is 1024 - so there's no practical
149 * limitation from this.)
150 */
151#define MIN_SHARES (1UL << 1)
152#define MAX_SHARES (1UL << 18)
153#endif
154
155/* Default task group.
156 * Every task in system belong to this group at bootup.
157 */
158extern struct task_group root_task_group;
159
160typedef int (*tg_visitor)(struct task_group *, void *);
161
162extern int walk_tg_tree_from(struct task_group *from,
163 tg_visitor down, tg_visitor up, void *data);
164
165/*
166 * Iterate the full tree, calling @down when first entering a node and @up when
167 * leaving it for the final time.
168 *
169 * Caller must hold rcu_lock or sufficient equivalent.
170 */
/* Walk the whole task-group tree starting at root_task_group. */
static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
	return walk_tg_tree_from(&root_task_group, down, up, data);
}
175
176extern int tg_nop(struct task_group *tg, void *data);
177
178extern void free_fair_sched_group(struct task_group *tg);
179extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
180extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
181extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
182 struct sched_entity *se, int cpu,
183 struct sched_entity *parent);
184extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
185extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
186
187extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
188extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
189extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
190
191extern void free_rt_sched_group(struct task_group *tg);
192extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
193extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
194 struct sched_rt_entity *rt_se, int cpu,
195 struct sched_rt_entity *parent);
196
197#else /* CONFIG_CGROUP_SCHED */
198
199struct cfs_bandwidth { };
200
201#endif /* CONFIG_CGROUP_SCHED */
202
/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned int nr_running, h_nr_running;	/* direct / hierarchical counts */

	u64 exec_clock;
	u64 min_vruntime;
#ifndef CONFIG_64BIT
	/* second copy for lockless reads on 32-bit (seqlock-style pairing) */
	u64 min_vruntime_copy;
#endif

	/* rbtree of queued entities ordered by vruntime; cached leftmost */
	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last, *skip;

#ifdef CONFIG_SCHED_DEBUG
	unsigned int nr_spread_over;
#endif

#ifdef CONFIG_SMP
/*
 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
 * removed when useful for applications beyond shares distribution (e.g.
 * load-balance).
 */
#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * CFS Load tracking
	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
	 * This allows for the description of both thread and group usage (in
	 * the FAIR_GROUP_SCHED case).
	 */
	u64 runnable_load_avg, blocked_load_avg;
	atomic64_t decay_counter, removed_load;
	u64 last_decay;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* These always depend on CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
	u32 tg_runnable_contrib;
	u64 tg_load_contrib;
#endif /* CONFIG_FAIR_GROUP_SCHED */

	/*
	 * h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_clock, throttled_clock_task;
	u64 throttled_clock_task_time;
	int throttled, throttle_count;
	struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
286
/* RT throttling is enabled unless sysctl_sched_rt_runtime is negative. */
static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}
291
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;	/* per-prio FIFO queues + bitmap */
	unsigned int rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	/* push/pull bookkeeping: migratable-task counts and overload flag */
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	/* RT bandwidth accounting for this runqueue */
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;	/* PI-boosted tasks, exempt from throttling */

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};
324
325#ifdef CONFIG_SMP
326
327/*
328 * We add the notion of a root-domain which will be used to define per-domain
329 * variables. Each exclusive cpuset essentially defines an island domain by
330 * fully partitioning the member cpus from any other cpuset. Whenever a new
331 * exclusive cpuset is created, we also create and attach a new root-domain
332 * object.
333 *
334 */
struct root_domain {
	atomic_t refcount;
	atomic_t rto_count;	/* number of RT-overloaded cpus in rto_mask */
	struct rcu_head rcu;
	cpumask_var_t span;	/* all cpus in this root domain */
	cpumask_var_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	struct cpupri cpupri;	/* per-domain cpu-priority search structure */
};
349
350extern struct root_domain def_root_domain;
351
352#endif /* CONFIG_SMP */
353
354/*
355 * This is the main, per-CPU runqueue data structure.
356 *
357 * Locking rule: those places that want to lock multiple runqueues
358 * (such as the load balancing or the thread migration code), lock
359 * acquire operations must be ordered by ascending &runqueue.
360 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned int nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
	u64 nohz_stamp;
	unsigned long nohz_flags;
#endif
	int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	/* per-class sub-runqueues embedded in the rq */
	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#ifdef CONFIG_SMP
	unsigned long h_load_throttle;
#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	/* current / idle / stop-class tasks of this cpu */
	struct task_struct *curr, *idle, *stop;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	/* clock: raw rq clock; clock_task: excludes time not charged to tasks */
	u64 clock;
	u64 clock_task;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned long cpu_power;

	unsigned char idle_balance;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	struct cpu_stop_work active_balance_work;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	struct list_head cfs_tasks;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	u64 prev_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
	u64 prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	u64 prev_steal_time_rq;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;
#endif

#ifdef CONFIG_SMP
	struct llist_head wake_list;
#endif

	struct sched_avg avg;
};
486
/* cpu number this runqueue belongs to (always 0 on UP). */
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}
495
496DECLARE_PER_CPU(struct rq, runqueues);
497
498#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
499#define this_rq() (&__get_cpu_var(runqueues))
500#define task_rq(p) cpu_rq(task_cpu(p))
501#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
502#define raw_rq() (&__raw_get_cpu_var(runqueues))
503
504#ifdef CONFIG_SMP
505
506#define rcu_dereference_check_sched_domain(p) \
507 rcu_dereference_check((p), \
508 lockdep_is_held(&sched_domains_mutex))
509
510/*
511 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
512 * See detach_destroy_domains: synchronize_sched for details.
513 *
514 * The domain tree of any CPU may only be accessed from within
515 * preempt-disabled sections.
516 */
517#define for_each_domain(cpu, __sd) \
518 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
519 __sd; __sd = __sd->parent)
520
521#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
522
523/**
524 * highest_flag_domain - Return highest sched_domain containing flag.
525 * @cpu: The cpu whose highest level of sched domain is to
526 * be returned.
527 * @flag: The flag to check for the highest sched_domain
528 * for the given cpu.
529 *
530 * Returns the highest sched_domain of a cpu which contains the given flag.
531 */
532static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
533{
534 struct sched_domain *sd, *hsd = NULL;
535
536 for_each_domain(cpu, sd) {
537 if (!(sd->flags & flag))
538 break;
539 hsd = sd;
540 }
541
542 return hsd;
543}
544
545DECLARE_PER_CPU(struct sched_domain *, sd_llc);
546DECLARE_PER_CPU(int, sd_llc_id);
547
548extern int group_balance_cpu(struct sched_group *sg);
549
550#endif /* CONFIG_SMP */
551
552#include "stats.h"
553#include "auto_group.h"
554
555#ifdef CONFIG_CGROUP_SCHED
556
557/*
558 * Return the group to which this tasks belongs.
559 *
560 * We cannot use task_subsys_state() and friends because the cgroup
561 * subsystem changes that value before the cgroup_subsys::attach() method
562 * is called, therefore we cannot pin it and might observe the wrong value.
563 *
564 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
565 * core changes this before calling sched_move_task().
566 *
567 * Instead we use a 'copy' which is updated from sched_move_task() while
568 * holding both task_struct::pi_lock and rq::lock.
569 */
/* Return @p's task group: a copy kept coherent by sched_move_task(). */
static inline struct task_group *task_group(struct task_struct *p)
{
        return p->sched_task_group;
}
574
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
        struct task_group *tg = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* Point the fair entity at its group's per-cpu runqueue and parent. */
        p->se.cfs_rq = tg->cfs_rq[cpu];
        p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        /* Likewise for the RT entity. */
        p->rt.rt_rq  = tg->rt_rq[cpu];
        p->rt.parent = tg->rt_se[cpu];
#endif
}
592
593#else /* CONFIG_CGROUP_SCHED */
594
595static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
596static inline struct task_group *task_group(struct task_struct *p)
597{
598 return NULL;
599}
600
601#endif /* CONFIG_CGROUP_SCHED */
602
/*
 * Record @p's new CPU: update group runqueue linkage first, then publish
 * thread_info->cpu; the write barrier orders the per-task updates before
 * the publication.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
        set_task_rq(p, cpu);
#ifdef CONFIG_SMP
        /*
         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
         * successfully executed on another CPU. We must ensure that updates of
         * per-task data have been completed by this moment.
         */
        smp_wmb();
        task_thread_info(p)->cpu = cpu;
#endif
}
616
617/*
618 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
619 */
620#ifdef CONFIG_SCHED_DEBUG
621# include <linux/static_key.h>
622# define const_debug __read_mostly
623#else
624# define const_debug const
625#endif
626
627extern const_debug unsigned int sysctl_sched_features;
628
629#define SCHED_FEAT(name, enabled) \
630 __SCHED_FEAT_##name ,
631
632enum {
633#include "features.h"
634 __SCHED_FEAT_NR,
635};
636
637#undef SCHED_FEAT
638
639#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
640static __always_inline bool static_branch__true(struct static_key *key)
641{
642 return static_key_true(key); /* Not out of line branch. */
643}
644
645static __always_inline bool static_branch__false(struct static_key *key)
646{
647 return static_key_false(key); /* Out of line branch. */
648}
649
650#define SCHED_FEAT(name, enabled) \
651static __always_inline bool static_branch_##name(struct static_key *key) \
652{ \
653 return static_branch__##enabled(key); \
654}
655
656#include "features.h"
657
658#undef SCHED_FEAT
659
660extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
661#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
662#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
663#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
664#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
665
666#ifdef CONFIG_NUMA_BALANCING
667#define sched_feat_numa(x) sched_feat(x)
668#ifdef CONFIG_SCHED_DEBUG
669#define numabalancing_enabled sched_feat_numa(NUMA)
670#else
671extern bool numabalancing_enabled;
672#endif /* CONFIG_SCHED_DEBUG */
673#else
674#define sched_feat_numa(x) (0)
675#define numabalancing_enabled (0)
676#endif /* CONFIG_NUMA_BALANCING */
677
/* Global RT period sysctl (microseconds) converted to nanoseconds. */
static inline u64 global_rt_period(void)
{
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}
682
683static inline u64 global_rt_runtime(void)
684{
685 if (sysctl_sched_rt_runtime < 0)
686 return RUNTIME_INF;
687
688 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
689}
690
691
692
/* Is @p the task currently installed as rq->curr? */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
        return rq->curr == p;
}
697
/*
 * Is @p still running on a CPU?  On SMP ->on_cpu stays set until
 * finish_lock_switch() clears it; on UP this is just task_current().
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
        return p->on_cpu;
#else
        return task_current(rq, p);
#endif
}
706
707
708#ifndef prepare_arch_switch
709# define prepare_arch_switch(next) do { } while (0)
710#endif
711#ifndef finish_arch_switch
712# define finish_arch_switch(prev) do { } while (0)
713#endif
714#ifndef finish_arch_post_lock_switch
715# define finish_arch_post_lock_switch() do { } while (0)
716#endif
717
718#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/* First half of the ctxsw locking protocol: mark @next as on a CPU. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
        /*
         * We can optimise this out completely for !SMP, because the
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
        next->on_cpu = 1;
#endif
}
730
/*
 * Second half of the ctxsw locking protocol: clear prev->on_cpu (after a
 * write barrier), re-attribute rq->lock ownership to the new current task
 * for the debugging/lockdep machinery, then release the runqueue lock.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->on_cpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
        prev->on_cpu = 0;
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
#endif
        /*
         * If we are tracking spinlock dependencies then we have to
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

        raw_spin_unlock_irq(&rq->lock);
}
755
756#else /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
 * __ARCH_WANT_UNLOCKED_CTXSW variant: mark @next on-CPU and drop the
 * runqueue lock before the switch (the arch does the switch unlocked).
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
        /*
         * We can optimise this out completely for !SMP, because the
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
        next->on_cpu = 1;
#endif
        raw_spin_unlock(&rq->lock);
}
769
/*
 * __ARCH_WANT_UNLOCKED_CTXSW variant: the lock was already dropped in
 * prepare_lock_switch(), so only clear prev->on_cpu and re-enable IRQs.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->on_cpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
        prev->on_cpu = 0;
#endif
        local_irq_enable();
}
783#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
784
785
/* Add @inc to @lw's weight and invalidate the cached inverse weight. */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
        lw->weight += inc;
        lw->inv_weight = 0;
}

/* Subtract @dec from @lw's weight and invalidate the cached inverse. */
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
        lw->weight -= dec;
        lw->inv_weight = 0;
}

/* Set @lw's weight to @w outright and invalidate the cached inverse. */
static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
        lw->weight = w;
        lw->inv_weight = 0;
}
803
804/*
805 * To aid in avoiding the subversion of "niceness" due to uneven distribution
806 * of tasks with abnormal "nice" values across CPUs the contribution that
807 * each task makes to its run queue's load is weighted according to its
808 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
809 * scaled version of the new time slice allocation that they receive on time
810 * slice expiry etc.
811 */
812
813#define WEIGHT_IDLEPRIO 3
814#define WMULT_IDLEPRIO 1431655765
815
816/*
817 * Nice levels are multiplicative, with a gentle 10% change for every
818 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
819 * nice 1, it will get ~10% less CPU time than another CPU-bound task
820 * that remained on nice 0.
821 *
822 * The "10% effect" is relative and cumulative: from _any_ nice level,
823 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
824 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
825 * If a task goes up by ~10% and another task goes down by ~10% then
826 * the relative distance between them is ~25%.)
827 */
/* Load weight per nice level, -20 (index 0) through 19; nice 0 == 1024. */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
838
839/*
840 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
841 *
842 * In cases where the weight does not change often, we can use the
843 * precalculated inverse to speed up arithmetics by turning divisions
844 * into multiplications:
845 */
/* Precomputed 2^32 / prio_to_weight[i] for each nice level (see above). */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
856
857/* Time spent by the tasks of the cpu accounting group executing in ... */
858enum cpuacct_stat_index {
859 CPUACCT_STAT_USER, /* ... user mode */
860 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
861
862 CPUACCT_STAT_NSTATS,
863};
864
865
866#define sched_class_highest (&stop_sched_class)
867#define for_each_class(class) \
868 for (class = sched_class_highest; class; class = class->next)
869
870extern const struct sched_class stop_sched_class;
871extern const struct sched_class rt_sched_class;
872extern const struct sched_class fair_sched_class;
873extern const struct sched_class idle_sched_class;
874
875
876#ifdef CONFIG_SMP
877
878extern void trigger_load_balance(struct rq *rq, int cpu);
879extern void idle_balance(int this_cpu, struct rq *this_rq);
880
881#else /* CONFIG_SMP */
882
883static inline void idle_balance(int cpu, struct rq *rq)
884{
885}
886
887#endif
888
889extern void sysrq_sched_debug_show(void);
890extern void sched_init_granularity(void);
891extern void update_max_interval(void);
892extern void update_group_power(struct sched_domain *sd, int cpu);
893extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
894extern void init_sched_rt_class(void);
895extern void init_sched_fair_class(void);
896
897extern void resched_task(struct task_struct *p);
898extern void resched_cpu(int cpu);
899
900extern struct rt_bandwidth def_rt_bandwidth;
901extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
902
903extern void update_idle_cpu_load(struct rq *this_rq);
904
905#ifdef CONFIG_CGROUP_CPUACCT
906#include <linux/cgroup.h>
907/* track cpu usage of a group of tasks and its child groups */
908struct cpuacct {
909 struct cgroup_subsys_state css;
910 /* cpuusage holds pointer to a u64-type object on every cpu */
911 u64 __percpu *cpuusage;
912 struct kernel_cpustat __percpu *cpustat;
913};
914
915extern struct cgroup_subsys cpuacct_subsys;
916extern struct cpuacct root_cpuacct;
917
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
        return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
                            struct cpuacct, css);
}
924
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
        return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
                            struct cpuacct, css);
}
931
932static inline struct cpuacct *parent_ca(struct cpuacct *ca)
933{
934 if (!ca || !ca->css.cgroup->parent)
935 return NULL;
936 return cgroup_ca(ca->css.cgroup->parent);
937}
938
939extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
940#else
941static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
942#endif
943
944#ifdef CONFIG_PARAVIRT
945static inline u64 steal_ticks(u64 steal)
946{
947 if (unlikely(steal > NSEC_PER_SEC))
948 return div_u64(steal, TICK_NSEC);
949
950 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
951}
952#endif
953
/* Bump the runqueue's runnable-task count (rq->lock held by callers). */
static inline void inc_nr_running(struct rq *rq)
{
        rq->nr_running++;
}

/* Drop the runqueue's runnable-task count (rq->lock held by callers). */
static inline void dec_nr_running(struct rq *rq)
{
        rq->nr_running--;
}
963
964extern void update_rq_clock(struct rq *rq);
965
966extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
967extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
968
969extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
970
971extern const_debug unsigned int sysctl_sched_time_avg;
972extern const_debug unsigned int sysctl_sched_nr_migrate;
973extern const_debug unsigned int sysctl_sched_migration_cost;
974
/* Half the sched_time_avg sysctl (milliseconds), in nanoseconds. */
static inline u64 sched_avg_period(void)
{
        return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
979
980#ifdef CONFIG_SCHED_HRTICK
981
982/*
983 * Use hrtick when:
984 * - enabled by features
985 * - hrtimer is actually high res
986 */
static inline int hrtick_enabled(struct rq *rq)
{
        if (!sched_feat(HRTICK))
                return 0;
        if (!cpu_active(cpu_of(rq)))
                return 0;       /* never arm the hrtick on an inactive CPU */
        return hrtimer_is_hres_active(&rq->hrtick_timer);
}
995
996void hrtick_start(struct rq *rq, u64 delay);
997
998#else
999
/* !CONFIG_SCHED_HRTICK: hrtick is never available. */
static inline int hrtick_enabled(struct rq *rq)
{
        return 0;
}
1004
1005#endif /* CONFIG_SCHED_HRTICK */
1006
1007#ifdef CONFIG_SMP
1008extern void sched_avg_update(struct rq *rq);
/* Accumulate @rt_delta of RT execution time into rq->rt_avg and decay it. */
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
        rq->rt_avg += rt_delta;
        sched_avg_update(rq);
}
1014#else
1015static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1016static inline void sched_avg_update(struct rq *rq) { }
1017#endif
1018
1019extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
1020
1021#ifdef CONFIG_SMP
1022#ifdef CONFIG_PREEMPT
1023
1024static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1025
1026/*
1027 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1028 * way at the expense of forcing extra atomic operations in all
1029 * invocations. This assures that the double_lock is acquired using the
1030 * same underlying policy as the spinlock_t on this architecture, which
1031 * reduces latency compared to the unfair variant below. However, it
1032 * also adds more overhead and therefore may reduce throughput.
1033 */
/*
 * Fair variant: always drop this_rq->lock and take both locks in address
 * order via double_rq_lock(); returns 1 because this_rq was released.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
{
        raw_spin_unlock(&this_rq->lock);
        double_rq_lock(this_rq, busiest);

        return 1;
}
1044
1045#else
1046/*
1047 * Unfair double_lock_balance: Optimizes throughput at the expense of
1048 * latency by eliminating extra atomic operations when the locks are
1049 * already in proper order on entry. This favors lower cpu-ids and will
1050 * grant the double lock to lower cpus over higher ids under contention,
1051 * regardless of entry order into the function.
1052 */
/*
 * Unfair variant: try the busiest lock opportunistically; only when that
 * fails and the ordering is wrong (busiest has the lower address) is
 * this_rq->lock dropped and both locks retaken in address order.
 * Returns 1 iff this_rq->lock was released in the process.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
{
        int ret = 0;

        if (unlikely(!raw_spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        /* lock-order violation: restart in address order */
                        raw_spin_unlock(&this_rq->lock);
                        raw_spin_lock(&busiest->lock);
                        raw_spin_lock_nested(&this_rq->lock,
                                              SINGLE_DEPTH_NESTING);
                        ret = 1;
                } else
                        raw_spin_lock_nested(&busiest->lock,
                                              SINGLE_DEPTH_NESTING);
        }
        return ret;
}
1073
1074#endif /* CONFIG_PREEMPT */
1075
1076/*
1077 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1078 */
/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 * Returns 1 if this_rq->lock was dropped and retaken along the way.
 */
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
        if (unlikely(!irqs_disabled())) {
                /* printk() doesn't work good under rq->lock */
                raw_spin_unlock(&this_rq->lock);
                BUG_ON(1);
        }

        return _double_lock_balance(this_rq, busiest);
}
1089
/* Release busiest and restore this_rq's lockdep subclass to 0. */
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
{
        raw_spin_unlock(&busiest->lock);
        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
1096
1097/*
1098 * double_rq_lock - safely lock two runqueues
1099 *
1100 * Note this does not disable interrupts like task_rq_lock,
1101 * you need to do so manually before calling.
1102 */
/*
 * double_rq_lock - safely lock two runqueues
 *
 * Locks are always taken in ascending address order to avoid ABBA
 * deadlock; equal runqueues take the lock once and fake the second.
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
        __acquires(rq1->lock)
        __acquires(rq2->lock)
{
        BUG_ON(!irqs_disabled());
        if (rq1 == rq2) {
                raw_spin_lock(&rq1->lock);
                __acquire(rq2->lock);   /* Fake it out ;) */
        } else {
                if (rq1 < rq2) {
                        raw_spin_lock(&rq1->lock);
                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
                } else {
                        raw_spin_lock(&rq2->lock);
                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
                }
        }
}
1121
1122/*
1123 * double_rq_unlock - safely unlock two runqueues
1124 *
1125 * Note this does not restore interrupts like task_rq_unlock,
1126 * you need to do so manually after calling.
1127 */
1128static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1129 __releases(rq1->lock)
1130 __releases(rq2->lock)
1131{
1132 raw_spin_unlock(&rq1->lock);
1133 if (rq1 != rq2)
1134 raw_spin_unlock(&rq2->lock);
1135 else
1136 __release(rq2->lock);
1137}
1138
1139#else /* CONFIG_SMP */
1140
1141/*
1142 * double_rq_lock - safely lock two runqueues
1143 *
1144 * Note this does not disable interrupts like task_rq_lock,
1145 * you need to do so manually before calling.
1146 */
/*
 * double_rq_lock - safely lock two runqueues (UP build)
 *
 * With one CPU both arguments must be the same runqueue; the second
 * acquisition is annotation only.
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
        __acquires(rq1->lock)
        __acquires(rq2->lock)
{
        BUG_ON(!irqs_disabled());
        BUG_ON(rq1 != rq2);
        raw_spin_lock(&rq1->lock);
        __acquire(rq2->lock);   /* Fake it out ;) */
}
1156
1157/*
1158 * double_rq_unlock - safely unlock two runqueues
1159 *
1160 * Note this does not restore interrupts like task_rq_unlock,
1161 * you need to do so manually after calling.
1162 */
/*
 * double_rq_unlock - safely unlock two runqueues (UP build)
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
        __releases(rq1->lock)
        __releases(rq2->lock)
{
        BUG_ON(rq1 != rq2);
        raw_spin_unlock(&rq1->lock);
        __release(rq2->lock);
}
1171
1172#endif
1173
1174extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1175extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1176extern void print_cfs_stats(struct seq_file *m, int cpu);
1177extern void print_rt_stats(struct seq_file *m, int cpu);
1178
1179extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1180extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1181
1182extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1183
1184#ifdef CONFIG_NO_HZ
1185enum rq_nohz_flag_bits {
1186 NOHZ_TICK_STOPPED,
1187 NOHZ_BALANCE_KICK,
1188 NOHZ_IDLE,
1189};
1190
1191#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1192#endif
1193
1194#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1195
1196DECLARE_PER_CPU(u64, cpu_hardirq_time);
1197DECLARE_PER_CPU(u64, cpu_softirq_time);
1198
1199#ifndef CONFIG_64BIT
1200DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1201
/* Open a write-side critical section for the 32-bit irq_time seqcount. */
static inline void irq_time_write_begin(void)
{
        __this_cpu_inc(irq_time_seq.sequence);
        smp_wmb();      /* readers must see an odd sequence before updates */
}

/* Close the write-side critical section opened above. */
static inline void irq_time_write_end(void)
{
        smp_wmb();      /* updates must be visible before the even sequence */
        __this_cpu_inc(irq_time_seq.sequence);
}

/*
 * Read @cpu's combined soft+hard irq time, retrying until the value was
 * not torn by a concurrent writer (32-bit kernels cannot read a u64
 * atomically, hence the seqcount loop).
 */
static inline u64 irq_time_read(int cpu)
{
        u64 irq_time;
        unsigned seq;

        do {
                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
                irq_time = per_cpu(cpu_softirq_time, cpu) +
                           per_cpu(cpu_hardirq_time, cpu);
        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

        return irq_time;
}
1227#else /* CONFIG_64BIT */
/* 64-bit kernels read the u64 counters atomically: no seqcount needed. */
static inline void irq_time_write_begin(void)
{
}

static inline void irq_time_write_end(void)
{
}

static inline u64 irq_time_read(int cpu)
{
        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
1240#endif /* CONFIG_64BIT */
1241#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
deleted file mode 100644
index 903ffa9e887..00000000000
--- a/kernel/sched/stats.c
+++ /dev/null
@@ -1,111 +0,0 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
/*
 * Emit /proc/schedstat: a version/timestamp header, then one "cpu%d" line
 * of runqueue counters per online CPU, followed (on SMP) by one "domain%d"
 * line per sched domain with its per-idle-type balancing statistics.
 */
static int show_schedstat(struct seq_file *seq, void *v)
{
        int cpu;
        /* 9 chars per 32-bit word is enough for the hex cpumask + commas */
        int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
        char *mask_str = kmalloc(mask_len, GFP_KERNEL);

        if (mask_str == NULL)
                return -ENOMEM;

        seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
        seq_printf(seq, "timestamp %lu\n", jiffies);
        for_each_online_cpu(cpu) {
                struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
                struct sched_domain *sd;
                int dcount = 0;
#endif

                /* runqueue-specific stats */
                seq_printf(seq,
                    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
                    cpu, rq->yld_count,
                    rq->sched_count, rq->sched_goidle,
                    rq->ttwu_count, rq->ttwu_local,
                    rq->rq_cpu_time,
                    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

                seq_printf(seq, "\n");

#ifdef CONFIG_SMP
                /* domain-specific stats; the domain tree is RCU-protected */
                rcu_read_lock();
                for_each_domain(cpu, sd) {
                        enum cpu_idle_type itype;

                        cpumask_scnprintf(mask_str, mask_len,
                                          sched_domain_span(sd));
                        seq_printf(seq, "domain%d %s", dcount++, mask_str);
                        for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                        itype++) {
                                seq_printf(seq, " %u %u %u %u %u %u %u %u",
                                    sd->lb_count[itype],
                                    sd->lb_balanced[itype],
                                    sd->lb_failed[itype],
                                    sd->lb_imbalance[itype],
                                    sd->lb_gained[itype],
                                    sd->lb_hot_gained[itype],
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
                        seq_printf(seq,
                            " %u %u %u %u %u %u %u %u %u %u %u %u\n",
                            sd->alb_count, sd->alb_failed, sd->alb_pushed,
                            sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
                            sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
                            sd->ttwu_wake_remote, sd->ttwu_move_affine,
                            sd->ttwu_move_balance);
                }
                rcu_read_unlock();
#endif
        }
        kfree(mask_str);
        return 0;
}
79
/*
 * Open /proc/schedstat: pre-size the seq_file buffer so the whole report
 * fits in one pass (one page plus one page per 32 online CPUs), handing
 * the buffer to the seq_file on success and freeing it on failure.
 */
static int schedstat_open(struct inode *inode, struct file *file)
{
        unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
        char *buf = kmalloc(size, GFP_KERNEL);
        struct seq_file *m;
        int res;

        if (!buf)
                return -ENOMEM;
        res = single_open(file, show_schedstat, NULL);
        if (!res) {
                m = file->private_data;
                m->buf = buf;   /* seq_file now owns buf; freed on release */
                m->size = size;
        } else
                kfree(buf);
        return res;
}
98
/* File operations for the single_open()-backed /proc/schedstat entry. */
static const struct file_operations proc_schedstat_operations = {
        .open    = schedstat_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

/* Register /proc/schedstat (read-only for everyone) at boot. */
static int __init proc_schedstat_init(void)
{
        proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
        return 0;
}
module_init(proc_schedstat_init);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
deleted file mode 100644
index 2ef90a51ec5..00000000000
--- a/kernel/sched/stats.h
+++ /dev/null
@@ -1,231 +0,0 @@
1
2#ifdef CONFIG_SCHEDSTATS
3
4/*
5 * Expects runqueue lock to be held for atomicity of update
6 */
/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
        if (rq) {
                rq->rq_sched_info.run_delay += delta;
                rq->rq_sched_info.pcount++;
        }
}

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
        if (rq)
                rq->rq_cpu_time += delta;
}

/* Charge @delta of queue-wait time to @rq (see sched_info_dequeued()). */
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
        if (rq)
                rq->rq_sched_info.run_delay += delta;
}
# define schedstat_inc(rq, field)       do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val)        do { var = (val); } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field)       do { } while (0)
# define schedstat_add(rq, field, amt)  do { } while (0)
# define schedstat_set(var, val)        do { } while (0)
#endif
49
50#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/* Forget when @t was last queued (it is no longer waiting). */
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
        t->sched_info.last_queued = 0;
}
55
56/*
57 * We are interested in knowing how long it was from the *first* time a
58 * task was queued to the time that it finally hit a cpu, we call this routine
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew.
61 */
/*
 * We are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a cpu, we call this routine
 * from dequeue_task() to account for possible rq->clock skew across cpus. The
 * delta taken on each cpu would annul the skew.
 */
static inline void sched_info_dequeued(struct task_struct *t)
{
        unsigned long long now = task_rq(t)->clock, delta = 0;

        if (unlikely(sched_info_on()))
                if (t->sched_info.last_queued)
                        delta = now - t->sched_info.last_queued;
        sched_info_reset_dequeued(t);
        t->sched_info.run_delay += delta;       /* delta stays 0 if stats are off */

        rq_sched_info_dequeued(task_rq(t), delta);
}
74
75/*
76 * Called when a task finally hits the cpu. We can now calculate how
77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is.
79 */
/*
 * Called when a task finally hits the cpu. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
        unsigned long long now = task_rq(t)->clock, delta = 0;

        if (t->sched_info.last_queued)
                delta = now - t->sched_info.last_queued;
        sched_info_reset_dequeued(t);
        t->sched_info.run_delay += delta;
        t->sched_info.last_arrival = now;
        t->sched_info.pcount++;

        rq_sched_info_arrive(task_rq(t), delta);
}
93
94/*
95 * This function is only called from enqueue_task(), but also only updates
96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */
/*
 * This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is already not set. It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct task_struct *t)
{
        if (unlikely(sched_info_on()))
                if (!t->sched_info.last_queued)
                        t->sched_info.last_queued = task_rq(t)->clock;
}
105
106/*
107 * Called when a process ceases being the active-running process, either
108 * voluntarily or involuntarily. Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue.
112 */
/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily. Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
        unsigned long long delta = task_rq(t)->clock -
                                        t->sched_info.last_arrival;

        rq_sched_info_depart(task_rq(t), delta);

        if (t->state == TASK_RUNNING)
                sched_info_queued(t);
}
123
124/*
125 * Called when tasks are switched involuntarily due, typically, to expiring
126 * their time slice. (This may also be called when switching to or from
127 * the idle task.) We are only called when prev != next.
128 */
/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice. (This may also be called when switching to or from
 * the idle task.) We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
        struct rq *rq = task_rq(prev);

        /*
         * prev now departs the cpu. It's not interesting to record
         * stats about how efficient we were at scheduling the idle
         * process, however.
         */
        if (prev != rq->idle)
                sched_info_depart(prev);

        if (next != rq->idle)
                sched_info_arrive(next);
}
/* Wrapper that skips all switch accounting when stats collection is off. */
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
        if (unlikely(sched_info_on()))
                __sched_info_switch(prev, next);
}
151#else
152#define sched_info_queued(t) do { } while (0)
153#define sched_info_reset_dequeued(t) do { } while (0)
154#define sched_info_dequeued(t) do { } while (0)
155#define sched_info_switch(t, next) do { } while (0)
156#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
157
158/*
159 * The following are functions that support scheduler-internal time accounting.
160 * These functions are generally called at the timer tick. None of this depends
161 * on CONFIG_SCHEDSTATS.
162 */
163
/**
 * account_group_user_time - Maintain utime for a thread group.
 *
 * @tsk:        Pointer to task structure.
 * @cputime:    Time value by which to increment the utime field of the
 *              thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the utime field there.
 */
static inline void account_group_user_time(struct task_struct *tsk,
                                           cputime_t cputime)
{
        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

        /* ->running gates the whole group-timer machinery */
        if (!cputimer->running)
                return;

        raw_spin_lock(&cputimer->lock);
        cputimer->cputime.utime += cputime;
        raw_spin_unlock(&cputimer->lock);
}
186
/**
 * account_group_system_time - Maintain stime for a thread group.
 *
 * @tsk:        Pointer to task structure.
 * @cputime:    Time value by which to increment the stime field of the
 *              thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the stime field there.
 */
static inline void account_group_system_time(struct task_struct *tsk,
                                             cputime_t cputime)
{
        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

        if (!cputimer->running)
                return;

        raw_spin_lock(&cputimer->lock);
        cputimer->cputime.stime += cputime;
        raw_spin_unlock(&cputimer->lock);
}
209
/**
 * account_group_exec_runtime - Maintain exec runtime for a thread group.
 *
 * @tsk:        Pointer to task structure.
 * @ns:         Time value by which to increment the sum_exec_runtime field
 *              of the thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the sum_exec_runtime field there.
 */
static inline void account_group_exec_runtime(struct task_struct *tsk,
                                              unsigned long long ns)
{
        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

        if (!cputimer->running)
                return;

        raw_spin_lock(&cputimer->lock);
        cputimer->cputime.sum_exec_runtime += ns;
        raw_spin_unlock(&cputimer->lock);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
deleted file mode 100644
index da5eb5bed84..00000000000
--- a/kernel/sched/stop_task.c
+++ /dev/null
@@ -1,128 +0,0 @@
1#include "sched.h"
2
3/*
4 * stop-task scheduling class.
5 *
6 * The stop task is the highest priority task in the system, it preempts
7 * everything and will be preempted by nothing.
8 *
9 * See kernel/stop_machine.c
10 */
11
#ifdef CONFIG_SMP
/*
 * A stop task is bound to its CPU: always report the CPU the task is
 * already on, so the balancer never moves it.
 */
static int
select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
{
	return task_cpu(p); /* stop tasks never migrate */
}
#endif /* CONFIG_SMP */
19
/*
 * The stop class is the highest-priority class (see the header comment
 * above), so nothing can preempt a running stop task: deliberately a no-op.
 */
static void
check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
{
	/* we're never preempted */
}
25
26static struct task_struct *pick_next_task_stop(struct rq *rq)
27{
28 struct task_struct *stop = rq->stop;
29
30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task;
32 return stop;
33 }
34
35 return NULL;
36}
37
/*
 * Bookkeeping only: the stop class keeps no per-class runqueue state,
 * so enqueueing just bumps the runqueue's running-task count.
 */
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
	inc_nr_running(rq);
}
43
/*
 * Mirror of enqueue_task_stop(): drop the runqueue's running-task count.
 */
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
	dec_nr_running(rq);
}
49
/*
 * There is nothing a stop task could yield to; reaching here indicates
 * a kernel bug, so crash loudly rather than continue.
 */
static void yield_task_stop(struct rq *rq)
{
	BUG(); /* the stop task should never yield, it's pointless. */
}
54
55static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
56{
57 struct task_struct *curr = rq->curr;
58 u64 delta_exec;
59
60 delta_exec = rq->clock_task - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0;
63
64 schedstat_set(curr->se.statistics.exec_max,
65 max(curr->se.statistics.exec_max, delta_exec));
66
67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec);
69
70 curr->se.exec_start = rq->clock_task;
71 cpuacct_charge(curr, delta_exec);
72}
73
/*
 * Nothing to do on the periodic scheduler tick for a stop task.
 */
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
77
78static void set_curr_task_stop(struct rq *rq)
79{
80 struct task_struct *stop = rq->stop;
81
82 stop->se.exec_start = rq->clock_task;
83}
84
/*
 * Tasks are never moved into the stop class at runtime; hitting this
 * path is a kernel bug.
 */
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
	BUG(); /* it's impossible to change to this class */
}
89
/*
 * Stop tasks have no adjustable priority; a priority change reaching
 * this class is a kernel bug.
 */
static void
prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG(); /* how!?, what priority? */
}
95
/*
 * Stop tasks are not round-robin scheduled: report a zero timeslice.
 */
static unsigned int
get_rr_interval_stop(struct rq *rq, struct task_struct *task)
{
	return 0;
}
101
/*
 * Simple, special scheduling class for the per-CPU stop tasks:
 *
 * Chained above the RT class via .next, so pick-next consults this
 * class before any other.
 */
const struct sched_class stop_sched_class = {
	.next = &rt_sched_class,

	.enqueue_task = enqueue_task_stop,
	.dequeue_task = dequeue_task_stop,
	.yield_task = yield_task_stop,	/* BUG()s: stop tasks never yield */

	.check_preempt_curr = check_preempt_curr_stop,	/* no-op */

	.pick_next_task = pick_next_task_stop,
	.put_prev_task = put_prev_task_stop,

#ifdef CONFIG_SMP
	.select_task_rq = select_task_rq_stop,	/* pinned to current CPU */
#endif

	.set_curr_task = set_curr_task_stop,
	.task_tick = task_tick_stop,	/* no-op */

	.get_rr_interval = get_rr_interval_stop,	/* always 0 */

	.prio_changed = prio_changed_stop,	/* BUG()s */
	.switched_to = switched_to_stop,	/* BUG()s */
};