aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/latencytop.h3
-rw-r--r--include/linux/sched.h9
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/sched.c1828
-rw-r--r--kernel/sched.h1064
-rw-r--r--kernel/sched_autogroup.c33
-rw-r--r--kernel/sched_autogroup.h26
-rw-r--r--kernel/sched_debug.c4
-rw-r--r--kernel/sched_fair.c580
-rw-r--r--kernel/sched_idletask.c4
-rw-r--r--kernel/sched_rt.c209
-rw-r--r--kernel/sched_stats.c111
-rw-r--r--kernel/sched_stats.h103
-rw-r--r--kernel/sched_stoptask.c4
14 files changed, 2034 insertions, 1954 deletions
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index b0e99898527c..e23121f9d82a 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,8 @@
10#define _INCLUDE_GUARD_LATENCYTOP_H_ 10#define _INCLUDE_GUARD_LATENCYTOP_H_
11 11
12#include <linux/compiler.h> 12#include <linux/compiler.h>
13struct task_struct;
14
13#ifdef CONFIG_LATENCYTOP 15#ifdef CONFIG_LATENCYTOP
14 16
15#define LT_SAVECOUNT 32 17#define LT_SAVECOUNT 32
@@ -23,7 +25,6 @@ struct latency_record {
23}; 25};
24 26
25 27
26struct task_struct;
27 28
28extern int latencytop_enabled; 29extern int latencytop_enabled;
29void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); 30void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68daf4f27e2c..8db17b7622ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -925,6 +925,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
925 return to_cpumask(sg->cpumask); 925 return to_cpumask(sg->cpumask);
926} 926}
927 927
928/**
929 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
930 * @group: The group whose first cpu is to be returned.
931 */
932static inline unsigned int group_first_cpu(struct sched_group *group)
933{
934 return cpumask_first(sched_group_cpus(group));
935}
936
928struct sched_domain_attr { 937struct sched_domain_attr {
929 int relax_domain_level; 938 int relax_domain_level;
930}; 939};
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..1a4d37d7f39a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
@@ -10,8 +10,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o sched_clock.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o 14
15obj-y += sched.o sched_idletask.o sched_fair.o sched_rt.o sched_stoptask.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += sched_autogroup.o
17obj-$(CONFIG_SCHEDSTATS) += sched_stats.o
18obj-$(CONFIG_SCHED_DEBUG) += sched_debug.o
15 19
16ifdef CONFIG_FUNCTION_TRACER 20ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 21# Do not trace debug files and internal ftrace files
diff --git a/kernel/sched.c b/kernel/sched.c
index c9e3ab6e299e..2ffcceed8862 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -72,133 +71,20 @@
72#include <linux/ftrace.h> 71#include <linux/ftrace.h>
73#include <linux/slab.h> 72#include <linux/slab.h>
74#include <linux/init_task.h> 73#include <linux/init_task.h>
75#include <linux/jump_label.h>
76 74
77#include <asm/tlb.h> 75#include <asm/tlb.h>
78#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
79#include <asm/mutex.h>
80#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
81#include <asm/paravirt.h> 78#include <asm/paravirt.h>
82#endif 79#endif
83 80
84#include "sched_cpupri.h" 81#include "sched.h"
85#include "workqueue_sched.h" 82#include "workqueue_sched.h"
86#include "sched_autogroup.h"
87 83
88#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
89#include <trace/events/sched.h> 85#include <trace/events/sched.h>
90 86
91/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
92 * Convert user-nice values [ -20 ... 0 ... 19 ]
93 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
94 * and back.
95 */
96#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
97#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
98#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
99
100/*
101 * 'User priority' is the nice value converted to something we
102 * can work with better when scaling various scheduler parameters,
103 * it's a [ 0 ... 39 ] range.
104 */
105#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
106#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
107#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
108
109/*
110 * Helpers for converting nanosecond timing to jiffy resolution
111 */
112#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
113
114#define NICE_0_LOAD SCHED_LOAD_SCALE
115#define NICE_0_SHIFT SCHED_LOAD_SHIFT
116
117/*
118 * These are the 'tuning knobs' of the scheduler:
119 *
120 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
121 * Timeslices get refilled after they expire.
122 */
123#define DEF_TIMESLICE (100 * HZ / 1000)
124
125/*
126 * single value that denotes runtime == period, ie unlimited time.
127 */
128#define RUNTIME_INF ((u64)~0ULL)
129
130static inline int rt_policy(int policy)
131{
132 if (policy == SCHED_FIFO || policy == SCHED_RR)
133 return 1;
134 return 0;
135}
136
137static inline int task_has_rt_policy(struct task_struct *p)
138{
139 return rt_policy(p->policy);
140}
141
142/*
143 * This is the priority-queue data structure of the RT scheduling class:
144 */
145struct rt_prio_array {
146 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
147 struct list_head queue[MAX_RT_PRIO];
148};
149
150struct rt_bandwidth {
151 /* nests inside the rq lock: */
152 raw_spinlock_t rt_runtime_lock;
153 ktime_t rt_period;
154 u64 rt_runtime;
155 struct hrtimer rt_period_timer;
156};
157
158static struct rt_bandwidth def_rt_bandwidth;
159
160static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
161
162static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
163{
164 struct rt_bandwidth *rt_b =
165 container_of(timer, struct rt_bandwidth, rt_period_timer);
166 ktime_t now;
167 int overrun;
168 int idle = 0;
169
170 for (;;) {
171 now = hrtimer_cb_get_time(timer);
172 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
173
174 if (!overrun)
175 break;
176
177 idle = do_sched_rt_period_timer(rt_b, overrun);
178 }
179
180 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
181}
182
183static
184void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
185{
186 rt_b->rt_period = ns_to_ktime(period);
187 rt_b->rt_runtime = runtime;
188
189 raw_spin_lock_init(&rt_b->rt_runtime_lock);
190
191 hrtimer_init(&rt_b->rt_period_timer,
192 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
193 rt_b->rt_period_timer.function = sched_rt_period_timer;
194}
195
196static inline int rt_bandwidth_enabled(void)
197{
198 return sysctl_sched_rt_runtime >= 0;
199}
200
201static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
202{ 88{
203 unsigned long delta; 89 unsigned long delta;
204 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -218,609 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
218 } 104 }
219} 105}
220 106
221static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
222{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
223 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
224 return;
225
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 return;
228
229 raw_spin_lock(&rt_b->rt_runtime_lock);
230 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
231 raw_spin_unlock(&rt_b->rt_runtime_lock);
232}
233
234#ifdef CONFIG_RT_GROUP_SCHED
235static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
236{
237 hrtimer_cancel(&rt_b->rt_period_timer);
238}
239#endif
240
241/*
242 * sched_domains_mutex serializes calls to init_sched_domains,
243 * detach_destroy_domains and partition_sched_domains.
244 */
245static DEFINE_MUTEX(sched_domains_mutex);
246
247#ifdef CONFIG_CGROUP_SCHED
248
249#include <linux/cgroup.h>
250
251struct cfs_rq;
252
253static LIST_HEAD(task_groups);
254
255struct cfs_bandwidth {
256#ifdef CONFIG_CFS_BANDWIDTH
257 raw_spinlock_t lock;
258 ktime_t period;
259 u64 quota, runtime;
260 s64 hierarchal_quota;
261 u64 runtime_expires;
262
263 int idle, timer_active;
264 struct hrtimer period_timer, slack_timer;
265 struct list_head throttled_cfs_rq;
266
267 /* statistics */
268 int nr_periods, nr_throttled;
269 u64 throttled_time;
270#endif
271};
272
273/* task group related information */
274struct task_group {
275 struct cgroup_subsys_state css;
276
277#ifdef CONFIG_FAIR_GROUP_SCHED
278 /* schedulable entities of this group on each cpu */
279 struct sched_entity **se;
280 /* runqueue "owned" by this group on each cpu */
281 struct cfs_rq **cfs_rq;
282 unsigned long shares;
283
284 atomic_t load_weight;
285#endif
286
287#ifdef CONFIG_RT_GROUP_SCHED
288 struct sched_rt_entity **rt_se;
289 struct rt_rq **rt_rq;
290
291 struct rt_bandwidth rt_bandwidth;
292#endif
293
294 struct rcu_head rcu;
295 struct list_head list;
296
297 struct task_group *parent;
298 struct list_head siblings;
299 struct list_head children;
300
301#ifdef CONFIG_SCHED_AUTOGROUP
302 struct autogroup *autogroup;
303#endif
304
305 struct cfs_bandwidth cfs_bandwidth;
306};
307
308/* task_group_lock serializes the addition/removal of task groups */
309static DEFINE_SPINLOCK(task_group_lock);
310
311#ifdef CONFIG_FAIR_GROUP_SCHED
312
313# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
314
315/*
316 * A weight of 0 or 1 can cause arithmetics problems.
317 * A weight of a cfs_rq is the sum of weights of which entities
318 * are queued on this cfs_rq, so a weight of a entity should not be
319 * too large, so as the shares value of a task group.
320 * (The default weight is 1024 - so there's no practical
321 * limitation from this.)
322 */
323#define MIN_SHARES (1UL << 1)
324#define MAX_SHARES (1UL << 18)
325
326static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
327#endif
328
329/* Default task group.
330 * Every task in system belong to this group at bootup.
331 */
332struct task_group root_task_group;
333
334#endif /* CONFIG_CGROUP_SCHED */
335
336/* CFS-related fields in a runqueue */
337struct cfs_rq {
338 struct load_weight load;
339 unsigned long nr_running, h_nr_running;
340
341 u64 exec_clock;
342 u64 min_vruntime;
343#ifndef CONFIG_64BIT
344 u64 min_vruntime_copy;
345#endif
346
347 struct rb_root tasks_timeline;
348 struct rb_node *rb_leftmost;
349
350 struct list_head tasks;
351 struct list_head *balance_iterator;
352
353 /*
354 * 'curr' points to currently running entity on this cfs_rq.
355 * It is set to NULL otherwise (i.e when none are currently running).
356 */
357 struct sched_entity *curr, *next, *last, *skip;
358
359#ifdef CONFIG_SCHED_DEBUG
360 unsigned int nr_spread_over;
361#endif
362
363#ifdef CONFIG_FAIR_GROUP_SCHED
364 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
365
366 /*
367 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
368 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
369 * (like users, containers etc.)
370 *
371 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
372 * list is used during load balance.
373 */
374 int on_list;
375 struct list_head leaf_cfs_rq_list;
376 struct task_group *tg; /* group that "owns" this runqueue */
377
378#ifdef CONFIG_SMP
379 /*
380 * the part of load.weight contributed by tasks
381 */
382 unsigned long task_weight;
383
384 /*
385 * h_load = weight * f(tg)
386 *
387 * Where f(tg) is the recursive weight fraction assigned to
388 * this group.
389 */
390 unsigned long h_load;
391
392 /*
393 * Maintaining per-cpu shares distribution for group scheduling
394 *
395 * load_stamp is the last time we updated the load average
396 * load_last is the last time we updated the load average and saw load
397 * load_unacc_exec_time is currently unaccounted execution time
398 */
399 u64 load_avg;
400 u64 load_period;
401 u64 load_stamp, load_last, load_unacc_exec_time;
402
403 unsigned long load_contribution;
404#endif
405#ifdef CONFIG_CFS_BANDWIDTH
406 int runtime_enabled;
407 u64 runtime_expires;
408 s64 runtime_remaining;
409
410 u64 throttled_timestamp;
411 int throttled, throttle_count;
412 struct list_head throttled_list;
413#endif
414#endif
415};
416
417#ifdef CONFIG_FAIR_GROUP_SCHED
418#ifdef CONFIG_CFS_BANDWIDTH
419static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
420{
421 return &tg->cfs_bandwidth;
422}
423
424static inline u64 default_cfs_period(void);
425static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
426static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
427
428static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
429{
430 struct cfs_bandwidth *cfs_b =
431 container_of(timer, struct cfs_bandwidth, slack_timer);
432 do_sched_cfs_slack_timer(cfs_b);
433
434 return HRTIMER_NORESTART;
435}
436
437static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
438{
439 struct cfs_bandwidth *cfs_b =
440 container_of(timer, struct cfs_bandwidth, period_timer);
441 ktime_t now;
442 int overrun;
443 int idle = 0;
444
445 for (;;) {
446 now = hrtimer_cb_get_time(timer);
447 overrun = hrtimer_forward(timer, now, cfs_b->period);
448
449 if (!overrun)
450 break;
451
452 idle = do_sched_cfs_period_timer(cfs_b, overrun);
453 }
454
455 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
456}
457
458static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
459{
460 raw_spin_lock_init(&cfs_b->lock);
461 cfs_b->runtime = 0;
462 cfs_b->quota = RUNTIME_INF;
463 cfs_b->period = ns_to_ktime(default_cfs_period());
464
465 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
466 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->period_timer.function = sched_cfs_period_timer;
468 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
469 cfs_b->slack_timer.function = sched_cfs_slack_timer;
470}
471
472static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
473{
474 cfs_rq->runtime_enabled = 0;
475 INIT_LIST_HEAD(&cfs_rq->throttled_list);
476}
477
478/* requires cfs_b->lock, may release to reprogram timer */
479static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
480{
481 /*
482 * The timer may be active because we're trying to set a new bandwidth
483 * period or because we're racing with the tear-down path
484 * (timer_active==0 becomes visible before the hrtimer call-back
485 * terminates). In either case we ensure that it's re-programmed
486 */
487 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
488 raw_spin_unlock(&cfs_b->lock);
489 /* ensure cfs_b->lock is available while we wait */
490 hrtimer_cancel(&cfs_b->period_timer);
491
492 raw_spin_lock(&cfs_b->lock);
493 /* if someone else restarted the timer then we're done */
494 if (cfs_b->timer_active)
495 return;
496 }
497
498 cfs_b->timer_active = 1;
499 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
500}
501
502static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
503{
504 hrtimer_cancel(&cfs_b->period_timer);
505 hrtimer_cancel(&cfs_b->slack_timer);
506}
507
508#ifdef HAVE_JUMP_LABEL
509static struct jump_label_key __cfs_bandwidth_used;
510
511static inline bool cfs_bandwidth_used(void)
512{
513 return static_branch(&__cfs_bandwidth_used);
514}
515
516static void account_cfs_bandwidth_used(int enabled, int was_enabled)
517{
518 /* only need to count groups transitioning between enabled/!enabled */
519 if (enabled && !was_enabled)
520 jump_label_inc(&__cfs_bandwidth_used);
521 else if (!enabled && was_enabled)
522 jump_label_dec(&__cfs_bandwidth_used);
523}
524#else /* !HAVE_JUMP_LABEL */
525/* static_branch doesn't help unless supported */
526static int cfs_bandwidth_used(void)
527{
528 return 1;
529}
530static void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
531#endif /* HAVE_JUMP_LABEL */
532#else /* !CONFIG_CFS_BANDWIDTH */
533static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
534static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
535static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
536
537static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
538{
539 return NULL;
540}
541#endif /* CONFIG_CFS_BANDWIDTH */
542#endif /* CONFIG_FAIR_GROUP_SCHED */
543
544/* Real-Time classes' related field in a runqueue: */
545struct rt_rq {
546 struct rt_prio_array active;
547 unsigned long rt_nr_running;
548#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
549 struct {
550 int curr; /* highest queued rt task prio */
551#ifdef CONFIG_SMP
552 int next; /* next highest */
553#endif
554 } highest_prio;
555#endif
556#ifdef CONFIG_SMP
557 unsigned long rt_nr_migratory;
558 unsigned long rt_nr_total;
559 int overloaded;
560 struct plist_head pushable_tasks;
561#endif
562 int rt_throttled;
563 u64 rt_time;
564 u64 rt_runtime;
565 /* Nests inside the rq lock: */
566 raw_spinlock_t rt_runtime_lock;
567
568#ifdef CONFIG_RT_GROUP_SCHED
569 unsigned long rt_nr_boosted;
570
571 struct rq *rq;
572 struct list_head leaf_rt_rq_list;
573 struct task_group *tg;
574#endif
575};
576
577#ifdef CONFIG_SMP
578
579/*
580 * We add the notion of a root-domain which will be used to define per-domain
581 * variables. Each exclusive cpuset essentially defines an island domain by
582 * fully partitioning the member cpus from any other cpuset. Whenever a new
583 * exclusive cpuset is created, we also create and attach a new root-domain
584 * object.
585 *
586 */
587struct root_domain {
588 atomic_t refcount;
589 atomic_t rto_count;
590 struct rcu_head rcu;
591 cpumask_var_t span;
592 cpumask_var_t online;
593
594 /*
595 * The "RT overload" flag: it gets set if a CPU has more than
596 * one runnable RT task.
597 */
598 cpumask_var_t rto_mask;
599 struct cpupri cpupri;
600};
601
602/*
603 * By default the system creates a single root-domain with all cpus as
604 * members (mimicking the global state we have today).
605 */
606static struct root_domain def_root_domain;
607
608#endif /* CONFIG_SMP */
609
610/*
611 * This is the main, per-CPU runqueue data structure.
612 *
613 * Locking rule: those places that want to lock multiple runqueues
614 * (such as the load balancing or the thread migration code), lock
615 * acquire operations must be ordered by ascending &runqueue.
616 */
617struct rq {
618 /* runqueue lock: */
619 raw_spinlock_t lock;
620
621 /*
622 * nr_running and cpu_load should be in the same cacheline because
623 * remote CPUs use both these fields when doing load calculation.
624 */
625 unsigned long nr_running;
626 #define CPU_LOAD_IDX_MAX 5
627 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
628 unsigned long last_load_update_tick;
629#ifdef CONFIG_NO_HZ
630 u64 nohz_stamp;
631 unsigned char nohz_balance_kick;
632#endif
633 int skip_clock_update;
634
635 /* capture load from *all* tasks on this cpu: */
636 struct load_weight load;
637 unsigned long nr_load_updates;
638 u64 nr_switches;
639
640 struct cfs_rq cfs;
641 struct rt_rq rt;
642
643#ifdef CONFIG_FAIR_GROUP_SCHED
644 /* list of leaf cfs_rq on this cpu: */
645 struct list_head leaf_cfs_rq_list;
646#endif
647#ifdef CONFIG_RT_GROUP_SCHED
648 struct list_head leaf_rt_rq_list;
649#endif
650
651 /*
652 * This is part of a global counter where only the total sum
653 * over all CPUs matters. A task can increase this counter on
654 * one CPU and if it got migrated afterwards it may decrease
655 * it on another CPU. Always updated under the runqueue lock:
656 */
657 unsigned long nr_uninterruptible;
658
659 struct task_struct *curr, *idle, *stop;
660 unsigned long next_balance;
661 struct mm_struct *prev_mm;
662
663 u64 clock;
664 u64 clock_task;
665
666 atomic_t nr_iowait;
667
668#ifdef CONFIG_SMP
669 struct root_domain *rd;
670 struct sched_domain *sd;
671
672 unsigned long cpu_power;
673
674 unsigned char idle_balance;
675 /* For active balancing */
676 int post_schedule;
677 int active_balance;
678 int push_cpu;
679 struct cpu_stop_work active_balance_work;
680 /* cpu of this runqueue: */
681 int cpu;
682 int online;
683
684 u64 rt_avg;
685 u64 age_stamp;
686 u64 idle_stamp;
687 u64 avg_idle;
688#endif
689
690#ifdef CONFIG_IRQ_TIME_ACCOUNTING
691 u64 prev_irq_time;
692#endif
693#ifdef CONFIG_PARAVIRT
694 u64 prev_steal_time;
695#endif
696#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
697 u64 prev_steal_time_rq;
698#endif
699
700 /* calc_load related fields */
701 unsigned long calc_load_update;
702 long calc_load_active;
703
704#ifdef CONFIG_SCHED_HRTICK
705#ifdef CONFIG_SMP
706 int hrtick_csd_pending;
707 struct call_single_data hrtick_csd;
708#endif
709 struct hrtimer hrtick_timer;
710#endif
711
712#ifdef CONFIG_SCHEDSTATS
713 /* latency stats */
714 struct sched_info rq_sched_info;
715 unsigned long long rq_cpu_time;
716 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
717
718 /* sys_sched_yield() stats */
719 unsigned int yld_count;
720
721 /* schedule() stats */
722 unsigned int sched_switch;
723 unsigned int sched_count;
724 unsigned int sched_goidle;
725
726 /* try_to_wake_up() stats */
727 unsigned int ttwu_count;
728 unsigned int ttwu_local;
729#endif
730
731#ifdef CONFIG_SMP
732 struct llist_head wake_list;
733#endif
734};
735
736static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
737
738
739static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
740
741static inline int cpu_of(struct rq *rq)
742{
743#ifdef CONFIG_SMP
744 return rq->cpu;
745#else
746 return 0;
747#endif
748}
749
750#define rcu_dereference_check_sched_domain(p) \
751 rcu_dereference_check((p), \
752 lockdep_is_held(&sched_domains_mutex))
753
754/*
755 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
756 * See detach_destroy_domains: synchronize_sched for details.
757 *
758 * The domain tree of any CPU may only be accessed from within
759 * preempt-disabled sections.
760 */
761#define for_each_domain(cpu, __sd) \
762 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
763
764#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
765#define this_rq() (&__get_cpu_var(runqueues))
766#define task_rq(p) cpu_rq(task_cpu(p))
767#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
768#define raw_rq() (&__raw_get_cpu_var(runqueues))
769
770#ifdef CONFIG_CGROUP_SCHED
771
772/*
773 * Return the group to which this tasks belongs.
774 *
775 * We use task_subsys_state_check() and extend the RCU verification with
776 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
777 * task it moves into the cgroup. Therefore by holding either of those locks,
778 * we pin the task to the current cgroup.
779 */
780static inline struct task_group *task_group(struct task_struct *p)
781{
782 struct task_group *tg;
783 struct cgroup_subsys_state *css;
784
785 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
786 lockdep_is_held(&p->pi_lock) ||
787 lockdep_is_held(&task_rq(p)->lock));
788 tg = container_of(css, struct task_group, css);
789
790 return autogroup_task_group(p, tg);
791}
792
793/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
794static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
795{
796#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
797 struct task_group *tg = task_group(p);
798#endif
799
800#ifdef CONFIG_FAIR_GROUP_SCHED
801 p->se.cfs_rq = tg->cfs_rq[cpu];
802 p->se.parent = tg->se[cpu];
803#endif
804
805#ifdef CONFIG_RT_GROUP_SCHED
806 p->rt.rt_rq = tg->rt_rq[cpu];
807 p->rt.parent = tg->rt_se[cpu];
808#endif
809}
810
811#else /* CONFIG_CGROUP_SCHED */
812
813static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
814static inline struct task_group *task_group(struct task_struct *p)
815{
816 return NULL;
817}
818
819#endif /* CONFIG_CGROUP_SCHED */
820 109
821static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
822 111
823static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
824{ 113{
825 s64 delta; 114 s64 delta;
826 115
@@ -833,40 +122,10 @@ static void update_rq_clock(struct rq *rq)
833} 122}
834 123
835/* 124/*
836 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
837 */
838#ifdef CONFIG_SCHED_DEBUG
839# define const_debug __read_mostly
840#else
841# define const_debug static const
842#endif
843
844/**
845 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
846 * @cpu: the processor in question.
847 *
848 * This interface allows printk to be called with the runqueue lock
849 * held and know whether or not it is OK to wake up the klogd.
850 */
851int runqueue_is_locked(int cpu)
852{
853 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
854}
855
856/*
857 * Debugging: various feature bits 125 * Debugging: various feature bits
858 */ 126 */
859 127
860#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
861 __SCHED_FEAT_##name ,
862
863enum {
864#include "sched_features.h"
865};
866
867#undef SCHED_FEAT
868
869#define SCHED_FEAT(name, enabled) \
870 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
871 130
872const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
@@ -965,8 +224,6 @@ late_initcall(sched_init_debug);
965 224
966#endif 225#endif
967 226
968#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
969
970/* 227/*
971 * Number of tasks to iterate in a single balance run. 228 * Number of tasks to iterate in a single balance run.
972 * Limited because this is done with IRQs disabled. 229 * Limited because this is done with IRQs disabled.
@@ -987,7 +244,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
987 */ 244 */
988unsigned int sysctl_sched_rt_period = 1000000; 245unsigned int sysctl_sched_rt_period = 1000000;
989 246
990static __read_mostly int scheduler_running; 247__read_mostly int scheduler_running;
991 248
992/* 249/*
993 * part of the period that we allow rt tasks to run in us. 250 * part of the period that we allow rt tasks to run in us.
@@ -995,112 +252,7 @@ static __read_mostly int scheduler_running;
995 */ 252 */
996int sysctl_sched_rt_runtime = 950000; 253int sysctl_sched_rt_runtime = 950000;
997 254
998static inline u64 global_rt_period(void)
999{
1000 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1001}
1002 255
1003static inline u64 global_rt_runtime(void)
1004{
1005 if (sysctl_sched_rt_runtime < 0)
1006 return RUNTIME_INF;
1007
1008 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1009}
1010
1011#ifndef prepare_arch_switch
1012# define prepare_arch_switch(next) do { } while (0)
1013#endif
1014#ifndef finish_arch_switch
1015# define finish_arch_switch(prev) do { } while (0)
1016#endif
1017
1018static inline int task_current(struct rq *rq, struct task_struct *p)
1019{
1020 return rq->curr == p;
1021}
1022
1023static inline int task_running(struct rq *rq, struct task_struct *p)
1024{
1025#ifdef CONFIG_SMP
1026 return p->on_cpu;
1027#else
1028 return task_current(rq, p);
1029#endif
1030}
1031
1032#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1033static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1034{
1035#ifdef CONFIG_SMP
1036 /*
1037 * We can optimise this out completely for !SMP, because the
1038 * SMP rebalancing from interrupt is the only thing that cares
1039 * here.
1040 */
1041 next->on_cpu = 1;
1042#endif
1043}
1044
1045static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1046{
1047#ifdef CONFIG_SMP
1048 /*
1049 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1050 * We must ensure this doesn't happen until the switch is completely
1051 * finished.
1052 */
1053 smp_wmb();
1054 prev->on_cpu = 0;
1055#endif
1056#ifdef CONFIG_DEBUG_SPINLOCK
1057 /* this is a valid case when another task releases the spinlock */
1058 rq->lock.owner = current;
1059#endif
1060 /*
1061 * If we are tracking spinlock dependencies then we have to
1062 * fix up the runqueue lock - which gets 'carried over' from
1063 * prev into current:
1064 */
1065 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1066
1067 raw_spin_unlock_irq(&rq->lock);
1068}
1069
1070#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1071static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1072{
1073#ifdef CONFIG_SMP
1074 /*
1075 * We can optimise this out completely for !SMP, because the
1076 * SMP rebalancing from interrupt is the only thing that cares
1077 * here.
1078 */
1079 next->on_cpu = 1;
1080#endif
1081#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1082 raw_spin_unlock_irq(&rq->lock);
1083#else
1084 raw_spin_unlock(&rq->lock);
1085#endif
1086}
1087
1088static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1089{
1090#ifdef CONFIG_SMP
1091 /*
1092 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1093 * We must ensure this doesn't happen until the switch is completely
1094 * finished.
1095 */
1096 smp_wmb();
1097 prev->on_cpu = 0;
1098#endif
1099#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1100 local_irq_enable();
1101#endif
1102}
1103#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1104 256
1105/* 257/*
1106 * __task_rq_lock - lock the rq @p resides on. 258 * __task_rq_lock - lock the rq @p resides on.
@@ -1183,20 +335,6 @@ static struct rq *this_rq_lock(void)
1183 * rq->lock. 335 * rq->lock.
1184 */ 336 */
1185 337
1186/*
1187 * Use hrtick when:
1188 * - enabled by features
1189 * - hrtimer is actually high res
1190 */
1191static inline int hrtick_enabled(struct rq *rq)
1192{
1193 if (!sched_feat(HRTICK))
1194 return 0;
1195 if (!cpu_active(cpu_of(rq)))
1196 return 0;
1197 return hrtimer_is_hres_active(&rq->hrtick_timer);
1198}
1199
1200static void hrtick_clear(struct rq *rq) 338static void hrtick_clear(struct rq *rq)
1201{ 339{
1202 if (hrtimer_active(&rq->hrtick_timer)) 340 if (hrtimer_active(&rq->hrtick_timer))
@@ -1240,7 +378,7 @@ static void __hrtick_start(void *arg)
1240 * 378 *
1241 * called with rq->lock held and irqs disabled 379 * called with rq->lock held and irqs disabled
1242 */ 380 */
1243static void hrtick_start(struct rq *rq, u64 delay) 381void hrtick_start(struct rq *rq, u64 delay)
1244{ 382{
1245 struct hrtimer *timer = &rq->hrtick_timer; 383 struct hrtimer *timer = &rq->hrtick_timer;
1246 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 384 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1284,7 +422,7 @@ static __init void init_hrtick(void)
1284 * 422 *
1285 * called with rq->lock held and irqs disabled 423 * called with rq->lock held and irqs disabled
1286 */ 424 */
1287static void hrtick_start(struct rq *rq, u64 delay) 425void hrtick_start(struct rq *rq, u64 delay)
1288{ 426{
1289 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 427 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1290 HRTIMER_MODE_REL_PINNED, 0); 428 HRTIMER_MODE_REL_PINNED, 0);
@@ -1335,7 +473,7 @@ static inline void init_hrtick(void)
1335#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 473#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1336#endif 474#endif
1337 475
1338static void resched_task(struct task_struct *p) 476void resched_task(struct task_struct *p)
1339{ 477{
1340 int cpu; 478 int cpu;
1341 479
@@ -1356,7 +494,7 @@ static void resched_task(struct task_struct *p)
1356 smp_send_reschedule(cpu); 494 smp_send_reschedule(cpu);
1357} 495}
1358 496
1359static void resched_cpu(int cpu) 497void resched_cpu(int cpu)
1360{ 498{
1361 struct rq *rq = cpu_rq(cpu); 499 struct rq *rq = cpu_rq(cpu);
1362 unsigned long flags; 500 unsigned long flags;
@@ -1449,12 +587,7 @@ static inline bool got_nohz_idle_kick(void)
1449 587
1450#endif /* CONFIG_NO_HZ */ 588#endif /* CONFIG_NO_HZ */
1451 589
1452static u64 sched_avg_period(void) 590void sched_avg_update(struct rq *rq)
1453{
1454 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1455}
1456
1457static void sched_avg_update(struct rq *rq)
1458{ 591{
1459 s64 period = sched_avg_period(); 592 s64 period = sched_avg_period();
1460 593
@@ -1470,193 +603,23 @@ static void sched_avg_update(struct rq *rq)
1470 } 603 }
1471} 604}
1472 605
1473static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1474{
1475 rq->rt_avg += rt_delta;
1476 sched_avg_update(rq);
1477}
1478
1479#else /* !CONFIG_SMP */ 606#else /* !CONFIG_SMP */
1480static void resched_task(struct task_struct *p) 607void resched_task(struct task_struct *p)
1481{ 608{
1482 assert_raw_spin_locked(&task_rq(p)->lock); 609 assert_raw_spin_locked(&task_rq(p)->lock);
1483 set_tsk_need_resched(p); 610 set_tsk_need_resched(p);
1484} 611}
1485
1486static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1487{
1488}
1489
1490static void sched_avg_update(struct rq *rq)
1491{
1492}
1493#endif /* CONFIG_SMP */ 612#endif /* CONFIG_SMP */
1494 613
1495#if BITS_PER_LONG == 32
1496# define WMULT_CONST (~0UL)
1497#else
1498# define WMULT_CONST (1UL << 32)
1499#endif
1500
1501#define WMULT_SHIFT 32
1502
1503/*
1504 * Shift right and round:
1505 */
1506#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1507
1508/*
1509 * delta *= weight / lw
1510 */
1511static unsigned long
1512calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1513 struct load_weight *lw)
1514{
1515 u64 tmp;
1516
1517 /*
1518 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1519 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1520 * 2^SCHED_LOAD_RESOLUTION.
1521 */
1522 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1523 tmp = (u64)delta_exec * scale_load_down(weight);
1524 else
1525 tmp = (u64)delta_exec;
1526
1527 if (!lw->inv_weight) {
1528 unsigned long w = scale_load_down(lw->weight);
1529
1530 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1531 lw->inv_weight = 1;
1532 else if (unlikely(!w))
1533 lw->inv_weight = WMULT_CONST;
1534 else
1535 lw->inv_weight = WMULT_CONST / w;
1536 }
1537
1538 /*
1539 * Check whether we'd overflow the 64-bit multiplication:
1540 */
1541 if (unlikely(tmp > WMULT_CONST))
1542 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1543 WMULT_SHIFT/2);
1544 else
1545 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1546
1547 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1548}
1549
1550static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1551{
1552 lw->weight += inc;
1553 lw->inv_weight = 0;
1554}
1555
1556static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1557{
1558 lw->weight -= dec;
1559 lw->inv_weight = 0;
1560}
1561
1562static inline void update_load_set(struct load_weight *lw, unsigned long w)
1563{
1564 lw->weight = w;
1565 lw->inv_weight = 0;
1566}
1567
1568/*
1569 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1570 * of tasks with abnormal "nice" values across CPUs the contribution that
1571 * each task makes to its run queue's load is weighted according to its
1572 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1573 * scaled version of the new time slice allocation that they receive on time
1574 * slice expiry etc.
1575 */
1576
1577#define WEIGHT_IDLEPRIO 3
1578#define WMULT_IDLEPRIO 1431655765
1579
1580/*
1581 * Nice levels are multiplicative, with a gentle 10% change for every
1582 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1583 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1584 * that remained on nice 0.
1585 *
1586 * The "10% effect" is relative and cumulative: from _any_ nice level,
1587 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1588 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1589 * If a task goes up by ~10% and another task goes down by ~10% then
1590 * the relative distance between them is ~25%.)
1591 */
1592static const int prio_to_weight[40] = {
1593 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1594 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1595 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1596 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1597 /* 0 */ 1024, 820, 655, 526, 423,
1598 /* 5 */ 335, 272, 215, 172, 137,
1599 /* 10 */ 110, 87, 70, 56, 45,
1600 /* 15 */ 36, 29, 23, 18, 15,
1601};
1602
1603/*
1604 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1605 *
1606 * In cases where the weight does not change often, we can use the
1607 * precalculated inverse to speed up arithmetics by turning divisions
1608 * into multiplications:
1609 */
1610static const u32 prio_to_wmult[40] = {
1611 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1612 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1613 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1614 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1615 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1616 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1617 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1618 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1619};
1620
1621/* Time spent by the tasks of the cpu accounting group executing in ... */
1622enum cpuacct_stat_index {
1623 CPUACCT_STAT_USER, /* ... user mode */
1624 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1625
1626 CPUACCT_STAT_NSTATS,
1627};
1628
1629#ifdef CONFIG_CGROUP_CPUACCT
1630static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1631static void cpuacct_update_stats(struct task_struct *tsk,
1632 enum cpuacct_stat_index idx, cputime_t val);
1633#else
1634static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1635static inline void cpuacct_update_stats(struct task_struct *tsk,
1636 enum cpuacct_stat_index idx, cputime_t val) {}
1637#endif
1638
1639static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1640{
1641 update_load_add(&rq->load, load);
1642}
1643
1644static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1645{
1646 update_load_sub(&rq->load, load);
1647}
1648
1649#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 614#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1650 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 615 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1651typedef int (*tg_visitor)(struct task_group *, void *);
1652
1653/* 616/*
1654 * Iterate task_group tree rooted at *from, calling @down when first entering a 617 * Iterate task_group tree rooted at *from, calling @down when first entering a
1655 * node and @up when leaving it for the final time. 618 * node and @up when leaving it for the final time.
1656 * 619 *
1657 * Caller must hold rcu_lock or sufficient equivalent. 620 * Caller must hold rcu_lock or sufficient equivalent.
1658 */ 621 */
1659static int walk_tg_tree_from(struct task_group *from, 622int walk_tg_tree_from(struct task_group *from,
1660 tg_visitor down, tg_visitor up, void *data) 623 tg_visitor down, tg_visitor up, void *data)
1661{ 624{
1662 struct task_group *parent, *child; 625 struct task_group *parent, *child;
@@ -1687,270 +650,13 @@ out:
1687 return ret; 650 return ret;
1688} 651}
1689 652
1690/* 653int tg_nop(struct task_group *tg, void *data)
1691 * Iterate the full tree, calling @down when first entering a node and @up when
1692 * leaving it for the final time.
1693 *
1694 * Caller must hold rcu_lock or sufficient equivalent.
1695 */
1696
1697static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1698{
1699 return walk_tg_tree_from(&root_task_group, down, up, data);
1700}
1701
1702static int tg_nop(struct task_group *tg, void *data)
1703{ 654{
1704 return 0; 655 return 0;
1705} 656}
1706#endif 657#endif
1707 658
1708#ifdef CONFIG_SMP 659void update_cpu_load(struct rq *this_rq);
1709/* Used instead of source_load when we know the type == 0 */
1710static unsigned long weighted_cpuload(const int cpu)
1711{
1712 return cpu_rq(cpu)->load.weight;
1713}
1714
1715/*
1716 * Return a low guess at the load of a migration-source cpu weighted
1717 * according to the scheduling class and "nice" value.
1718 *
1719 * We want to under-estimate the load of migration sources, to
1720 * balance conservatively.
1721 */
1722static unsigned long source_load(int cpu, int type)
1723{
1724 struct rq *rq = cpu_rq(cpu);
1725 unsigned long total = weighted_cpuload(cpu);
1726
1727 if (type == 0 || !sched_feat(LB_BIAS))
1728 return total;
1729
1730 return min(rq->cpu_load[type-1], total);
1731}
1732
1733/*
1734 * Return a high guess at the load of a migration-target cpu weighted
1735 * according to the scheduling class and "nice" value.
1736 */
1737static unsigned long target_load(int cpu, int type)
1738{
1739 struct rq *rq = cpu_rq(cpu);
1740 unsigned long total = weighted_cpuload(cpu);
1741
1742 if (type == 0 || !sched_feat(LB_BIAS))
1743 return total;
1744
1745 return max(rq->cpu_load[type-1], total);
1746}
1747
1748static unsigned long power_of(int cpu)
1749{
1750 return cpu_rq(cpu)->cpu_power;
1751}
1752
1753static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1754
1755static unsigned long cpu_avg_load_per_task(int cpu)
1756{
1757 struct rq *rq = cpu_rq(cpu);
1758 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1759
1760 if (nr_running)
1761 return rq->load.weight / nr_running;
1762
1763 return 0;
1764}
1765
1766#ifdef CONFIG_PREEMPT
1767
1768static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1769
1770/*
1771 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1772 * way at the expense of forcing extra atomic operations in all
1773 * invocations. This assures that the double_lock is acquired using the
1774 * same underlying policy as the spinlock_t on this architecture, which
1775 * reduces latency compared to the unfair variant below. However, it
1776 * also adds more overhead and therefore may reduce throughput.
1777 */
1778static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1779 __releases(this_rq->lock)
1780 __acquires(busiest->lock)
1781 __acquires(this_rq->lock)
1782{
1783 raw_spin_unlock(&this_rq->lock);
1784 double_rq_lock(this_rq, busiest);
1785
1786 return 1;
1787}
1788
1789#else
1790/*
1791 * Unfair double_lock_balance: Optimizes throughput at the expense of
1792 * latency by eliminating extra atomic operations when the locks are
1793 * already in proper order on entry. This favors lower cpu-ids and will
1794 * grant the double lock to lower cpus over higher ids under contention,
1795 * regardless of entry order into the function.
1796 */
1797static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1798 __releases(this_rq->lock)
1799 __acquires(busiest->lock)
1800 __acquires(this_rq->lock)
1801{
1802 int ret = 0;
1803
1804 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1805 if (busiest < this_rq) {
1806 raw_spin_unlock(&this_rq->lock);
1807 raw_spin_lock(&busiest->lock);
1808 raw_spin_lock_nested(&this_rq->lock,
1809 SINGLE_DEPTH_NESTING);
1810 ret = 1;
1811 } else
1812 raw_spin_lock_nested(&busiest->lock,
1813 SINGLE_DEPTH_NESTING);
1814 }
1815 return ret;
1816}
1817
1818#endif /* CONFIG_PREEMPT */
1819
1820/*
1821 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1822 */
1823static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1824{
1825 if (unlikely(!irqs_disabled())) {
1826 /* printk() doesn't work good under rq->lock */
1827 raw_spin_unlock(&this_rq->lock);
1828 BUG_ON(1);
1829 }
1830
1831 return _double_lock_balance(this_rq, busiest);
1832}
1833
1834static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1835 __releases(busiest->lock)
1836{
1837 raw_spin_unlock(&busiest->lock);
1838 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1839}
1840
1841/*
1842 * double_rq_lock - safely lock two runqueues
1843 *
1844 * Note this does not disable interrupts like task_rq_lock,
1845 * you need to do so manually before calling.
1846 */
1847static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1848 __acquires(rq1->lock)
1849 __acquires(rq2->lock)
1850{
1851 BUG_ON(!irqs_disabled());
1852 if (rq1 == rq2) {
1853 raw_spin_lock(&rq1->lock);
1854 __acquire(rq2->lock); /* Fake it out ;) */
1855 } else {
1856 if (rq1 < rq2) {
1857 raw_spin_lock(&rq1->lock);
1858 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1859 } else {
1860 raw_spin_lock(&rq2->lock);
1861 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1862 }
1863 }
1864}
1865
1866/*
1867 * double_rq_unlock - safely unlock two runqueues
1868 *
1869 * Note this does not restore interrupts like task_rq_unlock,
1870 * you need to do so manually after calling.
1871 */
1872static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1873 __releases(rq1->lock)
1874 __releases(rq2->lock)
1875{
1876 raw_spin_unlock(&rq1->lock);
1877 if (rq1 != rq2)
1878 raw_spin_unlock(&rq2->lock);
1879 else
1880 __release(rq2->lock);
1881}
1882
1883#else /* CONFIG_SMP */
1884
1885/*
1886 * double_rq_lock - safely lock two runqueues
1887 *
1888 * Note this does not disable interrupts like task_rq_lock,
1889 * you need to do so manually before calling.
1890 */
1891static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1892 __acquires(rq1->lock)
1893 __acquires(rq2->lock)
1894{
1895 BUG_ON(!irqs_disabled());
1896 BUG_ON(rq1 != rq2);
1897 raw_spin_lock(&rq1->lock);
1898 __acquire(rq2->lock); /* Fake it out ;) */
1899}
1900
1901/*
1902 * double_rq_unlock - safely unlock two runqueues
1903 *
1904 * Note this does not restore interrupts like task_rq_unlock,
1905 * you need to do so manually after calling.
1906 */
1907static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1908 __releases(rq1->lock)
1909 __releases(rq2->lock)
1910{
1911 BUG_ON(rq1 != rq2);
1912 raw_spin_unlock(&rq1->lock);
1913 __release(rq2->lock);
1914}
1915
1916#endif
1917
1918static void calc_load_account_idle(struct rq *this_rq);
1919static void update_sysctl(void);
1920static int get_update_sysctl_factor(void);
1921static void update_cpu_load(struct rq *this_rq);
1922
1923static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1924{
1925 set_task_rq(p, cpu);
1926#ifdef CONFIG_SMP
1927 /*
1928 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1929 * successfully executed on another CPU. We must ensure that updates of
1930 * per-task data have been completed by this moment.
1931 */
1932 smp_wmb();
1933 task_thread_info(p)->cpu = cpu;
1934#endif
1935}
1936
1937static const struct sched_class rt_sched_class;
1938
1939#define sched_class_highest (&stop_sched_class)
1940#define for_each_class(class) \
1941 for (class = sched_class_highest; class; class = class->next)
1942
1943#include "sched_stats.h"
1944
1945static void inc_nr_running(struct rq *rq)
1946{
1947 rq->nr_running++;
1948}
1949
1950static void dec_nr_running(struct rq *rq)
1951{
1952 rq->nr_running--;
1953}
1954 660
1955static void set_load_weight(struct task_struct *p) 661static void set_load_weight(struct task_struct *p)
1956{ 662{
@@ -1987,7 +693,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1987/* 693/*
1988 * activate_task - move a task to the runqueue. 694 * activate_task - move a task to the runqueue.
1989 */ 695 */
1990static void activate_task(struct rq *rq, struct task_struct *p, int flags) 696void activate_task(struct rq *rq, struct task_struct *p, int flags)
1991{ 697{
1992 if (task_contributes_to_load(p)) 698 if (task_contributes_to_load(p))
1993 rq->nr_uninterruptible--; 699 rq->nr_uninterruptible--;
@@ -1998,7 +704,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1998/* 704/*
1999 * deactivate_task - remove a task from the runqueue. 705 * deactivate_task - remove a task from the runqueue.
2000 */ 706 */
2001static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 707void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
2002{ 708{
2003 if (task_contributes_to_load(p)) 709 if (task_contributes_to_load(p))
2004 rq->nr_uninterruptible++; 710 rq->nr_uninterruptible++;
@@ -2223,15 +929,6 @@ static int irqtime_account_si_update(void)
2223 929
2224#endif 930#endif
2225 931
2226#include "sched_idletask.c"
2227#include "sched_fair.c"
2228#include "sched_rt.c"
2229#include "sched_autogroup.c"
2230#include "sched_stoptask.c"
2231#ifdef CONFIG_SCHED_DEBUG
2232# include "sched_debug.c"
2233#endif
2234
2235void sched_set_stop_task(int cpu, struct task_struct *stop) 932void sched_set_stop_task(int cpu, struct task_struct *stop)
2236{ 933{
2237 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 934 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2329,7 +1026,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2329 p->sched_class->prio_changed(rq, p, oldprio); 1026 p->sched_class->prio_changed(rq, p, oldprio);
2330} 1027}
2331 1028
2332static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1029void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2333{ 1030{
2334 const struct sched_class *class; 1031 const struct sched_class *class;
2335 1032
@@ -2355,38 +1052,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2355} 1052}
2356 1053
2357#ifdef CONFIG_SMP 1054#ifdef CONFIG_SMP
2358/*
2359 * Is this task likely cache-hot:
2360 */
2361static int
2362task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2363{
2364 s64 delta;
2365
2366 if (p->sched_class != &fair_sched_class)
2367 return 0;
2368
2369 if (unlikely(p->policy == SCHED_IDLE))
2370 return 0;
2371
2372 /*
2373 * Buddy candidates are cache hot:
2374 */
2375 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2376 (&p->se == cfs_rq_of(&p->se)->next ||
2377 &p->se == cfs_rq_of(&p->se)->last))
2378 return 1;
2379
2380 if (sysctl_sched_migration_cost == -1)
2381 return 1;
2382 if (sysctl_sched_migration_cost == 0)
2383 return 0;
2384
2385 delta = now - p->se.exec_start;
2386
2387 return delta < (s64)sysctl_sched_migration_cost;
2388}
2389
2390void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1055void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2391{ 1056{
2392#ifdef CONFIG_SCHED_DEBUG 1057#ifdef CONFIG_SCHED_DEBUG
@@ -3469,7 +2134,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3469 */ 2134 */
3470static atomic_long_t calc_load_tasks_idle; 2135static atomic_long_t calc_load_tasks_idle;
3471 2136
3472static void calc_load_account_idle(struct rq *this_rq) 2137void calc_load_account_idle(struct rq *this_rq)
3473{ 2138{
3474 long delta; 2139 long delta;
3475 2140
@@ -3613,7 +2278,7 @@ static void calc_global_nohz(unsigned long ticks)
3613 */ 2278 */
3614} 2279}
3615#else 2280#else
3616static void calc_load_account_idle(struct rq *this_rq) 2281void calc_load_account_idle(struct rq *this_rq)
3617{ 2282{
3618} 2283}
3619 2284
@@ -3756,7 +2421,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3756 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2421 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3757 * every tick. We fix it up based on jiffies. 2422 * every tick. We fix it up based on jiffies.
3758 */ 2423 */
3759static void update_cpu_load(struct rq *this_rq) 2424void update_cpu_load(struct rq *this_rq)
3760{ 2425{
3761 unsigned long this_load = this_rq->load.weight; 2426 unsigned long this_load = this_rq->load.weight;
3762 unsigned long curr_jiffies = jiffies; 2427 unsigned long curr_jiffies = jiffies;
@@ -6148,53 +4813,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6148#endif 4813#endif
6149} 4814}
6150 4815
6151/*
6152 * Increase the granularity value when there are more CPUs,
6153 * because with more CPUs the 'effective latency' as visible
6154 * to users decreases. But the relationship is not linear,
6155 * so pick a second-best guess by going with the log2 of the
6156 * number of CPUs.
6157 *
6158 * This idea comes from the SD scheduler of Con Kolivas:
6159 */
6160static int get_update_sysctl_factor(void)
6161{
6162 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6163 unsigned int factor;
6164
6165 switch (sysctl_sched_tunable_scaling) {
6166 case SCHED_TUNABLESCALING_NONE:
6167 factor = 1;
6168 break;
6169 case SCHED_TUNABLESCALING_LINEAR:
6170 factor = cpus;
6171 break;
6172 case SCHED_TUNABLESCALING_LOG:
6173 default:
6174 factor = 1 + ilog2(cpus);
6175 break;
6176 }
6177
6178 return factor;
6179}
6180
6181static void update_sysctl(void)
6182{
6183 unsigned int factor = get_update_sysctl_factor();
6184
6185#define SET_SYSCTL(name) \
6186 (sysctl_##name = (factor) * normalized_sysctl_##name)
6187 SET_SYSCTL(sched_min_granularity);
6188 SET_SYSCTL(sched_latency);
6189 SET_SYSCTL(sched_wakeup_granularity);
6190#undef SET_SYSCTL
6191}
6192
6193static inline void sched_init_granularity(void)
6194{
6195 update_sysctl();
6196}
6197
6198#ifdef CONFIG_SMP 4816#ifdef CONFIG_SMP
6199void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4817void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6200{ 4818{
@@ -6381,30 +4999,6 @@ static void calc_global_load_remove(struct rq *rq)
6381 rq->calc_load_active = 0; 4999 rq->calc_load_active = 0;
6382} 5000}
6383 5001
6384#ifdef CONFIG_CFS_BANDWIDTH
6385static void unthrottle_offline_cfs_rqs(struct rq *rq)
6386{
6387 struct cfs_rq *cfs_rq;
6388
6389 for_each_leaf_cfs_rq(rq, cfs_rq) {
6390 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6391
6392 if (!cfs_rq->runtime_enabled)
6393 continue;
6394
6395 /*
6396 * clock_task is not advancing so we just need to make sure
6397 * there's some valid quota amount
6398 */
6399 cfs_rq->runtime_remaining = cfs_b->quota;
6400 if (cfs_rq_throttled(cfs_rq))
6401 unthrottle_cfs_rq(cfs_rq);
6402 }
6403}
6404#else
6405static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6406#endif
6407
6408/* 5002/*
6409 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5003 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6410 * try_to_wake_up()->select_task_rq(). 5004 * try_to_wake_up()->select_task_rq().
@@ -7010,6 +5604,12 @@ out:
7010 return -ENOMEM; 5604 return -ENOMEM;
7011} 5605}
7012 5606
5607/*
5608 * By default the system creates a single root-domain with all cpus as
5609 * members (mimicking the global state we have today).
5610 */
5611struct root_domain def_root_domain;
5612
7013static void init_defrootdomain(void) 5613static void init_defrootdomain(void)
7014{ 5614{
7015 init_rootdomain(&def_root_domain); 5615 init_rootdomain(&def_root_domain);
@@ -7418,6 +6018,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7418 update_group_power(sd, cpu); 6018 update_group_power(sd, cpu);
7419} 6019}
7420 6020
6021int __weak arch_sd_sibling_asym_packing(void)
6022{
6023 return 0*SD_ASYM_PACKING;
6024}
6025
7421/* 6026/*
7422 * Initializers for schedule domains 6027 * Initializers for schedule domains
7423 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6028 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
@@ -8053,29 +6658,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8053 } 6658 }
8054} 6659}
8055 6660
8056static int update_runtime(struct notifier_block *nfb,
8057 unsigned long action, void *hcpu)
8058{
8059 int cpu = (int)(long)hcpu;
8060
8061 switch (action) {
8062 case CPU_DOWN_PREPARE:
8063 case CPU_DOWN_PREPARE_FROZEN:
8064 disable_runtime(cpu_rq(cpu));
8065 return NOTIFY_OK;
8066
8067 case CPU_DOWN_FAILED:
8068 case CPU_DOWN_FAILED_FROZEN:
8069 case CPU_ONLINE:
8070 case CPU_ONLINE_FROZEN:
8071 enable_runtime(cpu_rq(cpu));
8072 return NOTIFY_OK;
8073
8074 default:
8075 return NOTIFY_DONE;
8076 }
8077}
8078
8079void __init sched_init_smp(void) 6661void __init sched_init_smp(void)
8080{ 6662{
8081 cpumask_var_t non_isolated_cpus; 6663 cpumask_var_t non_isolated_cpus;
@@ -8124,104 +6706,11 @@ int in_sched_functions(unsigned long addr)
8124 && addr < (unsigned long)__sched_text_end); 6706 && addr < (unsigned long)__sched_text_end);
8125} 6707}
8126 6708
8127static void init_cfs_rq(struct cfs_rq *cfs_rq) 6709#ifdef CONFIG_CGROUP_SCHED
8128{ 6710struct task_group root_task_group;
8129 cfs_rq->tasks_timeline = RB_ROOT;
8130 INIT_LIST_HEAD(&cfs_rq->tasks);
8131 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8132#ifndef CONFIG_64BIT
8133 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8134#endif
8135}
8136
8137static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8138{
8139 struct rt_prio_array *array;
8140 int i;
8141
8142 array = &rt_rq->active;
8143 for (i = 0; i < MAX_RT_PRIO; i++) {
8144 INIT_LIST_HEAD(array->queue + i);
8145 __clear_bit(i, array->bitmap);
8146 }
8147 /* delimiter for bitsearch: */
8148 __set_bit(MAX_RT_PRIO, array->bitmap);
8149
8150#if defined CONFIG_SMP
8151 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8152 rt_rq->highest_prio.next = MAX_RT_PRIO;
8153 rt_rq->rt_nr_migratory = 0;
8154 rt_rq->overloaded = 0;
8155 plist_head_init(&rt_rq->pushable_tasks);
8156#endif
8157
8158 rt_rq->rt_time = 0;
8159 rt_rq->rt_throttled = 0;
8160 rt_rq->rt_runtime = 0;
8161 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8162}
8163
8164#ifdef CONFIG_FAIR_GROUP_SCHED
8165static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8166 struct sched_entity *se, int cpu,
8167 struct sched_entity *parent)
8168{
8169 struct rq *rq = cpu_rq(cpu);
8170
8171 cfs_rq->tg = tg;
8172 cfs_rq->rq = rq;
8173#ifdef CONFIG_SMP
8174 /* allow initial update_cfs_load() to truncate */
8175 cfs_rq->load_stamp = 1;
8176#endif
8177 init_cfs_rq_runtime(cfs_rq);
8178
8179 tg->cfs_rq[cpu] = cfs_rq;
8180 tg->se[cpu] = se;
8181
8182 /* se could be NULL for root_task_group */
8183 if (!se)
8184 return;
8185
8186 if (!parent)
8187 se->cfs_rq = &rq->cfs;
8188 else
8189 se->cfs_rq = parent->my_q;
8190
8191 se->my_q = cfs_rq;
8192 update_load_set(&se->load, 0);
8193 se->parent = parent;
8194}
8195#endif 6711#endif
8196 6712
8197#ifdef CONFIG_RT_GROUP_SCHED 6713DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8198static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8199 struct sched_rt_entity *rt_se, int cpu,
8200 struct sched_rt_entity *parent)
8201{
8202 struct rq *rq = cpu_rq(cpu);
8203
8204 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8205 rt_rq->rt_nr_boosted = 0;
8206 rt_rq->rq = rq;
8207 rt_rq->tg = tg;
8208
8209 tg->rt_rq[cpu] = rt_rq;
8210 tg->rt_se[cpu] = rt_se;
8211
8212 if (!rt_se)
8213 return;
8214
8215 if (!parent)
8216 rt_se->rt_rq = &rq->rt;
8217 else
8218 rt_se->rt_rq = parent->my_q;
8219
8220 rt_se->my_q = rt_rq;
8221 rt_se->parent = parent;
8222 INIT_LIST_HEAD(&rt_se->run_list);
8223}
8224#endif
8225 6714
8226void __init sched_init(void) 6715void __init sched_init(void)
8227{ 6716{
@@ -8294,7 +6783,7 @@ void __init sched_init(void)
8294 init_cfs_rq(&rq->cfs); 6783 init_cfs_rq(&rq->cfs);
8295 init_rt_rq(&rq->rt, rq); 6784 init_rt_rq(&rq->rt, rq);
8296#ifdef CONFIG_FAIR_GROUP_SCHED 6785#ifdef CONFIG_FAIR_GROUP_SCHED
8297 root_task_group.shares = root_task_group_load; 6786 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8298 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6787 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8299 /* 6788 /*
8300 * How much cpu bandwidth does root_task_group get? 6789 * How much cpu bandwidth does root_task_group get?
@@ -8357,10 +6846,6 @@ void __init sched_init(void)
8357 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6846 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8358#endif 6847#endif
8359 6848
8360#ifdef CONFIG_SMP
8361 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8362#endif
8363
8364#ifdef CONFIG_RT_MUTEXES 6849#ifdef CONFIG_RT_MUTEXES
8365 plist_head_init(&init_task.pi_waiters); 6850 plist_head_init(&init_task.pi_waiters);
8366#endif 6851#endif
@@ -8388,17 +6873,11 @@ void __init sched_init(void)
8388 6873
8389#ifdef CONFIG_SMP 6874#ifdef CONFIG_SMP
8390 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6875 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8391#ifdef CONFIG_NO_HZ
8392 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8393 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8394 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8395 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8396 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8397#endif
8398 /* May be allocated at isolcpus cmdline parse time */ 6876 /* May be allocated at isolcpus cmdline parse time */
8399 if (cpu_isolated_map == NULL) 6877 if (cpu_isolated_map == NULL)
8400 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6878 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8401#endif /* SMP */ 6879#endif
6880 init_sched_fair_class();
8402 6881
8403 scheduler_running = 1; 6882 scheduler_running = 1;
8404} 6883}
@@ -8550,169 +7029,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8550 7029
8551#endif 7030#endif
8552 7031
8553#ifdef CONFIG_FAIR_GROUP_SCHED
8554static void free_fair_sched_group(struct task_group *tg)
8555{
8556 int i;
8557
8558 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8559
8560 for_each_possible_cpu(i) {
8561 if (tg->cfs_rq)
8562 kfree(tg->cfs_rq[i]);
8563 if (tg->se)
8564 kfree(tg->se[i]);
8565 }
8566
8567 kfree(tg->cfs_rq);
8568 kfree(tg->se);
8569}
8570
8571static
8572int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8573{
8574 struct cfs_rq *cfs_rq;
8575 struct sched_entity *se;
8576 int i;
8577
8578 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8579 if (!tg->cfs_rq)
8580 goto err;
8581 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8582 if (!tg->se)
8583 goto err;
8584
8585 tg->shares = NICE_0_LOAD;
8586
8587 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8588
8589 for_each_possible_cpu(i) {
8590 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8591 GFP_KERNEL, cpu_to_node(i));
8592 if (!cfs_rq)
8593 goto err;
8594
8595 se = kzalloc_node(sizeof(struct sched_entity),
8596 GFP_KERNEL, cpu_to_node(i));
8597 if (!se)
8598 goto err_free_rq;
8599
8600 init_cfs_rq(cfs_rq);
8601 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8602 }
8603
8604 return 1;
8605
8606err_free_rq:
8607 kfree(cfs_rq);
8608err:
8609 return 0;
8610}
8611
8612static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8613{
8614 struct rq *rq = cpu_rq(cpu);
8615 unsigned long flags;
8616
8617 /*
8618 * Only empty task groups can be destroyed; so we can speculatively
8619 * check on_list without danger of it being re-added.
8620 */
8621 if (!tg->cfs_rq[cpu]->on_list)
8622 return;
8623
8624 raw_spin_lock_irqsave(&rq->lock, flags);
8625 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8626 raw_spin_unlock_irqrestore(&rq->lock, flags);
8627}
8628#else /* !CONFIG_FAIR_GROUP_SCHED */
8629static inline void free_fair_sched_group(struct task_group *tg)
8630{
8631}
8632
8633static inline
8634int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8635{
8636 return 1;
8637}
8638
8639static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8640{
8641}
8642#endif /* CONFIG_FAIR_GROUP_SCHED */
8643
8644#ifdef CONFIG_RT_GROUP_SCHED 7032#ifdef CONFIG_RT_GROUP_SCHED
8645static void free_rt_sched_group(struct task_group *tg)
8646{
8647 int i;
8648
8649 if (tg->rt_se)
8650 destroy_rt_bandwidth(&tg->rt_bandwidth);
8651
8652 for_each_possible_cpu(i) {
8653 if (tg->rt_rq)
8654 kfree(tg->rt_rq[i]);
8655 if (tg->rt_se)
8656 kfree(tg->rt_se[i]);
8657 }
8658
8659 kfree(tg->rt_rq);
8660 kfree(tg->rt_se);
8661}
8662
8663static
8664int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8665{
8666 struct rt_rq *rt_rq;
8667 struct sched_rt_entity *rt_se;
8668 int i;
8669
8670 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8671 if (!tg->rt_rq)
8672 goto err;
8673 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8674 if (!tg->rt_se)
8675 goto err;
8676
8677 init_rt_bandwidth(&tg->rt_bandwidth,
8678 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8679
8680 for_each_possible_cpu(i) {
8681 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8682 GFP_KERNEL, cpu_to_node(i));
8683 if (!rt_rq)
8684 goto err;
8685
8686 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8687 GFP_KERNEL, cpu_to_node(i));
8688 if (!rt_se)
8689 goto err_free_rq;
8690
8691 init_rt_rq(rt_rq, cpu_rq(i));
8692 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8693 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8694 }
8695
8696 return 1;
8697
8698err_free_rq:
8699 kfree(rt_rq);
8700err:
8701 return 0;
8702}
8703#else /* !CONFIG_RT_GROUP_SCHED */ 7033#else /* !CONFIG_RT_GROUP_SCHED */
8704static inline void free_rt_sched_group(struct task_group *tg)
8705{
8706}
8707
8708static inline
8709int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8710{
8711 return 1;
8712}
8713#endif /* CONFIG_RT_GROUP_SCHED */ 7034#endif /* CONFIG_RT_GROUP_SCHED */
8714 7035
8715#ifdef CONFIG_CGROUP_SCHED 7036#ifdef CONFIG_CGROUP_SCHED
7037/* task_group_lock serializes the addition/removal of task groups */
7038static DEFINE_SPINLOCK(task_group_lock);
7039
8716static void free_sched_group(struct task_group *tg) 7040static void free_sched_group(struct task_group *tg)
8717{ 7041{
8718 free_fair_sched_group(tg); 7042 free_fair_sched_group(tg);
@@ -8818,47 +7142,6 @@ void sched_move_task(struct task_struct *tsk)
8818#endif /* CONFIG_CGROUP_SCHED */ 7142#endif /* CONFIG_CGROUP_SCHED */
8819 7143
8820#ifdef CONFIG_FAIR_GROUP_SCHED 7144#ifdef CONFIG_FAIR_GROUP_SCHED
8821static DEFINE_MUTEX(shares_mutex);
8822
8823int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8824{
8825 int i;
8826 unsigned long flags;
8827
8828 /*
8829 * We can't change the weight of the root cgroup.
8830 */
8831 if (!tg->se[0])
8832 return -EINVAL;
8833
8834 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8835
8836 mutex_lock(&shares_mutex);
8837 if (tg->shares == shares)
8838 goto done;
8839
8840 tg->shares = shares;
8841 for_each_possible_cpu(i) {
8842 struct rq *rq = cpu_rq(i);
8843 struct sched_entity *se;
8844
8845 se = tg->se[i];
8846 /* Propagate contribution to hierarchy */
8847 raw_spin_lock_irqsave(&rq->lock, flags);
8848 for_each_sched_entity(se)
8849 update_cfs_shares(group_cfs_rq(se));
8850 raw_spin_unlock_irqrestore(&rq->lock, flags);
8851 }
8852
8853done:
8854 mutex_unlock(&shares_mutex);
8855 return 0;
8856}
8857
8858unsigned long sched_group_shares(struct task_group *tg)
8859{
8860 return tg->shares;
8861}
8862#endif 7145#endif
8863 7146
8864#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7147#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8883,7 +7166,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8883 struct task_struct *g, *p; 7166 struct task_struct *g, *p;
8884 7167
8885 do_each_thread(g, p) { 7168 do_each_thread(g, p) {
8886 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7169 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8887 return 1; 7170 return 1;
8888 } while_each_thread(g, p); 7171 } while_each_thread(g, p);
8889 7172
@@ -9235,7 +7518,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9235static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7518static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9236{ 7519{
9237 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7520 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9238 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7521 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9239 7522
9240 if (tg == &root_task_group) 7523 if (tg == &root_task_group)
9241 return -EINVAL; 7524 return -EINVAL;
@@ -9264,7 +7547,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9264 runtime_enabled = quota != RUNTIME_INF; 7547 runtime_enabled = quota != RUNTIME_INF;
9265 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7548 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9266 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7549 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9267
9268 raw_spin_lock_irq(&cfs_b->lock); 7550 raw_spin_lock_irq(&cfs_b->lock);
9269 cfs_b->period = ns_to_ktime(period); 7551 cfs_b->period = ns_to_ktime(period);
9270 cfs_b->quota = quota; 7552 cfs_b->quota = quota;
@@ -9280,13 +7562,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9280 7562
9281 for_each_possible_cpu(i) { 7563 for_each_possible_cpu(i) {
9282 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7564 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9283 struct rq *rq = rq_of(cfs_rq); 7565 struct rq *rq = cfs_rq->rq;
9284 7566
9285 raw_spin_lock_irq(&rq->lock); 7567 raw_spin_lock_irq(&rq->lock);
9286 cfs_rq->runtime_enabled = runtime_enabled; 7568 cfs_rq->runtime_enabled = runtime_enabled;
9287 cfs_rq->runtime_remaining = 0; 7569 cfs_rq->runtime_remaining = 0;
9288 7570
9289 if (cfs_rq_throttled(cfs_rq)) 7571 if (cfs_rq->throttled)
9290 unthrottle_cfs_rq(cfs_rq); 7572 unthrottle_cfs_rq(cfs_rq);
9291 raw_spin_unlock_irq(&rq->lock); 7573 raw_spin_unlock_irq(&rq->lock);
9292 } 7574 }
@@ -9300,7 +7582,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9300{ 7582{
9301 u64 quota, period; 7583 u64 quota, period;
9302 7584
9303 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7585 period = ktime_to_ns(tg->cfs_bandwidth.period);
9304 if (cfs_quota_us < 0) 7586 if (cfs_quota_us < 0)
9305 quota = RUNTIME_INF; 7587 quota = RUNTIME_INF;
9306 else 7588 else
@@ -9313,10 +7595,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9313{ 7595{
9314 u64 quota_us; 7596 u64 quota_us;
9315 7597
9316 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7598 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9317 return -1; 7599 return -1;
9318 7600
9319 quota_us = tg_cfs_bandwidth(tg)->quota; 7601 quota_us = tg->cfs_bandwidth.quota;
9320 do_div(quota_us, NSEC_PER_USEC); 7602 do_div(quota_us, NSEC_PER_USEC);
9321 7603
9322 return quota_us; 7604 return quota_us;
@@ -9327,7 +7609,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9327 u64 quota, period; 7609 u64 quota, period;
9328 7610
9329 period = (u64)cfs_period_us * NSEC_PER_USEC; 7611 period = (u64)cfs_period_us * NSEC_PER_USEC;
9330 quota = tg_cfs_bandwidth(tg)->quota; 7612 quota = tg->cfs_bandwidth.quota;
9331 7613
9332 if (period <= 0) 7614 if (period <= 0)
9333 return -EINVAL; 7615 return -EINVAL;
@@ -9339,7 +7621,7 @@ long tg_get_cfs_period(struct task_group *tg)
9339{ 7621{
9340 u64 cfs_period_us; 7622 u64 cfs_period_us;
9341 7623
9342 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7624 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9343 do_div(cfs_period_us, NSEC_PER_USEC); 7625 do_div(cfs_period_us, NSEC_PER_USEC);
9344 7626
9345 return cfs_period_us; 7627 return cfs_period_us;
@@ -9399,13 +7681,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9399static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7681static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9400{ 7682{
9401 struct cfs_schedulable_data *d = data; 7683 struct cfs_schedulable_data *d = data;
9402 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7684 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9403 s64 quota = 0, parent_quota = -1; 7685 s64 quota = 0, parent_quota = -1;
9404 7686
9405 if (!tg->parent) { 7687 if (!tg->parent) {
9406 quota = RUNTIME_INF; 7688 quota = RUNTIME_INF;
9407 } else { 7689 } else {
9408 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7690 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9409 7691
9410 quota = normalize_cfs_quota(tg, d); 7692 quota = normalize_cfs_quota(tg, d);
9411 parent_quota = parent_b->hierarchal_quota; 7693 parent_quota = parent_b->hierarchal_quota;
@@ -9449,7 +7731,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9449 struct cgroup_map_cb *cb) 7731 struct cgroup_map_cb *cb)
9450{ 7732{
9451 struct task_group *tg = cgroup_tg(cgrp); 7733 struct task_group *tg = cgroup_tg(cgrp);
9452 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7734 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9453 7735
9454 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7736 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9455 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7737 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9748,7 +8030,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9748 * 8030 *
9749 * called with rq->lock held. 8031 * called with rq->lock held.
9750 */ 8032 */
9751static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8033void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9752{ 8034{
9753 struct cpuacct *ca; 8035 struct cpuacct *ca;
9754 int cpu; 8036 int cpu;
@@ -9790,7 +8072,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9790/* 8072/*
9791 * Charge the system/user time to the task's accounting group. 8073 * Charge the system/user time to the task's accounting group.
9792 */ 8074 */
9793static void cpuacct_update_stats(struct task_struct *tsk, 8075void cpuacct_update_stats(struct task_struct *tsk,
9794 enum cpuacct_stat_index idx, cputime_t val) 8076 enum cpuacct_stat_index idx, cputime_t val)
9795{ 8077{
9796 struct cpuacct *ca; 8078 struct cpuacct *ca;
diff --git a/kernel/sched.h b/kernel/sched.h
new file mode 100644
index 000000000000..675261ce3c4a
--- /dev/null
+++ b/kernel/sched.h
@@ -0,0 +1,1064 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "sched_cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
/*
 * rt_policy - tell whether @policy is one of the real-time policies.
 *
 * Returns 1 for SCHED_FIFO and SCHED_RR, 0 for every other value.
 */
static inline int rt_policy(int policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return 1;
	default:
		return 0;
	}
}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
/*
 * It pairs a bitmap with an array of MAX_RT_PRIO list heads. How the bits
 * map onto the queues is not visible here -- presumably bit N is set while
 * queue[N] is non-empty; confirm in the RT class implementation.
 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
/*
 * RT bandwidth settings: a period, a runtime value and an hrtimer, guarded by
 * rt_runtime_lock. struct task_group embeds one of these when
 * CONFIG_RT_GROUP_SCHED is set.
 * NOTE(review): the field names suggest "rt_runtime may be consumed per
 * rt_period, with rt_period_timer firing each period" -- confirm in sched_rt.c.
 */
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
/*
 * NOTE(review): this defines a `static` list head inside a header, so every
 * .c file that includes it gets its own private task_groups list. If more
 * than one translation unit adds to or walks the list, they will not see
 * each other's entries. Confirm which files use it; the usual pattern is an
 * `extern struct list_head task_groups;` declaration here with a single
 * LIST_HEAD() definition in one .c file.
 */
87static LIST_HEAD(task_groups);
88
/*
 * CFS bandwidth-control state. One instance is embedded in every
 * struct task_group (member cfs_bandwidth). The struct has no members unless
 * CONFIG_CFS_BANDWIDTH is set.
 */
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
 /* period is a ktime_t; quota is compared against RUNTIME_INF by its users */
92 ktime_t period;
93 u64 quota, runtime;
 /* read from the parent group when validating a child's quota */
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
 /* presumably links the throttled_list members of struct cfs_rq -- confirm */
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
/*
 * The se/cfs_rq and rt_se/rt_rq members are arrays of pointers indexed by
 * cpu number (set_task_rq() reads tg->cfs_rq[cpu], tg->se[cpu], ...).
 */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
 /* RT counterparts of se / cfs_rq above, also one entry per cpu */
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
 /* position in the group hierarchy; presumably parent is NULL for the root -- confirm */
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
 /* always present; struct cfs_bandwidth is empty without CONFIG_CFS_BANDWIDTH */
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetic problems.
147 * The weight of a cfs_rq is the sum of the weights of the entities
148 * queued on it, so the weight of an entity should not be too large,
149 * and neither should the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
/*
 * One instance is embedded in struct rq (member cfs). With
 * CONFIG_FAIR_GROUP_SCHED each task group additionally has one per cpu
 * (tg->cfs_rq[cpu]).
 */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
 /* NOTE(review): only on 32-bit; presumably a shadow copy so the 64-bit value can be read consistently -- confirm */
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to the currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e. when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
 /*
  * Per-runqueue bandwidth state. tg_set_cfs_bandwidth() writes
  * runtime_enabled and runtime_remaining under rq->lock and unthrottles
  * the cfs_rq when 'throttled' is set.
  */
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
/*
 * One instance is embedded in struct rq (member rt). With
 * CONFIG_RT_GROUP_SCHED each task group additionally has one per cpu
 * (tg->rt_rq[cpu]).
 */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
 /* NOTE(review): presumably throttle flag, time consumed and allowed runtime -- confirm in sched_rt.c */
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
 /* back pointers to a runqueue and the owning task group, plus a list link */
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
/* Each struct rq points at its root domain through rq->rd (SMP only). */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
 /* NOTE(review): presumably the cpus spanned by this domain and those of them currently online -- confirm */
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: those places that want to lock multiple runqueues
357 * (such as the load balancing or the thread migration code), lock
358 * acquire operations must be ordered by ascending &runqueue.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned char nohz_balance_kick;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
 /* the CFS and RT runqueues embedded in this cpu runqueue */
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
 /* curr is what task_current() and cpu_curr() read */
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could the above be rq->cfs.exec_clock + rq->rt.rt_runtime? (the members are named cfs and rt) */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
/*
 * cpu_of - return the cpu number that runqueue @rq belongs to.
 *
 * Without CONFIG_SMP struct rq has no cpu member, so the answer is always 0.
 */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define rcu_dereference_check_sched_domain(p) \
491 rcu_dereference_check((p), \
492 lockdep_is_held(&sched_domains_mutex))
493
494/*
495 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
496 * See detach_destroy_domains: synchronize_sched for details.
497 *
498 * The domain tree of any CPU may only be accessed from within
499 * preempt-disabled sections.
500 */
501#define for_each_domain(cpu, __sd) \
502 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
503
504#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
505#define this_rq() (&__get_cpu_var(runqueues))
506#define task_rq(p) cpu_rq(task_cpu(p))
507#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
508#define raw_rq() (&__raw_get_cpu_var(runqueues))
509
510#include "sched_stats.h"
511#include "sched_autogroup.h"
512
513#ifdef CONFIG_CGROUP_SCHED
514
515/*
516 * Return the group to which this tasks belongs.
517 *
518 * We use task_subsys_state_check() and extend the RCU verification with
519 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
520 * task it moves into the cgroup. Therefore by holding either of those locks,
521 * we pin the task to the current cgroup.
522 */
523static inline struct task_group *task_group(struct task_struct *p)
524{
525 struct task_group *tg;
526 struct cgroup_subsys_state *css;
527
528 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
529 lockdep_is_held(&p->pi_lock) ||
530 lockdep_is_held(&task_rq(p)->lock));
531 tg = container_of(css, struct task_group, css);
532
533 return autogroup_task_group(p, tg);
534}
535
/*
 * set_task_rq - re-point task @p at its group's per-cpu structures for @cpu.
 *
 * Used when a task moves across CPUs or groups. For each group-scheduling
 * flavour that is configured, the task's entity gets the parent entity and
 * the runqueue that its task group owns on @cpu.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
	struct task_group *grp = task_group(p);
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* CFS entity */
	p->se.parent = grp->se[cpu];
	p->se.cfs_rq = grp->cfs_rq[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	/* RT entity */
	p->rt.parent = grp->rt_se[cpu];
	p->rt.rt_rq = grp->rt_rq[cpu];
#endif
}
553
554#else /* CONFIG_CGROUP_SCHED */
555
/* Stub for builds without CONFIG_CGROUP_SCHED: nothing to update. */
556static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
/* Stub for builds without CONFIG_CGROUP_SCHED: there are no task groups, so always NULL. */
557static inline struct task_group *task_group(struct task_struct *p)
558{
559 return NULL;
560}
561
562#endif /* CONFIG_CGROUP_SCHED */
563
/*
 * __set_task_cpu - record @cpu as the cpu of task @p.
 *
 * First re-points the task's group runqueue links via set_task_rq(); on SMP
 * it then stores @cpu in the task's thread_info, after a write barrier.
 */
564static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
565{
566 set_task_rq(p, cpu);
567#ifdef CONFIG_SMP
568 /*
569 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
570 * successfully executed on another CPU. We must ensure that updates of
571 * per-task data have been completed by this moment.
572 */
573 smp_wmb();
574 task_thread_info(p)->cpu = cpu;
575#endif
576}
577
578/*
579 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
580 */
581#ifdef CONFIG_SCHED_DEBUG
582# define const_debug __read_mostly
583#else
584# define const_debug const
585#endif
586
587extern const_debug unsigned int sysctl_sched_features;
588
589#define SCHED_FEAT(name, enabled) \
590 __SCHED_FEAT_##name ,
591
592enum {
593#include "sched_features.h"
594};
595
596#undef SCHED_FEAT
597
598#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
599
600static inline u64 global_rt_period(void)
601{
602 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
603}
604
605static inline u64 global_rt_runtime(void)
606{
607 if (sysctl_sched_rt_runtime < 0)
608 return RUNTIME_INF;
609
610 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
611}
612
613
614
615static inline int task_current(struct rq *rq, struct task_struct *p)
616{
617 return rq->curr == p;
618}
619
/*
 * task_running - is task @p currently executing?
 *
 * On SMP this returns p->on_cpu; on uniprocessor builds it falls back to
 * task_current(), i.e. whether @p is rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->on_cpu;
#endif
}
628
629
630#ifndef prepare_arch_switch
631# define prepare_arch_switch(next) do { } while (0)
632#endif
633#ifndef finish_arch_switch
634# define finish_arch_switch(prev) do { } while (0)
635#endif
636
637#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/*
 * prepare_lock_switch - variant used when __ARCH_WANT_UNLOCKED_CTXSW is not
 * defined. It only marks @next as on a cpu (SMP builds); @rq is not touched
 * and rq->lock is not released here.
 */
638static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
639{
640#ifdef CONFIG_SMP
641 /*
642 * We can optimise this out completely for !SMP, because the
643 * SMP rebalancing from interrupt is the only thing that cares
644 * here.
645 */
646 next->on_cpu = 1;
647#endif
648}
649
/*
 * finish_lock_switch - variant used when __ARCH_WANT_UNLOCKED_CTXSW is not
 * defined.
 *
 * In order: clears prev->on_cpu behind a write barrier (SMP), fixes up the
 * debug owner and lockdep state of rq->lock, and finally drops rq->lock with
 * raw_spin_unlock_irq().
 */
650static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
651{
652#ifdef CONFIG_SMP
653 /*
654 * After ->on_cpu is cleared, the task can be moved to a different CPU.
655 * We must ensure this doesn't happen until the switch is completely
656 * finished.
657 */
658 smp_wmb();
659 prev->on_cpu = 0;
660#endif
661#ifdef CONFIG_DEBUG_SPINLOCK
662 /* this is a valid case when another task releases the spinlock */
663 rq->lock.owner = current;
664#endif
665 /*
666 * If we are tracking spinlock dependencies then we have to
667 * fix up the runqueue lock - which gets 'carried over' from
668 * prev into current:
669 */
670 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
671
672 raw_spin_unlock_irq(&rq->lock);
673}
674
675#else /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
 * prepare_lock_switch - variant for __ARCH_WANT_UNLOCKED_CTXSW.
 *
 * Marks @next as on a cpu (SMP builds) and then releases rq->lock here:
 * with raw_spin_unlock_irq() when __ARCH_WANT_INTERRUPTS_ON_CTXSW is
 * defined, with plain raw_spin_unlock() otherwise.
 */
676static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
677{
678#ifdef CONFIG_SMP
679 /*
680 * We can optimise this out completely for !SMP, because the
681 * SMP rebalancing from interrupt is the only thing that cares
682 * here.
683 */
684 next->on_cpu = 1;
685#endif
686#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
687 raw_spin_unlock_irq(&rq->lock);
688#else
689 raw_spin_unlock(&rq->lock);
690#endif
691}
692
/*
 * finish_lock_switch - variant for __ARCH_WANT_UNLOCKED_CTXSW.
 *
 * rq->lock is not touched here (the matching prepare_lock_switch() already
 * released it). Clears prev->on_cpu behind a write barrier (SMP) and calls
 * local_irq_enable() unless __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined.
 */
693static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
694{
695#ifdef CONFIG_SMP
696 /*
697 * After ->on_cpu is cleared, the task can be moved to a different CPU.
698 * We must ensure this doesn't happen until the switch is completely
699 * finished.
700 */
701 smp_wmb();
702 prev->on_cpu = 0;
703#endif
704#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
705 local_irq_enable();
706#endif
707}
708#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
709
710
711static inline void update_load_add(struct load_weight *lw, unsigned long inc)
712{
713 lw->weight += inc;
714 lw->inv_weight = 0;
715}
716
717static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
718{
719 lw->weight -= dec;
720 lw->inv_weight = 0;
721}
722
723static inline void update_load_set(struct load_weight *lw, unsigned long w)
724{
725 lw->weight = w;
726 lw->inv_weight = 0;
727}
728
729/*
730 * To aid in avoiding the subversion of "niceness" due to uneven distribution
731 * of tasks with abnormal "nice" values across CPUs the contribution that
732 * each task makes to its run queue's load is weighted according to its
733 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
734 * scaled version of the new time slice allocation that they receive on time
735 * slice expiry etc.
736 */
737
738#define WEIGHT_IDLEPRIO 3
739#define WMULT_IDLEPRIO 1431655765
740
741/*
742 * Nice levels are multiplicative, with a gentle 10% change for every
743 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
744 * nice 1, it will get ~10% less CPU time than another CPU-bound task
745 * that remained on nice 0.
746 *
747 * The "10% effect" is relative and cumulative: from _any_ nice level,
748 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
749 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
750 * If a task goes up by ~10% and another task goes down by ~10% then
751 * the relative distance between them is ~25%.)
752 */
753static const int prio_to_weight[40] = {
754 /* -20 */ 88761, 71755, 56483, 46273, 36291,
755 /* -15 */ 29154, 23254, 18705, 14949, 11916,
756 /* -10 */ 9548, 7620, 6100, 4904, 3906,
757 /* -5 */ 3121, 2501, 1991, 1586, 1277,
758 /* 0 */ 1024, 820, 655, 526, 423,
759 /* 5 */ 335, 272, 215, 172, 137,
760 /* 10 */ 110, 87, 70, 56, 45,
761 /* 15 */ 36, 29, 23, 18, 15,
762};
763
764/*
765 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
766 *
767 * In cases where the weight does not change often, we can use the
768 * precalculated inverse to speed up arithmetics by turning divisions
769 * into multiplications:
770 */
771static const u32 prio_to_wmult[40] = {
772 /* -20 */ 48388, 59856, 76040, 92818, 118348,
773 /* -15 */ 147320, 184698, 229616, 287308, 360437,
774 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
775 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
776 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
777 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
778 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
779 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
780};
781
782/* Time spent by the tasks of the cpu accounting group executing in ... */
783enum cpuacct_stat_index {
784 CPUACCT_STAT_USER, /* ... user mode */
785 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
786
787 CPUACCT_STAT_NSTATS,
788};
789
790
791#define sched_class_highest (&stop_sched_class)
792#define for_each_class(class) \
793 for (class = sched_class_highest; class; class = class->next)
794
795extern const struct sched_class stop_sched_class;
796extern const struct sched_class rt_sched_class;
797extern const struct sched_class fair_sched_class;
798extern const struct sched_class idle_sched_class;
799
800
801#ifdef CONFIG_SMP
802
803extern void trigger_load_balance(struct rq *rq, int cpu);
804extern void idle_balance(int this_cpu, struct rq *this_rq);
805
806#else /* CONFIG_SMP */
807
808static inline void idle_balance(int cpu, struct rq *rq)
809{
810}
811
812#endif
813
814extern void sysrq_sched_debug_show(void);
815extern void sched_init_granularity(void);
816extern void update_max_interval(void);
817extern void update_group_power(struct sched_domain *sd, int cpu);
818extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
819extern void init_sched_rt_class(void);
820extern void init_sched_fair_class(void);
821
822extern void resched_task(struct task_struct *p);
823extern void resched_cpu(int cpu);
824
825extern struct rt_bandwidth def_rt_bandwidth;
826extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
827
828extern void update_cpu_load(struct rq *this_rq);
829
830#ifdef CONFIG_CGROUP_CPUACCT
831extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
832extern void cpuacct_update_stats(struct task_struct *tsk,
833 enum cpuacct_stat_index idx, cputime_t val);
834#else
835static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
836static inline void cpuacct_update_stats(struct task_struct *tsk,
837 enum cpuacct_stat_index idx, cputime_t val) {}
838#endif
839
840static inline void inc_nr_running(struct rq *rq)
841{
842 rq->nr_running++;
843}
844
845static inline void dec_nr_running(struct rq *rq)
846{
847 rq->nr_running--;
848}
849
850extern void update_rq_clock(struct rq *rq);
851
852extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
853extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
854
855extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
856
857extern const_debug unsigned int sysctl_sched_time_avg;
858extern const_debug unsigned int sysctl_sched_nr_migrate;
859extern const_debug unsigned int sysctl_sched_migration_cost;
860
861static inline u64 sched_avg_period(void)
862{
863 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
864}
865
866void calc_load_account_idle(struct rq *this_rq);
867
868#ifdef CONFIG_SCHED_HRTICK
869
870/*
871 * Use hrtick when:
872 * - enabled by features
873 * - hrtimer is actually high res
874 */
875static inline int hrtick_enabled(struct rq *rq)
876{
877 if (!sched_feat(HRTICK))
878 return 0;
879 if (!cpu_active(cpu_of(rq)))
880 return 0;
881 return hrtimer_is_hres_active(&rq->hrtick_timer);
882}
883
884void hrtick_start(struct rq *rq, u64 delay);
885
886#endif /* CONFIG_SCHED_HRTICK */
887
888#ifdef CONFIG_SMP
889extern void sched_avg_update(struct rq *rq);
890static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
891{
892 rq->rt_avg += rt_delta;
893 sched_avg_update(rq);
894}
895#else
896static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
897static inline void sched_avg_update(struct rq *rq) { }
898#endif
899
900extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
901
902#ifdef CONFIG_SMP
903#ifdef CONFIG_PREEMPT
904
905static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
906
907/*
908 * fair double_lock_balance: Safely acquires both rq->locks in a fair
909 * way at the expense of forcing extra atomic operations in all
910 * invocations. This assures that the double_lock is acquired using the
911 * same underlying policy as the spinlock_t on this architecture, which
912 * reduces latency compared to the unfair variant below. However, it
913 * also adds more overhead and therefore may reduce throughput.
914 */
915static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
916 __releases(this_rq->lock)
917 __acquires(busiest->lock)
918 __acquires(this_rq->lock)
919{
920 raw_spin_unlock(&this_rq->lock);
921 double_rq_lock(this_rq, busiest);
922
923 return 1;
924}
925
926#else
927/*
928 * Unfair double_lock_balance: Optimizes throughput at the expense of
929 * latency by eliminating extra atomic operations when the locks are
930 * already in proper order on entry. This favors lower cpu-ids and will
931 * grant the double lock to lower cpus over higher ids under contention,
932 * regardless of entry order into the function.
933 */
934static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
935 __releases(this_rq->lock)
936 __acquires(busiest->lock)
937 __acquires(this_rq->lock)
938{
939 int ret = 0;
940
941 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
942 if (busiest < this_rq) {
943 raw_spin_unlock(&this_rq->lock);
944 raw_spin_lock(&busiest->lock);
945 raw_spin_lock_nested(&this_rq->lock,
946 SINGLE_DEPTH_NESTING);
947 ret = 1;
948 } else
949 raw_spin_lock_nested(&busiest->lock,
950 SINGLE_DEPTH_NESTING);
951 }
952 return ret;
953}
954
955#endif /* CONFIG_PREEMPT */
956
957/*
958 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
959 */
960static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
961{
962 if (unlikely(!irqs_disabled())) {
963 /* printk() doesn't work good under rq->lock */
964 raw_spin_unlock(&this_rq->lock);
965 BUG_ON(1);
966 }
967
968 return _double_lock_balance(this_rq, busiest);
969}
970
971static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
972 __releases(busiest->lock)
973{
974 raw_spin_unlock(&busiest->lock);
975 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
976}
977
978/*
979 * double_rq_lock - safely lock two runqueues
980 *
981 * Note this does not disable interrupts like task_rq_lock,
982 * you need to do so manually before calling.
983 */
984static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
985 __acquires(rq1->lock)
986 __acquires(rq2->lock)
987{
988 BUG_ON(!irqs_disabled());
989 if (rq1 == rq2) {
990 raw_spin_lock(&rq1->lock);
991 __acquire(rq2->lock); /* Fake it out ;) */
992 } else {
993 if (rq1 < rq2) {
994 raw_spin_lock(&rq1->lock);
995 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
996 } else {
997 raw_spin_lock(&rq2->lock);
998 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
999 }
1000 }
1001}
1002
1003/*
1004 * double_rq_unlock - safely unlock two runqueues
1005 *
1006 * Note this does not restore interrupts like task_rq_unlock,
1007 * you need to do so manually after calling.
1008 */
1009static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1010 __releases(rq1->lock)
1011 __releases(rq2->lock)
1012{
1013 raw_spin_unlock(&rq1->lock);
1014 if (rq1 != rq2)
1015 raw_spin_unlock(&rq2->lock);
1016 else
1017 __release(rq2->lock);
1018}
1019
1020#else /* CONFIG_SMP */
1021
1022/*
1023 * double_rq_lock - safely lock two runqueues
1024 *
1025 * Note this does not disable interrupts like task_rq_lock,
1026 * you need to do so manually before calling.
1027 */
1028static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1029 __acquires(rq1->lock)
1030 __acquires(rq2->lock)
1031{
1032 BUG_ON(!irqs_disabled());
1033 BUG_ON(rq1 != rq2);
1034 raw_spin_lock(&rq1->lock);
1035 __acquire(rq2->lock); /* Fake it out ;) */
1036}
1037
1038/*
1039 * double_rq_unlock - safely unlock two runqueues
1040 *
1041 * Note this does not restore interrupts like task_rq_unlock,
1042 * you need to do so manually after calling.
1043 */
1044static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1045 __releases(rq1->lock)
1046 __releases(rq2->lock)
1047{
1048 BUG_ON(rq1 != rq2);
1049 raw_spin_unlock(&rq1->lock);
1050 __release(rq2->lock);
1051}
1052
1053#endif
1054
1055extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1056extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1057extern void print_cfs_stats(struct seq_file *m, int cpu);
1058extern void print_rt_stats(struct seq_file *m, int cpu);
1059
1060extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1061extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1062extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1063
1064extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index a6710a112b4f..ce1a85f2ddcb 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a608593df243..cd3b64219d9f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1397 */
1288 1398
1289#ifdef CONFIG_CFS_BANDWIDTH 1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
1290/* 1426/*
1291 * default period for cfs group bandwidth. 1427 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1428 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1444 *
1309 * requires cfs_b->lock 1445 * requires cfs_b->lock
1310 */ 1446 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1448{
1313 u64 now; 1449 u64 now;
1314 1450
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1457}
1322 1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1323/* returns 0 on failure to allocate runtime */ 1464/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1466{
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1671 raw_spin_unlock(&cfs_b->lock);
1531} 1672}
1532 1673
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1675{
1535 struct rq *rq = rq_of(cfs_rq); 1676 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1839,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1839 1980
1840 throttle_cfs_rq(cfs_rq); 1981 throttle_cfs_rq(cfs_rq);
1841} 1982}
1842#else 1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
1843static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1844 unsigned long delta_exec) {} 2090 unsigned long delta_exec) {}
1845static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1861,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1861{ 2107{
1862 return 0; 2108 return 0;
1863} 2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1864#endif 2115#endif
1865 2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1866/************************************************** 2126/**************************************************
1867 * CFS operations on tasks: 2127 * CFS operations on tasks:
1868 */ 2128 */
@@ -2029,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2029} 2289}
2030 2290
2031#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
2032 2347
2033static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
2034{ 2349{
@@ -2783,6 +3098,38 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2783} 3098}
2784 3099
2785/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132/*
2786 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3133 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2787 */ 3134 */
2788static 3135static
@@ -3162,15 +3509,6 @@ struct sg_lb_stats {
3162}; 3509};
3163 3510
3164/** 3511/**
3165 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3166 * @group: The group whose first cpu is to be returned.
3167 */
3168static inline unsigned int group_first_cpu(struct sched_group *group)
3169{
3170 return cpumask_first(sched_group_cpus(group));
3171}
3172
3173/**
3174 * get_sd_load_idx - Obtain the load index for a given sched domain. 3512 * get_sd_load_idx - Obtain the load index for a given sched domain.
3175 * @sd: The sched_domain whose load_idx is to be obtained. 3513 * @sd: The sched_domain whose load_idx is to be obtained.
3176 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 3514 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
@@ -3419,7 +3757,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3419 sdg->sgp->power = power; 3757 sdg->sgp->power = power;
3420} 3758}
3421 3759
3422static void update_group_power(struct sched_domain *sd, int cpu) 3760void update_group_power(struct sched_domain *sd, int cpu)
3423{ 3761{
3424 struct sched_domain *child = sd->child; 3762 struct sched_domain *child = sd->child;
3425 struct sched_group *group, *sdg = sd->groups; 3763 struct sched_group *group, *sdg = sd->groups;
@@ -3685,11 +4023,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3685 } while (sg != sd->groups); 4023 } while (sg != sd->groups);
3686} 4024}
3687 4025
3688int __weak arch_sd_sibling_asym_packing(void)
3689{
3690 return 0*SD_ASYM_PACKING;
3691}
3692
3693/** 4026/**
3694 * check_asym_packing - Check to see if the group is packed into the 4027 * check_asym_packing - Check to see if the group is packed into the
3695 * sched doman. 4028 * sched doman.
@@ -4053,7 +4386,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4053#define MAX_PINNED_INTERVAL 512 4386#define MAX_PINNED_INTERVAL 512
4054 4387
4055/* Working cpumask for load_balance and load_balance_newidle. */ 4388/* Working cpumask for load_balance and load_balance_newidle. */
4056static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4389DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4057 4390
4058static int need_active_balance(struct sched_domain *sd, int idle, 4391static int need_active_balance(struct sched_domain *sd, int idle,
4059 int busiest_cpu, int this_cpu) 4392 int busiest_cpu, int this_cpu)
@@ -4256,7 +4589,7 @@ out:
4256 * idle_balance is called by schedule() if this_cpu is about to become 4589 * idle_balance is called by schedule() if this_cpu is about to become
4257 * idle. Attempts to pull tasks from other CPUs. 4590 * idle. Attempts to pull tasks from other CPUs.
4258 */ 4591 */
4259static void idle_balance(int this_cpu, struct rq *this_rq) 4592void idle_balance(int this_cpu, struct rq *this_rq)
4260{ 4593{
4261 struct sched_domain *sd; 4594 struct sched_domain *sd;
4262 int pulled_task = 0; 4595 int pulled_task = 0;
@@ -4631,7 +4964,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4631 * Scale the max load_balance interval with the number of CPUs in the system. 4964 * Scale the max load_balance interval with the number of CPUs in the system.
4632 * This trades load-balance latency on larger machines for less cross talk. 4965 * This trades load-balance latency on larger machines for less cross talk.
4633 */ 4966 */
4634static void update_max_interval(void) 4967void update_max_interval(void)
4635{ 4968{
4636 max_load_balance_interval = HZ*num_online_cpus()/10; 4969 max_load_balance_interval = HZ*num_online_cpus()/10;
4637} 4970}
@@ -4833,7 +5166,7 @@ static inline int on_null_domain(int cpu)
4833/* 5166/*
4834 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5167 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4835 */ 5168 */
4836static inline void trigger_load_balance(struct rq *rq, int cpu) 5169void trigger_load_balance(struct rq *rq, int cpu)
4837{ 5170{
4838 /* Don't need to rebalance while attached to NULL domain */ 5171 /* Don't need to rebalance while attached to NULL domain */
4839 if (time_after_eq(jiffies, rq->next_balance) && 5172 if (time_after_eq(jiffies, rq->next_balance) &&
@@ -4855,15 +5188,6 @@ static void rq_offline_fair(struct rq *rq)
4855 update_sysctl(); 5188 update_sysctl();
4856} 5189}
4857 5190
4858#else /* CONFIG_SMP */
4859
4860/*
4861 * on UP we do not need to balance between CPUs:
4862 */
4863static inline void idle_balance(int cpu, struct rq *rq)
4864{
4865}
4866
4867#endif /* CONFIG_SMP */ 5191#endif /* CONFIG_SMP */
4868 5192
4869/* 5193/*
@@ -5006,6 +5330,16 @@ static void set_curr_task_fair(struct rq *rq)
5006 } 5330 }
5007} 5331}
5008 5332
5333void init_cfs_rq(struct cfs_rq *cfs_rq)
5334{
5335 cfs_rq->tasks_timeline = RB_ROOT;
5336 INIT_LIST_HEAD(&cfs_rq->tasks);
5337 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5338#ifndef CONFIG_64BIT
5339 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5340#endif
5341}
5342
5009#ifdef CONFIG_FAIR_GROUP_SCHED 5343#ifdef CONFIG_FAIR_GROUP_SCHED
5010static void task_move_group_fair(struct task_struct *p, int on_rq) 5344static void task_move_group_fair(struct task_struct *p, int on_rq)
5011{ 5345{
@@ -5028,7 +5362,161 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5028 if (!on_rq) 5362 if (!on_rq)
5029 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5363 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5030} 5364}
5365
5366void free_fair_sched_group(struct task_group *tg)
5367{
5368 int i;
5369
5370 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5371
5372 for_each_possible_cpu(i) {
5373 if (tg->cfs_rq)
5374 kfree(tg->cfs_rq[i]);
5375 if (tg->se)
5376 kfree(tg->se[i]);
5377 }
5378
5379 kfree(tg->cfs_rq);
5380 kfree(tg->se);
5381}
5382
5383int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5384{
5385 struct cfs_rq *cfs_rq;
5386 struct sched_entity *se;
5387 int i;
5388
5389 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5390 if (!tg->cfs_rq)
5391 goto err;
5392 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5393 if (!tg->se)
5394 goto err;
5395
5396 tg->shares = NICE_0_LOAD;
5397
5398 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5399
5400 for_each_possible_cpu(i) {
5401 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5402 GFP_KERNEL, cpu_to_node(i));
5403 if (!cfs_rq)
5404 goto err;
5405
5406 se = kzalloc_node(sizeof(struct sched_entity),
5407 GFP_KERNEL, cpu_to_node(i));
5408 if (!se)
5409 goto err_free_rq;
5410
5411 init_cfs_rq(cfs_rq);
5412 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5413 }
5414
5415 return 1;
5416
5417err_free_rq:
5418 kfree(cfs_rq);
5419err:
5420 return 0;
5421}
5422
5423void unregister_fair_sched_group(struct task_group *tg, int cpu)
5424{
5425 struct rq *rq = cpu_rq(cpu);
5426 unsigned long flags;
5427
5428 /*
5429 * Only empty task groups can be destroyed; so we can speculatively
5430 * check on_list without danger of it being re-added.
5431 */
5432 if (!tg->cfs_rq[cpu]->on_list)
5433 return;
5434
5435 raw_spin_lock_irqsave(&rq->lock, flags);
5436 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5437 raw_spin_unlock_irqrestore(&rq->lock, flags);
5438}
5439
5440void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5441 struct sched_entity *se, int cpu,
5442 struct sched_entity *parent)
5443{
5444 struct rq *rq = cpu_rq(cpu);
5445
5446 cfs_rq->tg = tg;
5447 cfs_rq->rq = rq;
5448#ifdef CONFIG_SMP
5449 /* allow initial update_cfs_load() to truncate */
5450 cfs_rq->load_stamp = 1;
5031#endif 5451#endif
5452 init_cfs_rq_runtime(cfs_rq);
5453
5454 tg->cfs_rq[cpu] = cfs_rq;
5455 tg->se[cpu] = se;
5456
5457 /* se could be NULL for root_task_group */
5458 if (!se)
5459 return;
5460
5461 if (!parent)
5462 se->cfs_rq = &rq->cfs;
5463 else
5464 se->cfs_rq = parent->my_q;
5465
5466 se->my_q = cfs_rq;
5467 update_load_set(&se->load, 0);
5468 se->parent = parent;
5469}
5470
5471static DEFINE_MUTEX(shares_mutex);
5472
5473int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5474{
5475 int i;
5476 unsigned long flags;
5477
5478 /*
5479 * We can't change the weight of the root cgroup.
5480 */
5481 if (!tg->se[0])
5482 return -EINVAL;
5483
5484 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5485
5486 mutex_lock(&shares_mutex);
5487 if (tg->shares == shares)
5488 goto done;
5489
5490 tg->shares = shares;
5491 for_each_possible_cpu(i) {
5492 struct rq *rq = cpu_rq(i);
5493 struct sched_entity *se;
5494
5495 se = tg->se[i];
5496 /* Propagate contribution to hierarchy */
5497 raw_spin_lock_irqsave(&rq->lock, flags);
5498 for_each_sched_entity(se)
5499 update_cfs_shares(group_cfs_rq(se));
5500 raw_spin_unlock_irqrestore(&rq->lock, flags);
5501 }
5502
5503done:
5504 mutex_unlock(&shares_mutex);
5505 return 0;
5506}
5507#else /* CONFIG_FAIR_GROUP_SCHED */
5508
5509void free_fair_sched_group(struct task_group *tg) { }
5510
5511int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5512{
5513 return 1;
5514}
5515
5516void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5517
5518#endif /* CONFIG_FAIR_GROUP_SCHED */
5519
5032 5520
5033static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5521static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5034{ 5522{
@@ -5048,7 +5536,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5048/* 5536/*
5049 * All the scheduling class methods: 5537 * All the scheduling class methods:
5050 */ 5538 */
5051static const struct sched_class fair_sched_class = { 5539const struct sched_class fair_sched_class = {
5052 .next = &idle_sched_class, 5540 .next = &idle_sched_class,
5053 .enqueue_task = enqueue_task_fair, 5541 .enqueue_task = enqueue_task_fair,
5054 .dequeue_task = dequeue_task_fair, 5542 .dequeue_task = dequeue_task_fair,
@@ -5085,7 +5573,7 @@ static const struct sched_class fair_sched_class = {
5085}; 5573};
5086 5574
5087#ifdef CONFIG_SCHED_DEBUG 5575#ifdef CONFIG_SCHED_DEBUG
5088static void print_cfs_stats(struct seq_file *m, int cpu) 5576void print_cfs_stats(struct seq_file *m, int cpu)
5089{ 5577{
5090 struct cfs_rq *cfs_rq; 5578 struct cfs_rq *cfs_rq;
5091 5579
@@ -5095,3 +5583,19 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5095 rcu_read_unlock(); 5583 rcu_read_unlock();
5096} 5584}
5097#endif 5585#endif
5586
5587__init void init_sched_fair_class(void)
5588{
5589#ifdef CONFIG_SMP
5590 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5591
5592#ifdef CONFIG_NO_HZ
5593 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5594 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
5595 atomic_set(&nohz.load_balancer, nr_cpu_ids);
5596 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
5597 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
5598#endif
5599#endif /* SMP */
5600
5601}
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d95e861122cf..023b35502509 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -1178,8 +1376,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1376/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1377#define RT_MAX_TRIES 3
1180 1378
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1379static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1380{
1185 if (!task_running(rq, p) && 1381 if (!task_running(rq, p) &&
@@ -1653,13 +1849,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1849 pull_rt_task(rq);
1654} 1850}
1655 1851
1656static inline void init_sched_rt_class(void) 1852void init_sched_rt_class(void)
1657{ 1853{
1658 unsigned int i; 1854 unsigned int i;
1659 1855
1660 for_each_possible_cpu(i) 1856 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1857 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1858 GFP_KERNEL, cpu_to_node(i));
1859 }
1663} 1860}
1664#endif /* CONFIG_SMP */ 1861#endif /* CONFIG_SMP */
1665 1862
@@ -1800,7 +1997,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 1997 return 0;
1801} 1998}
1802 1999
1803static const struct sched_class rt_sched_class = { 2000const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2001 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2002 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2003 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2032,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2032#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2033extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2034
1838static void print_rt_stats(struct seq_file *m, int cpu) 2035void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2036{
1840 rt_rq_iter_t iter; 2037 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2038 struct rt_rq *rt_rq;
diff --git a/kernel/sched_stats.c b/kernel/sched_stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched_stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 87f9e36ea56e..ea2b6f0ec868 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,