author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-10-25 04:00:11 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-11-17 06:20:19 -0500
commit		029632fbb7b7c9d85063cc9eb470de6c54873df3 (patch)
tree		511303f0fa32f997c4b2f68364b032555b6a642e	/kernel/sched.c
parent		60686317da05049385eae86e44c710cde535f95f (diff)
sched: Make separate sched*.c translation units
Since one needs to do something at conferences, and fixing compile warnings doesn't actually require much if any attention, I decided to break up the sched.c #include "*.c" fest. This further modularizes the scheduler code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-x0fcd3mnp8f9c99grcpewmhi@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c | 1828
1 file changed, 55 insertions(+), 1773 deletions(-)
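For orientation before reading the hunks: this patch stops building the scheduler as one big translation unit, where kernel/sched.c textually included the class implementations as .c files, and instead turns each sched*.c file into its own translation unit that shares a local header. The sketch below is assembled from the removed and added lines in this diff; the shape of the shared "sched.h" header is an assumption, since the diffstat above is limited to kernel/sched.c and the header itself is not shown here.

/*
 * Before (the lines removed further down): sched.c pulled the scheduling
 * classes in as .c files, so the whole scheduler compiled as a single
 * translation unit and cross-file symbols could simply stay 'static'.
 */
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

/*
 * After: sched.c (and each now-separate sched*.c object) includes a common
 * local header instead, and symbols used across those files lose their
 * 'static' qualifier so the separate objects can link against them, e.g.
 * resched_task(), activate_task() and update_rq_clock() in the hunks below.
 */
#include "sched.h"

/*
 * Assumed, abridged shape of that shared header (not part of this file's
 * diff):
 *
 *	struct rq { ... };	(the runqueue definition moves out of sched.c)
 *	extern void resched_task(struct task_struct *p);
 *	extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 *	extern void update_rq_clock(struct rq *rq);
 */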
diff --git a/kernel/sched.c b/kernel/sched.c
index c9e3ab6e299e..2ffcceed8862 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -72,133 +71,20 @@
72#include <linux/ftrace.h> 71#include <linux/ftrace.h>
73#include <linux/slab.h> 72#include <linux/slab.h>
74#include <linux/init_task.h> 73#include <linux/init_task.h>
75#include <linux/jump_label.h>
76 74
77#include <asm/tlb.h> 75#include <asm/tlb.h>
78#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
79#include <asm/mutex.h>
80#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
81#include <asm/paravirt.h> 78#include <asm/paravirt.h>
82#endif 79#endif
83 80
84#include "sched_cpupri.h" 81#include "sched.h"
85#include "workqueue_sched.h" 82#include "workqueue_sched.h"
86#include "sched_autogroup.h"
87 83
88#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
89#include <trace/events/sched.h> 85#include <trace/events/sched.h>
90 86
-/*
+void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
92 * Convert user-nice values [ -20 ... 0 ... 19 ]
93 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
94 * and back.
95 */
96#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
97#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
98#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
99
100/*
101 * 'User priority' is the nice value converted to something we
102 * can work with better when scaling various scheduler parameters,
103 * it's a [ 0 ... 39 ] range.
104 */
105#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
106#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
107#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
108
109/*
110 * Helpers for converting nanosecond timing to jiffy resolution
111 */
112#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
113
114#define NICE_0_LOAD SCHED_LOAD_SCALE
115#define NICE_0_SHIFT SCHED_LOAD_SHIFT
116
117/*
118 * These are the 'tuning knobs' of the scheduler:
119 *
120 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
121 * Timeslices get refilled after they expire.
122 */
123#define DEF_TIMESLICE (100 * HZ / 1000)
124
125/*
126 * single value that denotes runtime == period, ie unlimited time.
127 */
128#define RUNTIME_INF ((u64)~0ULL)
129
130static inline int rt_policy(int policy)
131{
132 if (policy == SCHED_FIFO || policy == SCHED_RR)
133 return 1;
134 return 0;
135}
136
137static inline int task_has_rt_policy(struct task_struct *p)
138{
139 return rt_policy(p->policy);
140}
141
142/*
143 * This is the priority-queue data structure of the RT scheduling class:
144 */
145struct rt_prio_array {
146 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
147 struct list_head queue[MAX_RT_PRIO];
148};
149
150struct rt_bandwidth {
151 /* nests inside the rq lock: */
152 raw_spinlock_t rt_runtime_lock;
153 ktime_t rt_period;
154 u64 rt_runtime;
155 struct hrtimer rt_period_timer;
156};
157
158static struct rt_bandwidth def_rt_bandwidth;
159
160static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
161
162static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
163{
164 struct rt_bandwidth *rt_b =
165 container_of(timer, struct rt_bandwidth, rt_period_timer);
166 ktime_t now;
167 int overrun;
168 int idle = 0;
169
170 for (;;) {
171 now = hrtimer_cb_get_time(timer);
172 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
173
174 if (!overrun)
175 break;
176
177 idle = do_sched_rt_period_timer(rt_b, overrun);
178 }
179
180 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
181}
182
183static
184void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
185{
186 rt_b->rt_period = ns_to_ktime(period);
187 rt_b->rt_runtime = runtime;
188
189 raw_spin_lock_init(&rt_b->rt_runtime_lock);
190
191 hrtimer_init(&rt_b->rt_period_timer,
192 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
193 rt_b->rt_period_timer.function = sched_rt_period_timer;
194}
195
196static inline int rt_bandwidth_enabled(void)
197{
198 return sysctl_sched_rt_runtime >= 0;
199}
200
201static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
202{ 88{
203 unsigned long delta; 89 unsigned long delta;
204 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -218,609 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
218 } 104 }
219} 105}
220 106
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
+DEFINE_MUTEX(sched_domains_mutex);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
223 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
224 return;
225
226 if (hrtimer_active(&rt_b->rt_period_timer))
227 return;
228
229 raw_spin_lock(&rt_b->rt_runtime_lock);
230 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
231 raw_spin_unlock(&rt_b->rt_runtime_lock);
232}
233
234#ifdef CONFIG_RT_GROUP_SCHED
235static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
236{
237 hrtimer_cancel(&rt_b->rt_period_timer);
238}
239#endif
240
241/*
242 * sched_domains_mutex serializes calls to init_sched_domains,
243 * detach_destroy_domains and partition_sched_domains.
244 */
245static DEFINE_MUTEX(sched_domains_mutex);
246
247#ifdef CONFIG_CGROUP_SCHED
248
249#include <linux/cgroup.h>
250
251struct cfs_rq;
252
253static LIST_HEAD(task_groups);
254
255struct cfs_bandwidth {
256#ifdef CONFIG_CFS_BANDWIDTH
257 raw_spinlock_t lock;
258 ktime_t period;
259 u64 quota, runtime;
260 s64 hierarchal_quota;
261 u64 runtime_expires;
262
263 int idle, timer_active;
264 struct hrtimer period_timer, slack_timer;
265 struct list_head throttled_cfs_rq;
266
267 /* statistics */
268 int nr_periods, nr_throttled;
269 u64 throttled_time;
270#endif
271};
272
273/* task group related information */
274struct task_group {
275 struct cgroup_subsys_state css;
276
277#ifdef CONFIG_FAIR_GROUP_SCHED
278 /* schedulable entities of this group on each cpu */
279 struct sched_entity **se;
280 /* runqueue "owned" by this group on each cpu */
281 struct cfs_rq **cfs_rq;
282 unsigned long shares;
283
284 atomic_t load_weight;
285#endif
286
287#ifdef CONFIG_RT_GROUP_SCHED
288 struct sched_rt_entity **rt_se;
289 struct rt_rq **rt_rq;
290
291 struct rt_bandwidth rt_bandwidth;
292#endif
293
294 struct rcu_head rcu;
295 struct list_head list;
296
297 struct task_group *parent;
298 struct list_head siblings;
299 struct list_head children;
300
301#ifdef CONFIG_SCHED_AUTOGROUP
302 struct autogroup *autogroup;
303#endif
304
305 struct cfs_bandwidth cfs_bandwidth;
306};
307
308/* task_group_lock serializes the addition/removal of task groups */
309static DEFINE_SPINLOCK(task_group_lock);
310
311#ifdef CONFIG_FAIR_GROUP_SCHED
312
313# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
314
315/*
316 * A weight of 0 or 1 can cause arithmetics problems.
317 * A weight of a cfs_rq is the sum of weights of which entities
318 * are queued on this cfs_rq, so a weight of a entity should not be
319 * too large, so as the shares value of a task group.
320 * (The default weight is 1024 - so there's no practical
321 * limitation from this.)
322 */
323#define MIN_SHARES (1UL << 1)
324#define MAX_SHARES (1UL << 18)
325
326static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
327#endif
328
329/* Default task group.
330 * Every task in system belong to this group at bootup.
331 */
332struct task_group root_task_group;
333
334#endif /* CONFIG_CGROUP_SCHED */
335
336/* CFS-related fields in a runqueue */
337struct cfs_rq {
338 struct load_weight load;
339 unsigned long nr_running, h_nr_running;
340
341 u64 exec_clock;
342 u64 min_vruntime;
343#ifndef CONFIG_64BIT
344 u64 min_vruntime_copy;
345#endif
346
347 struct rb_root tasks_timeline;
348 struct rb_node *rb_leftmost;
349
350 struct list_head tasks;
351 struct list_head *balance_iterator;
352
353 /*
354 * 'curr' points to currently running entity on this cfs_rq.
355 * It is set to NULL otherwise (i.e when none are currently running).
356 */
357 struct sched_entity *curr, *next, *last, *skip;
358
359#ifdef CONFIG_SCHED_DEBUG
360 unsigned int nr_spread_over;
361#endif
362
363#ifdef CONFIG_FAIR_GROUP_SCHED
364 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
365
366 /*
367 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
368 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
369 * (like users, containers etc.)
370 *
371 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
372 * list is used during load balance.
373 */
374 int on_list;
375 struct list_head leaf_cfs_rq_list;
376 struct task_group *tg; /* group that "owns" this runqueue */
377
378#ifdef CONFIG_SMP
379 /*
380 * the part of load.weight contributed by tasks
381 */
382 unsigned long task_weight;
383
384 /*
385 * h_load = weight * f(tg)
386 *
387 * Where f(tg) is the recursive weight fraction assigned to
388 * this group.
389 */
390 unsigned long h_load;
391
392 /*
393 * Maintaining per-cpu shares distribution for group scheduling
394 *
395 * load_stamp is the last time we updated the load average
396 * load_last is the last time we updated the load average and saw load
397 * load_unacc_exec_time is currently unaccounted execution time
398 */
399 u64 load_avg;
400 u64 load_period;
401 u64 load_stamp, load_last, load_unacc_exec_time;
402
403 unsigned long load_contribution;
404#endif
405#ifdef CONFIG_CFS_BANDWIDTH
406 int runtime_enabled;
407 u64 runtime_expires;
408 s64 runtime_remaining;
409
410 u64 throttled_timestamp;
411 int throttled, throttle_count;
412 struct list_head throttled_list;
413#endif
414#endif
415};
416
417#ifdef CONFIG_FAIR_GROUP_SCHED
418#ifdef CONFIG_CFS_BANDWIDTH
419static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
420{
421 return &tg->cfs_bandwidth;
422}
423
424static inline u64 default_cfs_period(void);
425static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
426static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
427
428static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
429{
430 struct cfs_bandwidth *cfs_b =
431 container_of(timer, struct cfs_bandwidth, slack_timer);
432 do_sched_cfs_slack_timer(cfs_b);
433
434 return HRTIMER_NORESTART;
435}
436
437static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
438{
439 struct cfs_bandwidth *cfs_b =
440 container_of(timer, struct cfs_bandwidth, period_timer);
441 ktime_t now;
442 int overrun;
443 int idle = 0;
444
445 for (;;) {
446 now = hrtimer_cb_get_time(timer);
447 overrun = hrtimer_forward(timer, now, cfs_b->period);
448
449 if (!overrun)
450 break;
451
452 idle = do_sched_cfs_period_timer(cfs_b, overrun);
453 }
454
455 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
456}
457
458static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
459{
460 raw_spin_lock_init(&cfs_b->lock);
461 cfs_b->runtime = 0;
462 cfs_b->quota = RUNTIME_INF;
463 cfs_b->period = ns_to_ktime(default_cfs_period());
464
465 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
466 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->period_timer.function = sched_cfs_period_timer;
468 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
469 cfs_b->slack_timer.function = sched_cfs_slack_timer;
470}
471
472static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
473{
474 cfs_rq->runtime_enabled = 0;
475 INIT_LIST_HEAD(&cfs_rq->throttled_list);
476}
477
478/* requires cfs_b->lock, may release to reprogram timer */
479static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
480{
481 /*
482 * The timer may be active because we're trying to set a new bandwidth
483 * period or because we're racing with the tear-down path
484 * (timer_active==0 becomes visible before the hrtimer call-back
485 * terminates). In either case we ensure that it's re-programmed
486 */
487 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
488 raw_spin_unlock(&cfs_b->lock);
489 /* ensure cfs_b->lock is available while we wait */
490 hrtimer_cancel(&cfs_b->period_timer);
491
492 raw_spin_lock(&cfs_b->lock);
493 /* if someone else restarted the timer then we're done */
494 if (cfs_b->timer_active)
495 return;
496 }
497
498 cfs_b->timer_active = 1;
499 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
500}
501
502static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
503{
504 hrtimer_cancel(&cfs_b->period_timer);
505 hrtimer_cancel(&cfs_b->slack_timer);
506}
507
508#ifdef HAVE_JUMP_LABEL
509static struct jump_label_key __cfs_bandwidth_used;
510
511static inline bool cfs_bandwidth_used(void)
512{
513 return static_branch(&__cfs_bandwidth_used);
514}
515
516static void account_cfs_bandwidth_used(int enabled, int was_enabled)
517{
518 /* only need to count groups transitioning between enabled/!enabled */
519 if (enabled && !was_enabled)
520 jump_label_inc(&__cfs_bandwidth_used);
521 else if (!enabled && was_enabled)
522 jump_label_dec(&__cfs_bandwidth_used);
523}
524#else /* !HAVE_JUMP_LABEL */
525/* static_branch doesn't help unless supported */
526static int cfs_bandwidth_used(void)
527{
528 return 1;
529}
530static void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
531#endif /* HAVE_JUMP_LABEL */
532#else /* !CONFIG_CFS_BANDWIDTH */
533static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
534static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
535static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
536
537static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
538{
539 return NULL;
540}
541#endif /* CONFIG_CFS_BANDWIDTH */
542#endif /* CONFIG_FAIR_GROUP_SCHED */
543
544/* Real-Time classes' related field in a runqueue: */
545struct rt_rq {
546 struct rt_prio_array active;
547 unsigned long rt_nr_running;
548#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
549 struct {
550 int curr; /* highest queued rt task prio */
551#ifdef CONFIG_SMP
552 int next; /* next highest */
553#endif
554 } highest_prio;
555#endif
556#ifdef CONFIG_SMP
557 unsigned long rt_nr_migratory;
558 unsigned long rt_nr_total;
559 int overloaded;
560 struct plist_head pushable_tasks;
561#endif
562 int rt_throttled;
563 u64 rt_time;
564 u64 rt_runtime;
565 /* Nests inside the rq lock: */
566 raw_spinlock_t rt_runtime_lock;
567
568#ifdef CONFIG_RT_GROUP_SCHED
569 unsigned long rt_nr_boosted;
570
571 struct rq *rq;
572 struct list_head leaf_rt_rq_list;
573 struct task_group *tg;
574#endif
575};
576
577#ifdef CONFIG_SMP
578
579/*
580 * We add the notion of a root-domain which will be used to define per-domain
581 * variables. Each exclusive cpuset essentially defines an island domain by
582 * fully partitioning the member cpus from any other cpuset. Whenever a new
583 * exclusive cpuset is created, we also create and attach a new root-domain
584 * object.
585 *
586 */
587struct root_domain {
588 atomic_t refcount;
589 atomic_t rto_count;
590 struct rcu_head rcu;
591 cpumask_var_t span;
592 cpumask_var_t online;
593
594 /*
595 * The "RT overload" flag: it gets set if a CPU has more than
596 * one runnable RT task.
597 */
598 cpumask_var_t rto_mask;
599 struct cpupri cpupri;
600};
601
602/*
603 * By default the system creates a single root-domain with all cpus as
604 * members (mimicking the global state we have today).
605 */
606static struct root_domain def_root_domain;
607
608#endif /* CONFIG_SMP */
609
610/*
611 * This is the main, per-CPU runqueue data structure.
612 *
613 * Locking rule: those places that want to lock multiple runqueues
614 * (such as the load balancing or the thread migration code), lock
615 * acquire operations must be ordered by ascending &runqueue.
616 */
617struct rq {
618 /* runqueue lock: */
619 raw_spinlock_t lock;
620
621 /*
622 * nr_running and cpu_load should be in the same cacheline because
623 * remote CPUs use both these fields when doing load calculation.
624 */
625 unsigned long nr_running;
626 #define CPU_LOAD_IDX_MAX 5
627 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
628 unsigned long last_load_update_tick;
629#ifdef CONFIG_NO_HZ
630 u64 nohz_stamp;
631 unsigned char nohz_balance_kick;
632#endif
633 int skip_clock_update;
634
635 /* capture load from *all* tasks on this cpu: */
636 struct load_weight load;
637 unsigned long nr_load_updates;
638 u64 nr_switches;
639
640 struct cfs_rq cfs;
641 struct rt_rq rt;
642
643#ifdef CONFIG_FAIR_GROUP_SCHED
644 /* list of leaf cfs_rq on this cpu: */
645 struct list_head leaf_cfs_rq_list;
646#endif
647#ifdef CONFIG_RT_GROUP_SCHED
648 struct list_head leaf_rt_rq_list;
649#endif
650
651 /*
652 * This is part of a global counter where only the total sum
653 * over all CPUs matters. A task can increase this counter on
654 * one CPU and if it got migrated afterwards it may decrease
655 * it on another CPU. Always updated under the runqueue lock:
656 */
657 unsigned long nr_uninterruptible;
658
659 struct task_struct *curr, *idle, *stop;
660 unsigned long next_balance;
661 struct mm_struct *prev_mm;
662
663 u64 clock;
664 u64 clock_task;
665
666 atomic_t nr_iowait;
667
668#ifdef CONFIG_SMP
669 struct root_domain *rd;
670 struct sched_domain *sd;
671
672 unsigned long cpu_power;
673
674 unsigned char idle_balance;
675 /* For active balancing */
676 int post_schedule;
677 int active_balance;
678 int push_cpu;
679 struct cpu_stop_work active_balance_work;
680 /* cpu of this runqueue: */
681 int cpu;
682 int online;
683
684 u64 rt_avg;
685 u64 age_stamp;
686 u64 idle_stamp;
687 u64 avg_idle;
688#endif
689
690#ifdef CONFIG_IRQ_TIME_ACCOUNTING
691 u64 prev_irq_time;
692#endif
693#ifdef CONFIG_PARAVIRT
694 u64 prev_steal_time;
695#endif
696#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
697 u64 prev_steal_time_rq;
698#endif
699
700 /* calc_load related fields */
701 unsigned long calc_load_update;
702 long calc_load_active;
703
704#ifdef CONFIG_SCHED_HRTICK
705#ifdef CONFIG_SMP
706 int hrtick_csd_pending;
707 struct call_single_data hrtick_csd;
708#endif
709 struct hrtimer hrtick_timer;
710#endif
711
712#ifdef CONFIG_SCHEDSTATS
713 /* latency stats */
714 struct sched_info rq_sched_info;
715 unsigned long long rq_cpu_time;
716 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
717
718 /* sys_sched_yield() stats */
719 unsigned int yld_count;
720
721 /* schedule() stats */
722 unsigned int sched_switch;
723 unsigned int sched_count;
724 unsigned int sched_goidle;
725
726 /* try_to_wake_up() stats */
727 unsigned int ttwu_count;
728 unsigned int ttwu_local;
729#endif
730
731#ifdef CONFIG_SMP
732 struct llist_head wake_list;
733#endif
734};
735
736static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
737
738
739static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
740
741static inline int cpu_of(struct rq *rq)
742{
743#ifdef CONFIG_SMP
744 return rq->cpu;
745#else
746 return 0;
747#endif
748}
749
750#define rcu_dereference_check_sched_domain(p) \
751 rcu_dereference_check((p), \
752 lockdep_is_held(&sched_domains_mutex))
753
754/*
755 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
756 * See detach_destroy_domains: synchronize_sched for details.
757 *
758 * The domain tree of any CPU may only be accessed from within
759 * preempt-disabled sections.
760 */
761#define for_each_domain(cpu, __sd) \
762 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
763
764#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
765#define this_rq() (&__get_cpu_var(runqueues))
766#define task_rq(p) cpu_rq(task_cpu(p))
767#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
768#define raw_rq() (&__raw_get_cpu_var(runqueues))
769
770#ifdef CONFIG_CGROUP_SCHED
771
772/*
773 * Return the group to which this tasks belongs.
774 *
775 * We use task_subsys_state_check() and extend the RCU verification with
776 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
777 * task it moves into the cgroup. Therefore by holding either of those locks,
778 * we pin the task to the current cgroup.
779 */
780static inline struct task_group *task_group(struct task_struct *p)
781{
782 struct task_group *tg;
783 struct cgroup_subsys_state *css;
784
785 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
786 lockdep_is_held(&p->pi_lock) ||
787 lockdep_is_held(&task_rq(p)->lock));
788 tg = container_of(css, struct task_group, css);
789
790 return autogroup_task_group(p, tg);
791}
792
793/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
794static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
795{
796#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
797 struct task_group *tg = task_group(p);
798#endif
799
800#ifdef CONFIG_FAIR_GROUP_SCHED
801 p->se.cfs_rq = tg->cfs_rq[cpu];
802 p->se.parent = tg->se[cpu];
803#endif
804
805#ifdef CONFIG_RT_GROUP_SCHED
806 p->rt.rt_rq = tg->rt_rq[cpu];
807 p->rt.parent = tg->rt_se[cpu];
808#endif
809}
810
811#else /* CONFIG_CGROUP_SCHED */
812
813static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
814static inline struct task_group *task_group(struct task_struct *p)
815{
816 return NULL;
817}
818
819#endif /* CONFIG_CGROUP_SCHED */
820 109
821static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
822 111
-static void update_rq_clock(struct rq *rq)
+void update_rq_clock(struct rq *rq)
824{ 113{
825 s64 delta; 114 s64 delta;
826 115
@@ -833,40 +122,10 @@ static void update_rq_clock(struct rq *rq)
833} 122}
834 123
835/* 124/*
836 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
837 */
838#ifdef CONFIG_SCHED_DEBUG
839# define const_debug __read_mostly
840#else
841# define const_debug static const
842#endif
843
844/**
845 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
846 * @cpu: the processor in question.
847 *
848 * This interface allows printk to be called with the runqueue lock
849 * held and know whether or not it is OK to wake up the klogd.
850 */
851int runqueue_is_locked(int cpu)
852{
853 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
854}
855
856/*
857 * Debugging: various feature bits 125 * Debugging: various feature bits
858 */ 126 */
859 127
860#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
861 __SCHED_FEAT_##name ,
862
863enum {
864#include "sched_features.h"
865};
866
867#undef SCHED_FEAT
868
869#define SCHED_FEAT(name, enabled) \
870 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
871 130
872const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
@@ -965,8 +224,6 @@ late_initcall(sched_init_debug);
965 224
966#endif 225#endif
967 226
968#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
969
970/* 227/*
971 * Number of tasks to iterate in a single balance run. 228 * Number of tasks to iterate in a single balance run.
972 * Limited because this is done with IRQs disabled. 229 * Limited because this is done with IRQs disabled.
@@ -987,7 +244,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
987 */ 244 */
988unsigned int sysctl_sched_rt_period = 1000000; 245unsigned int sysctl_sched_rt_period = 1000000;
989 246
-static __read_mostly int scheduler_running;
+__read_mostly int scheduler_running;
991 248
992/* 249/*
993 * part of the period that we allow rt tasks to run in us. 250 * part of the period that we allow rt tasks to run in us.
@@ -995,112 +252,7 @@ static __read_mostly int scheduler_running;
995 */ 252 */
996int sysctl_sched_rt_runtime = 950000; 253int sysctl_sched_rt_runtime = 950000;
997 254
998static inline u64 global_rt_period(void)
999{
1000 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
1001}
1002 255
1003static inline u64 global_rt_runtime(void)
1004{
1005 if (sysctl_sched_rt_runtime < 0)
1006 return RUNTIME_INF;
1007
1008 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
1009}
1010
1011#ifndef prepare_arch_switch
1012# define prepare_arch_switch(next) do { } while (0)
1013#endif
1014#ifndef finish_arch_switch
1015# define finish_arch_switch(prev) do { } while (0)
1016#endif
1017
1018static inline int task_current(struct rq *rq, struct task_struct *p)
1019{
1020 return rq->curr == p;
1021}
1022
1023static inline int task_running(struct rq *rq, struct task_struct *p)
1024{
1025#ifdef CONFIG_SMP
1026 return p->on_cpu;
1027#else
1028 return task_current(rq, p);
1029#endif
1030}
1031
1032#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1033static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1034{
1035#ifdef CONFIG_SMP
1036 /*
1037 * We can optimise this out completely for !SMP, because the
1038 * SMP rebalancing from interrupt is the only thing that cares
1039 * here.
1040 */
1041 next->on_cpu = 1;
1042#endif
1043}
1044
1045static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1046{
1047#ifdef CONFIG_SMP
1048 /*
1049 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1050 * We must ensure this doesn't happen until the switch is completely
1051 * finished.
1052 */
1053 smp_wmb();
1054 prev->on_cpu = 0;
1055#endif
1056#ifdef CONFIG_DEBUG_SPINLOCK
1057 /* this is a valid case when another task releases the spinlock */
1058 rq->lock.owner = current;
1059#endif
1060 /*
1061 * If we are tracking spinlock dependencies then we have to
1062 * fix up the runqueue lock - which gets 'carried over' from
1063 * prev into current:
1064 */
1065 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1066
1067 raw_spin_unlock_irq(&rq->lock);
1068}
1069
1070#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1071static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1072{
1073#ifdef CONFIG_SMP
1074 /*
1075 * We can optimise this out completely for !SMP, because the
1076 * SMP rebalancing from interrupt is the only thing that cares
1077 * here.
1078 */
1079 next->on_cpu = 1;
1080#endif
1081#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1082 raw_spin_unlock_irq(&rq->lock);
1083#else
1084 raw_spin_unlock(&rq->lock);
1085#endif
1086}
1087
1088static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1089{
1090#ifdef CONFIG_SMP
1091 /*
1092 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1093 * We must ensure this doesn't happen until the switch is completely
1094 * finished.
1095 */
1096 smp_wmb();
1097 prev->on_cpu = 0;
1098#endif
1099#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1100 local_irq_enable();
1101#endif
1102}
1103#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1104 256
1105/* 257/*
1106 * __task_rq_lock - lock the rq @p resides on. 258 * __task_rq_lock - lock the rq @p resides on.
@@ -1183,20 +335,6 @@ static struct rq *this_rq_lock(void)
1183 * rq->lock. 335 * rq->lock.
1184 */ 336 */
1185 337
1186/*
1187 * Use hrtick when:
1188 * - enabled by features
1189 * - hrtimer is actually high res
1190 */
1191static inline int hrtick_enabled(struct rq *rq)
1192{
1193 if (!sched_feat(HRTICK))
1194 return 0;
1195 if (!cpu_active(cpu_of(rq)))
1196 return 0;
1197 return hrtimer_is_hres_active(&rq->hrtick_timer);
1198}
1199
1200static void hrtick_clear(struct rq *rq) 338static void hrtick_clear(struct rq *rq)
1201{ 339{
1202 if (hrtimer_active(&rq->hrtick_timer)) 340 if (hrtimer_active(&rq->hrtick_timer))
@@ -1240,7 +378,7 @@ static void __hrtick_start(void *arg)
1240 * 378 *
1241 * called with rq->lock held and irqs disabled 379 * called with rq->lock held and irqs disabled
1242 */ 380 */
-static void hrtick_start(struct rq *rq, u64 delay)
+void hrtick_start(struct rq *rq, u64 delay)
1244{ 382{
1245 struct hrtimer *timer = &rq->hrtick_timer; 383 struct hrtimer *timer = &rq->hrtick_timer;
1246 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 384 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1284,7 +422,7 @@ static __init void init_hrtick(void)
1284 * 422 *
1285 * called with rq->lock held and irqs disabled 423 * called with rq->lock held and irqs disabled
1286 */ 424 */
-static void hrtick_start(struct rq *rq, u64 delay)
+void hrtick_start(struct rq *rq, u64 delay)
1288{ 426{
1289 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 427 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1290 HRTIMER_MODE_REL_PINNED, 0); 428 HRTIMER_MODE_REL_PINNED, 0);
@@ -1335,7 +473,7 @@ static inline void init_hrtick(void)
1335#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 473#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1336#endif 474#endif
1337 475
-static void resched_task(struct task_struct *p)
+void resched_task(struct task_struct *p)
1339{ 477{
1340 int cpu; 478 int cpu;
1341 479
@@ -1356,7 +494,7 @@ static void resched_task(struct task_struct *p)
1356 smp_send_reschedule(cpu); 494 smp_send_reschedule(cpu);
1357} 495}
1358 496
-static void resched_cpu(int cpu)
+void resched_cpu(int cpu)
1360{ 498{
1361 struct rq *rq = cpu_rq(cpu); 499 struct rq *rq = cpu_rq(cpu);
1362 unsigned long flags; 500 unsigned long flags;
@@ -1449,12 +587,7 @@ static inline bool got_nohz_idle_kick(void)
1449 587
1450#endif /* CONFIG_NO_HZ */ 588#endif /* CONFIG_NO_HZ */
1451 589
-static u64 sched_avg_period(void)
-{
-	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
-static void sched_avg_update(struct rq *rq)
+void sched_avg_update(struct rq *rq)
 {
1459 s64 period = sched_avg_period(); 592 s64 period = sched_avg_period();
1460 593
@@ -1470,193 +603,23 @@ static void sched_avg_update(struct rq *rq)
1470 } 603 }
1471} 604}
1472 605
1473static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1474{
1475 rq->rt_avg += rt_delta;
1476 sched_avg_update(rq);
1477}
1478
1479#else /* !CONFIG_SMP */ 606#else /* !CONFIG_SMP */
-static void resched_task(struct task_struct *p)
+void resched_task(struct task_struct *p)
1481{ 608{
1482 assert_raw_spin_locked(&task_rq(p)->lock); 609 assert_raw_spin_locked(&task_rq(p)->lock);
1483 set_tsk_need_resched(p); 610 set_tsk_need_resched(p);
1484} 611}
1485
1486static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1487{
1488}
1489
1490static void sched_avg_update(struct rq *rq)
1491{
1492}
1493#endif /* CONFIG_SMP */ 612#endif /* CONFIG_SMP */
1494 613
1495#if BITS_PER_LONG == 32
1496# define WMULT_CONST (~0UL)
1497#else
1498# define WMULT_CONST (1UL << 32)
1499#endif
1500
1501#define WMULT_SHIFT 32
1502
1503/*
1504 * Shift right and round:
1505 */
1506#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1507
1508/*
1509 * delta *= weight / lw
1510 */
1511static unsigned long
1512calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1513 struct load_weight *lw)
1514{
1515 u64 tmp;
1516
1517 /*
1518 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1519 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1520 * 2^SCHED_LOAD_RESOLUTION.
1521 */
1522 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1523 tmp = (u64)delta_exec * scale_load_down(weight);
1524 else
1525 tmp = (u64)delta_exec;
1526
1527 if (!lw->inv_weight) {
1528 unsigned long w = scale_load_down(lw->weight);
1529
1530 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1531 lw->inv_weight = 1;
1532 else if (unlikely(!w))
1533 lw->inv_weight = WMULT_CONST;
1534 else
1535 lw->inv_weight = WMULT_CONST / w;
1536 }
1537
1538 /*
1539 * Check whether we'd overflow the 64-bit multiplication:
1540 */
1541 if (unlikely(tmp > WMULT_CONST))
1542 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1543 WMULT_SHIFT/2);
1544 else
1545 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1546
1547 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1548}
1549
1550static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1551{
1552 lw->weight += inc;
1553 lw->inv_weight = 0;
1554}
1555
1556static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1557{
1558 lw->weight -= dec;
1559 lw->inv_weight = 0;
1560}
1561
1562static inline void update_load_set(struct load_weight *lw, unsigned long w)
1563{
1564 lw->weight = w;
1565 lw->inv_weight = 0;
1566}
1567
1568/*
1569 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1570 * of tasks with abnormal "nice" values across CPUs the contribution that
1571 * each task makes to its run queue's load is weighted according to its
1572 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1573 * scaled version of the new time slice allocation that they receive on time
1574 * slice expiry etc.
1575 */
1576
1577#define WEIGHT_IDLEPRIO 3
1578#define WMULT_IDLEPRIO 1431655765
1579
1580/*
1581 * Nice levels are multiplicative, with a gentle 10% change for every
1582 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1583 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1584 * that remained on nice 0.
1585 *
1586 * The "10% effect" is relative and cumulative: from _any_ nice level,
1587 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1588 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1589 * If a task goes up by ~10% and another task goes down by ~10% then
1590 * the relative distance between them is ~25%.)
1591 */
1592static const int prio_to_weight[40] = {
1593 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1594 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1595 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1596 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1597 /* 0 */ 1024, 820, 655, 526, 423,
1598 /* 5 */ 335, 272, 215, 172, 137,
1599 /* 10 */ 110, 87, 70, 56, 45,
1600 /* 15 */ 36, 29, 23, 18, 15,
1601};
1602
1603/*
1604 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1605 *
1606 * In cases where the weight does not change often, we can use the
1607 * precalculated inverse to speed up arithmetics by turning divisions
1608 * into multiplications:
1609 */
1610static const u32 prio_to_wmult[40] = {
1611 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1612 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1613 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1614 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1615 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1616 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1617 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1618 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1619};
1620
1621/* Time spent by the tasks of the cpu accounting group executing in ... */
1622enum cpuacct_stat_index {
1623 CPUACCT_STAT_USER, /* ... user mode */
1624 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1625
1626 CPUACCT_STAT_NSTATS,
1627};
1628
1629#ifdef CONFIG_CGROUP_CPUACCT
1630static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1631static void cpuacct_update_stats(struct task_struct *tsk,
1632 enum cpuacct_stat_index idx, cputime_t val);
1633#else
1634static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1635static inline void cpuacct_update_stats(struct task_struct *tsk,
1636 enum cpuacct_stat_index idx, cputime_t val) {}
1637#endif
1638
1639static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1640{
1641 update_load_add(&rq->load, load);
1642}
1643
1644static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1645{
1646 update_load_sub(&rq->load, load);
1647}
1648
1649#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 614#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1650 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 615 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1651typedef int (*tg_visitor)(struct task_group *, void *);
1652
1653/* 616/*
1654 * Iterate task_group tree rooted at *from, calling @down when first entering a 617 * Iterate task_group tree rooted at *from, calling @down when first entering a
1655 * node and @up when leaving it for the final time. 618 * node and @up when leaving it for the final time.
1656 * 619 *
1657 * Caller must hold rcu_lock or sufficient equivalent. 620 * Caller must hold rcu_lock or sufficient equivalent.
1658 */ 621 */
-static int walk_tg_tree_from(struct task_group *from,
+int walk_tg_tree_from(struct task_group *from,
1660 tg_visitor down, tg_visitor up, void *data) 623 tg_visitor down, tg_visitor up, void *data)
1661{ 624{
1662 struct task_group *parent, *child; 625 struct task_group *parent, *child;
@@ -1687,270 +650,13 @@ out:
1687 return ret; 650 return ret;
1688} 651}
1689 652
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- *
- * Caller must hold rcu_lock or sufficient equivalent.
- */
-
-static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
-{
-	return walk_tg_tree_from(&root_task_group, down, up, data);
-}
-
-static int tg_nop(struct task_group *tg, void *data)
+int tg_nop(struct task_group *tg, void *data)
 {
1704 return 0; 655 return 0;
1705} 656}
1706#endif 657#endif
1707 658
-#ifdef CONFIG_SMP
+void update_cpu_load(struct rq *this_rq);
1709/* Used instead of source_load when we know the type == 0 */
1710static unsigned long weighted_cpuload(const int cpu)
1711{
1712 return cpu_rq(cpu)->load.weight;
1713}
1714
1715/*
1716 * Return a low guess at the load of a migration-source cpu weighted
1717 * according to the scheduling class and "nice" value.
1718 *
1719 * We want to under-estimate the load of migration sources, to
1720 * balance conservatively.
1721 */
1722static unsigned long source_load(int cpu, int type)
1723{
1724 struct rq *rq = cpu_rq(cpu);
1725 unsigned long total = weighted_cpuload(cpu);
1726
1727 if (type == 0 || !sched_feat(LB_BIAS))
1728 return total;
1729
1730 return min(rq->cpu_load[type-1], total);
1731}
1732
1733/*
1734 * Return a high guess at the load of a migration-target cpu weighted
1735 * according to the scheduling class and "nice" value.
1736 */
1737static unsigned long target_load(int cpu, int type)
1738{
1739 struct rq *rq = cpu_rq(cpu);
1740 unsigned long total = weighted_cpuload(cpu);
1741
1742 if (type == 0 || !sched_feat(LB_BIAS))
1743 return total;
1744
1745 return max(rq->cpu_load[type-1], total);
1746}
1747
1748static unsigned long power_of(int cpu)
1749{
1750 return cpu_rq(cpu)->cpu_power;
1751}
1752
1753static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1754
1755static unsigned long cpu_avg_load_per_task(int cpu)
1756{
1757 struct rq *rq = cpu_rq(cpu);
1758 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1759
1760 if (nr_running)
1761 return rq->load.weight / nr_running;
1762
1763 return 0;
1764}
1765
1766#ifdef CONFIG_PREEMPT
1767
1768static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1769
1770/*
1771 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1772 * way at the expense of forcing extra atomic operations in all
1773 * invocations. This assures that the double_lock is acquired using the
1774 * same underlying policy as the spinlock_t on this architecture, which
1775 * reduces latency compared to the unfair variant below. However, it
1776 * also adds more overhead and therefore may reduce throughput.
1777 */
1778static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1779 __releases(this_rq->lock)
1780 __acquires(busiest->lock)
1781 __acquires(this_rq->lock)
1782{
1783 raw_spin_unlock(&this_rq->lock);
1784 double_rq_lock(this_rq, busiest);
1785
1786 return 1;
1787}
1788
1789#else
1790/*
1791 * Unfair double_lock_balance: Optimizes throughput at the expense of
1792 * latency by eliminating extra atomic operations when the locks are
1793 * already in proper order on entry. This favors lower cpu-ids and will
1794 * grant the double lock to lower cpus over higher ids under contention,
1795 * regardless of entry order into the function.
1796 */
1797static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1798 __releases(this_rq->lock)
1799 __acquires(busiest->lock)
1800 __acquires(this_rq->lock)
1801{
1802 int ret = 0;
1803
1804 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1805 if (busiest < this_rq) {
1806 raw_spin_unlock(&this_rq->lock);
1807 raw_spin_lock(&busiest->lock);
1808 raw_spin_lock_nested(&this_rq->lock,
1809 SINGLE_DEPTH_NESTING);
1810 ret = 1;
1811 } else
1812 raw_spin_lock_nested(&busiest->lock,
1813 SINGLE_DEPTH_NESTING);
1814 }
1815 return ret;
1816}
1817
1818#endif /* CONFIG_PREEMPT */
1819
1820/*
1821 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1822 */
1823static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1824{
1825 if (unlikely(!irqs_disabled())) {
1826 /* printk() doesn't work good under rq->lock */
1827 raw_spin_unlock(&this_rq->lock);
1828 BUG_ON(1);
1829 }
1830
1831 return _double_lock_balance(this_rq, busiest);
1832}
1833
1834static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1835 __releases(busiest->lock)
1836{
1837 raw_spin_unlock(&busiest->lock);
1838 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1839}
1840
1841/*
1842 * double_rq_lock - safely lock two runqueues
1843 *
1844 * Note this does not disable interrupts like task_rq_lock,
1845 * you need to do so manually before calling.
1846 */
1847static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1848 __acquires(rq1->lock)
1849 __acquires(rq2->lock)
1850{
1851 BUG_ON(!irqs_disabled());
1852 if (rq1 == rq2) {
1853 raw_spin_lock(&rq1->lock);
1854 __acquire(rq2->lock); /* Fake it out ;) */
1855 } else {
1856 if (rq1 < rq2) {
1857 raw_spin_lock(&rq1->lock);
1858 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1859 } else {
1860 raw_spin_lock(&rq2->lock);
1861 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1862 }
1863 }
1864}
1865
1866/*
1867 * double_rq_unlock - safely unlock two runqueues
1868 *
1869 * Note this does not restore interrupts like task_rq_unlock,
1870 * you need to do so manually after calling.
1871 */
1872static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1873 __releases(rq1->lock)
1874 __releases(rq2->lock)
1875{
1876 raw_spin_unlock(&rq1->lock);
1877 if (rq1 != rq2)
1878 raw_spin_unlock(&rq2->lock);
1879 else
1880 __release(rq2->lock);
1881}
1882
1883#else /* CONFIG_SMP */
1884
1885/*
1886 * double_rq_lock - safely lock two runqueues
1887 *
1888 * Note this does not disable interrupts like task_rq_lock,
1889 * you need to do so manually before calling.
1890 */
1891static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1892 __acquires(rq1->lock)
1893 __acquires(rq2->lock)
1894{
1895 BUG_ON(!irqs_disabled());
1896 BUG_ON(rq1 != rq2);
1897 raw_spin_lock(&rq1->lock);
1898 __acquire(rq2->lock); /* Fake it out ;) */
1899}
1900
1901/*
1902 * double_rq_unlock - safely unlock two runqueues
1903 *
1904 * Note this does not restore interrupts like task_rq_unlock,
1905 * you need to do so manually after calling.
1906 */
1907static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1908 __releases(rq1->lock)
1909 __releases(rq2->lock)
1910{
1911 BUG_ON(rq1 != rq2);
1912 raw_spin_unlock(&rq1->lock);
1913 __release(rq2->lock);
1914}
1915
1916#endif
1917
1918static void calc_load_account_idle(struct rq *this_rq);
1919static void update_sysctl(void);
1920static int get_update_sysctl_factor(void);
1921static void update_cpu_load(struct rq *this_rq);
1922
1923static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1924{
1925 set_task_rq(p, cpu);
1926#ifdef CONFIG_SMP
1927 /*
1928 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1929 * successfully executed on another CPU. We must ensure that updates of
1930 * per-task data have been completed by this moment.
1931 */
1932 smp_wmb();
1933 task_thread_info(p)->cpu = cpu;
1934#endif
1935}
1936
1937static const struct sched_class rt_sched_class;
1938
1939#define sched_class_highest (&stop_sched_class)
1940#define for_each_class(class) \
1941 for (class = sched_class_highest; class; class = class->next)
1942
1943#include "sched_stats.h"
1944
1945static void inc_nr_running(struct rq *rq)
1946{
1947 rq->nr_running++;
1948}
1949
1950static void dec_nr_running(struct rq *rq)
1951{
1952 rq->nr_running--;
1953}
1954 660
1955static void set_load_weight(struct task_struct *p) 661static void set_load_weight(struct task_struct *p)
1956{ 662{
@@ -1987,7 +693,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1987/* 693/*
1988 * activate_task - move a task to the runqueue. 694 * activate_task - move a task to the runqueue.
1989 */ 695 */
-static void activate_task(struct rq *rq, struct task_struct *p, int flags)
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
1991{ 697{
1992 if (task_contributes_to_load(p)) 698 if (task_contributes_to_load(p))
1993 rq->nr_uninterruptible--; 699 rq->nr_uninterruptible--;
@@ -1998,7 +704,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1998/* 704/*
1999 * deactivate_task - remove a task from the runqueue. 705 * deactivate_task - remove a task from the runqueue.
2000 */ 706 */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
2002{ 708{
2003 if (task_contributes_to_load(p)) 709 if (task_contributes_to_load(p))
2004 rq->nr_uninterruptible++; 710 rq->nr_uninterruptible++;
@@ -2223,15 +929,6 @@ static int irqtime_account_si_update(void)
2223 929
2224#endif 930#endif
2225 931
2226#include "sched_idletask.c"
2227#include "sched_fair.c"
2228#include "sched_rt.c"
2229#include "sched_autogroup.c"
2230#include "sched_stoptask.c"
2231#ifdef CONFIG_SCHED_DEBUG
2232# include "sched_debug.c"
2233#endif
2234
2235void sched_set_stop_task(int cpu, struct task_struct *stop) 932void sched_set_stop_task(int cpu, struct task_struct *stop)
2236{ 933{
2237 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 934 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2329,7 +1026,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2329 p->sched_class->prio_changed(rq, p, oldprio); 1026 p->sched_class->prio_changed(rq, p, oldprio);
2330} 1027}
2331 1028
-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2333{ 1030{
2334 const struct sched_class *class; 1031 const struct sched_class *class;
2335 1032
@@ -2355,38 +1052,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2355} 1052}
2356 1053
2357#ifdef CONFIG_SMP 1054#ifdef CONFIG_SMP
2358/*
2359 * Is this task likely cache-hot:
2360 */
2361static int
2362task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2363{
2364 s64 delta;
2365
2366 if (p->sched_class != &fair_sched_class)
2367 return 0;
2368
2369 if (unlikely(p->policy == SCHED_IDLE))
2370 return 0;
2371
2372 /*
2373 * Buddy candidates are cache hot:
2374 */
2375 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2376 (&p->se == cfs_rq_of(&p->se)->next ||
2377 &p->se == cfs_rq_of(&p->se)->last))
2378 return 1;
2379
2380 if (sysctl_sched_migration_cost == -1)
2381 return 1;
2382 if (sysctl_sched_migration_cost == 0)
2383 return 0;
2384
2385 delta = now - p->se.exec_start;
2386
2387 return delta < (s64)sysctl_sched_migration_cost;
2388}
2389
2390void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1055void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2391{ 1056{
2392#ifdef CONFIG_SCHED_DEBUG 1057#ifdef CONFIG_SCHED_DEBUG
@@ -3469,7 +2134,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3469 */ 2134 */
3470static atomic_long_t calc_load_tasks_idle; 2135static atomic_long_t calc_load_tasks_idle;
3471 2136
-static void calc_load_account_idle(struct rq *this_rq)
+void calc_load_account_idle(struct rq *this_rq)
3473{ 2138{
3474 long delta; 2139 long delta;
3475 2140
@@ -3613,7 +2278,7 @@ static void calc_global_nohz(unsigned long ticks)
3613 */ 2278 */
3614} 2279}
3615#else 2280#else
-static void calc_load_account_idle(struct rq *this_rq)
+void calc_load_account_idle(struct rq *this_rq)
3617{ 2282{
3618} 2283}
3619 2284
@@ -3756,7 +2421,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3756 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2421 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3757 * every tick. We fix it up based on jiffies. 2422 * every tick. We fix it up based on jiffies.
3758 */ 2423 */
-static void update_cpu_load(struct rq *this_rq)
+void update_cpu_load(struct rq *this_rq)
3760{ 2425{
3761 unsigned long this_load = this_rq->load.weight; 2426 unsigned long this_load = this_rq->load.weight;
3762 unsigned long curr_jiffies = jiffies; 2427 unsigned long curr_jiffies = jiffies;
@@ -6148,53 +4813,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6148#endif 4813#endif
6149} 4814}
6150 4815
6151/*
6152 * Increase the granularity value when there are more CPUs,
6153 * because with more CPUs the 'effective latency' as visible
6154 * to users decreases. But the relationship is not linear,
6155 * so pick a second-best guess by going with the log2 of the
6156 * number of CPUs.
6157 *
6158 * This idea comes from the SD scheduler of Con Kolivas:
6159 */
6160static int get_update_sysctl_factor(void)
6161{
6162 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6163 unsigned int factor;
6164
6165 switch (sysctl_sched_tunable_scaling) {
6166 case SCHED_TUNABLESCALING_NONE:
6167 factor = 1;
6168 break;
6169 case SCHED_TUNABLESCALING_LINEAR:
6170 factor = cpus;
6171 break;
6172 case SCHED_TUNABLESCALING_LOG:
6173 default:
6174 factor = 1 + ilog2(cpus);
6175 break;
6176 }
6177
6178 return factor;
6179}
6180
6181static void update_sysctl(void)
6182{
6183 unsigned int factor = get_update_sysctl_factor();
6184
6185#define SET_SYSCTL(name) \
6186 (sysctl_##name = (factor) * normalized_sysctl_##name)
6187 SET_SYSCTL(sched_min_granularity);
6188 SET_SYSCTL(sched_latency);
6189 SET_SYSCTL(sched_wakeup_granularity);
6190#undef SET_SYSCTL
6191}
6192
6193static inline void sched_init_granularity(void)
6194{
6195 update_sysctl();
6196}
6197
6198#ifdef CONFIG_SMP 4816#ifdef CONFIG_SMP
6199void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4817void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6200{ 4818{
@@ -6381,30 +4999,6 @@ static void calc_global_load_remove(struct rq *rq)
6381 rq->calc_load_active = 0; 4999 rq->calc_load_active = 0;
6382} 5000}
6383 5001
6384#ifdef CONFIG_CFS_BANDWIDTH
6385static void unthrottle_offline_cfs_rqs(struct rq *rq)
6386{
6387 struct cfs_rq *cfs_rq;
6388
6389 for_each_leaf_cfs_rq(rq, cfs_rq) {
6390 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6391
6392 if (!cfs_rq->runtime_enabled)
6393 continue;
6394
6395 /*
6396 * clock_task is not advancing so we just need to make sure
6397 * there's some valid quota amount
6398 */
6399 cfs_rq->runtime_remaining = cfs_b->quota;
6400 if (cfs_rq_throttled(cfs_rq))
6401 unthrottle_cfs_rq(cfs_rq);
6402 }
6403}
6404#else
6405static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6406#endif
6407
6408/* 5002/*
6409 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5003 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6410 * try_to_wake_up()->select_task_rq(). 5004 * try_to_wake_up()->select_task_rq().
@@ -7010,6 +5604,12 @@ out:
7010 return -ENOMEM; 5604 return -ENOMEM;
7011} 5605}
7012 5606
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
7013static void init_defrootdomain(void) 5613static void init_defrootdomain(void)
7014{ 5614{
7015 init_rootdomain(&def_root_domain); 5615 init_rootdomain(&def_root_domain);
@@ -7418,6 +6018,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7418 update_group_power(sd, cpu); 6018 update_group_power(sd, cpu);
7419} 6019}
7420 6020
+int __weak arch_sd_sibling_asym_packing(void)
+{
+	return 0*SD_ASYM_PACKING;
+}
+
7421/* 6026/*
7422 * Initializers for schedule domains 6027 * Initializers for schedule domains
7423 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6028 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
@@ -8053,29 +6658,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8053 } 6658 }
8054} 6659}
8055 6660
8056static int update_runtime(struct notifier_block *nfb,
8057 unsigned long action, void *hcpu)
8058{
8059 int cpu = (int)(long)hcpu;
8060
8061 switch (action) {
8062 case CPU_DOWN_PREPARE:
8063 case CPU_DOWN_PREPARE_FROZEN:
8064 disable_runtime(cpu_rq(cpu));
8065 return NOTIFY_OK;
8066
8067 case CPU_DOWN_FAILED:
8068 case CPU_DOWN_FAILED_FROZEN:
8069 case CPU_ONLINE:
8070 case CPU_ONLINE_FROZEN:
8071 enable_runtime(cpu_rq(cpu));
8072 return NOTIFY_OK;
8073
8074 default:
8075 return NOTIFY_DONE;
8076 }
8077}
8078
8079void __init sched_init_smp(void) 6661void __init sched_init_smp(void)
8080{ 6662{
8081 cpumask_var_t non_isolated_cpus; 6663 cpumask_var_t non_isolated_cpus;
@@ -8124,104 +6706,11 @@ int in_sched_functions(unsigned long addr)
8124 && addr < (unsigned long)__sched_text_end); 6706 && addr < (unsigned long)__sched_text_end);
8125} 6707}
8126 6708
-static void init_cfs_rq(struct cfs_rq *cfs_rq)
-{
+#ifdef CONFIG_CGROUP_SCHED
+struct task_group root_task_group;
8129 cfs_rq->tasks_timeline = RB_ROOT;
8130 INIT_LIST_HEAD(&cfs_rq->tasks);
8131 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8132#ifndef CONFIG_64BIT
8133 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8134#endif
8135}
8136
8137static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8138{
8139 struct rt_prio_array *array;
8140 int i;
8141
8142 array = &rt_rq->active;
8143 for (i = 0; i < MAX_RT_PRIO; i++) {
8144 INIT_LIST_HEAD(array->queue + i);
8145 __clear_bit(i, array->bitmap);
8146 }
8147 /* delimiter for bitsearch: */
8148 __set_bit(MAX_RT_PRIO, array->bitmap);
8149
8150#if defined CONFIG_SMP
8151 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8152 rt_rq->highest_prio.next = MAX_RT_PRIO;
8153 rt_rq->rt_nr_migratory = 0;
8154 rt_rq->overloaded = 0;
8155 plist_head_init(&rt_rq->pushable_tasks);
8156#endif
8157
8158 rt_rq->rt_time = 0;
8159 rt_rq->rt_throttled = 0;
8160 rt_rq->rt_runtime = 0;
8161 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8162}
8163
8164#ifdef CONFIG_FAIR_GROUP_SCHED
8165static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8166 struct sched_entity *se, int cpu,
8167 struct sched_entity *parent)
8168{
8169 struct rq *rq = cpu_rq(cpu);
8170
8171 cfs_rq->tg = tg;
8172 cfs_rq->rq = rq;
8173#ifdef CONFIG_SMP
8174 /* allow initial update_cfs_load() to truncate */
8175 cfs_rq->load_stamp = 1;
8176#endif
8177 init_cfs_rq_runtime(cfs_rq);
8178
8179 tg->cfs_rq[cpu] = cfs_rq;
8180 tg->se[cpu] = se;
8181
8182 /* se could be NULL for root_task_group */
8183 if (!se)
8184 return;
8185
8186 if (!parent)
8187 se->cfs_rq = &rq->cfs;
8188 else
8189 se->cfs_rq = parent->my_q;
8190
8191 se->my_q = cfs_rq;
8192 update_load_set(&se->load, 0);
8193 se->parent = parent;
8194}
8195#endif 6711#endif
8196 6712
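The removed init_tg_cfs_entry() wires a task group's per-CPU pieces into the hierarchy: the group's sched_entity sits either on the root runqueue's cfs_rq (no parent) or on the parent entity's my_q, and its own my_q points back at the group's cfs_rq. The fair-class paths later walk exactly these parent links; the macro below, reproduced from the group-scheduling code of this era as a reminder of how the links are consumed, is not part of this hunk.

	/* How the parent links set up above are walked by the fair class:
	 * enqueue/update paths iterate from a task's entity up to the root. */
	#ifdef CONFIG_FAIR_GROUP_SCHED
	#define for_each_sched_entity(se) \
			for (; se; se = se->parent)
	#else
	#define for_each_sched_entity(se) \
			for (; se; se = NULL)
	#endif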
8197#ifdef CONFIG_RT_GROUP_SCHED 6713DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8198static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8199 struct sched_rt_entity *rt_se, int cpu,
8200 struct sched_rt_entity *parent)
8201{
8202 struct rq *rq = cpu_rq(cpu);
8203
8204 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8205 rt_rq->rt_nr_boosted = 0;
8206 rt_rq->rq = rq;
8207 rt_rq->tg = tg;
8208
8209 tg->rt_rq[cpu] = rt_rq;
8210 tg->rt_se[cpu] = rt_se;
8211
8212 if (!rt_se)
8213 return;
8214
8215 if (!parent)
8216 rt_se->rt_rq = &rq->rt;
8217 else
8218 rt_se->rt_rq = parent->my_q;
8219
8220 rt_se->my_q = rt_rq;
8221 rt_se->parent = parent;
8222 INIT_LIST_HEAD(&rt_se->run_list);
8223}
8224#endif
8225 6714
8226void __init sched_init(void) 6715void __init sched_init(void)
8227{ 6716{
@@ -8294,7 +6783,7 @@ void __init sched_init(void)
8294 init_cfs_rq(&rq->cfs); 6783 init_cfs_rq(&rq->cfs);
8295 init_rt_rq(&rq->rt, rq); 6784 init_rt_rq(&rq->rt, rq);
8296#ifdef CONFIG_FAIR_GROUP_SCHED 6785#ifdef CONFIG_FAIR_GROUP_SCHED
8297 root_task_group.shares = root_task_group_load; 6786 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8298 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6787 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8299 /* 6788 /*
8300 * How much cpu bandwidth does root_task_group get? 6789 * How much cpu bandwidth does root_task_group get?
@@ -8357,10 +6846,6 @@ void __init sched_init(void)
8357 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6846 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8358#endif 6847#endif
8359 6848
8360#ifdef CONFIG_SMP
8361 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8362#endif
8363
8364#ifdef CONFIG_RT_MUTEXES 6849#ifdef CONFIG_RT_MUTEXES
8365 plist_head_init(&init_task.pi_waiters); 6850 plist_head_init(&init_task.pi_waiters);
8366#endif 6851#endif
@@ -8388,17 +6873,11 @@ void __init sched_init(void)
8388 6873
8389#ifdef CONFIG_SMP 6874#ifdef CONFIG_SMP
8390 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6875 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8391#ifdef CONFIG_NO_HZ
8392 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8393 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8394 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8395 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8396 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8397#endif
8398 /* May be allocated at isolcpus cmdline parse time */ 6876 /* May be allocated at isolcpus cmdline parse time */
8399 if (cpu_isolated_map == NULL) 6877 if (cpu_isolated_map == NULL)
8400 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6878 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8401#endif /* SMP */ 6879#endif
6880 init_sched_fair_class();
8402 6881
8403 scheduler_running = 1; 6882 scheduler_running = 1;
8404} 6883}
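sched_init() now ends with a call to the new init_sched_fair_class() hook, and the SCHED_SOFTIRQ registration plus the NO_HZ bookkeeping deleted a few lines up are exactly the kind of fair-class setup that hook exists to absorb. The body below is a sketch assembled from those removed lines, assuming they simply move behind the new entry point; the real definition lives outside this diff.

	/* Sketch only: plausible contents of the new hook, built from the
	 * initialisation removed from sched_init() above. */
	void __init init_sched_fair_class(void)
	{
	#ifdef CONFIG_SMP
		open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

	#ifdef CONFIG_NO_HZ
		zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
		alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
		atomic_set(&nohz.load_balancer, nr_cpu_ids);
		atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
		atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
	#endif
	#endif
	}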
@@ -8550,169 +7029,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8550 7029
8551#endif 7030#endif
8552 7031
8553#ifdef CONFIG_FAIR_GROUP_SCHED
8554static void free_fair_sched_group(struct task_group *tg)
8555{
8556 int i;
8557
8558 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8559
8560 for_each_possible_cpu(i) {
8561 if (tg->cfs_rq)
8562 kfree(tg->cfs_rq[i]);
8563 if (tg->se)
8564 kfree(tg->se[i]);
8565 }
8566
8567 kfree(tg->cfs_rq);
8568 kfree(tg->se);
8569}
8570
8571static
8572int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8573{
8574 struct cfs_rq *cfs_rq;
8575 struct sched_entity *se;
8576 int i;
8577
8578 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8579 if (!tg->cfs_rq)
8580 goto err;
8581 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8582 if (!tg->se)
8583 goto err;
8584
8585 tg->shares = NICE_0_LOAD;
8586
8587 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8588
8589 for_each_possible_cpu(i) {
8590 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8591 GFP_KERNEL, cpu_to_node(i));
8592 if (!cfs_rq)
8593 goto err;
8594
8595 se = kzalloc_node(sizeof(struct sched_entity),
8596 GFP_KERNEL, cpu_to_node(i));
8597 if (!se)
8598 goto err_free_rq;
8599
8600 init_cfs_rq(cfs_rq);
8601 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8602 }
8603
8604 return 1;
8605
8606err_free_rq:
8607 kfree(cfs_rq);
8608err:
8609 return 0;
8610}
8611
8612static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8613{
8614 struct rq *rq = cpu_rq(cpu);
8615 unsigned long flags;
8616
8617 /*
8618 * Only empty task groups can be destroyed; so we can speculatively
8619 * check on_list without danger of it being re-added.
8620 */
8621 if (!tg->cfs_rq[cpu]->on_list)
8622 return;
8623
8624 raw_spin_lock_irqsave(&rq->lock, flags);
8625 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8626 raw_spin_unlock_irqrestore(&rq->lock, flags);
8627}
8628#else /* !CONFIG_FAIR_GROUP_SCHED */
8629static inline void free_fair_sched_group(struct task_group *tg)
8630{
8631}
8632
8633static inline
8634int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8635{
8636 return 1;
8637}
8638
8639static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8640{
8641}
8642#endif /* CONFIG_FAIR_GROUP_SCHED */
8643
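A detail in the removed alloc_fair_sched_group() that is easy to misread: kzalloc(sizeof(cfs_rq) * nr_cpu_ids, ...) takes sizeof of the local pointer variable, so it allocates the per-CPU array of pointers; the cfs_rq and sched_entity structures themselves are then allocated one per CPU with kzalloc_node() so each lands on its CPU's memory node. A tiny userspace analogue of the same two-step layout (stub struct and names are hypothetical):

	#include <stdio.h>
	#include <stdlib.h>

	struct cfs_rq_stub { int dummy; };	/* stand-in for struct cfs_rq */

	int main(void)
	{
		int nr_cpu_ids = 4;
		struct cfs_rq_stub **per_cpu;	/* mirrors tg->cfs_rq */

		/* step 1: array of nr_cpu_ids pointers (sizeof a pointer) */
		per_cpu = calloc(nr_cpu_ids, sizeof(*per_cpu));
		if (!per_cpu)
			return 1;

		/* step 2: one structure per CPU (sizeof the struct) */
		for (int i = 0; i < nr_cpu_ids; i++)
			per_cpu[i] = calloc(1, sizeof(**per_cpu));

		printf("pointer array: %zu bytes, one struct: %zu bytes\n",
		       nr_cpu_ids * sizeof(*per_cpu), sizeof(**per_cpu));
		return 0;	/* toy example: memory reclaimed at exit */
	}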
8644#ifdef CONFIG_RT_GROUP_SCHED 7032#ifdef CONFIG_RT_GROUP_SCHED
8645static void free_rt_sched_group(struct task_group *tg)
8646{
8647 int i;
8648
8649 if (tg->rt_se)
8650 destroy_rt_bandwidth(&tg->rt_bandwidth);
8651
8652 for_each_possible_cpu(i) {
8653 if (tg->rt_rq)
8654 kfree(tg->rt_rq[i]);
8655 if (tg->rt_se)
8656 kfree(tg->rt_se[i]);
8657 }
8658
8659 kfree(tg->rt_rq);
8660 kfree(tg->rt_se);
8661}
8662
8663static
8664int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8665{
8666 struct rt_rq *rt_rq;
8667 struct sched_rt_entity *rt_se;
8668 int i;
8669
8670 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8671 if (!tg->rt_rq)
8672 goto err;
8673 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8674 if (!tg->rt_se)
8675 goto err;
8676
8677 init_rt_bandwidth(&tg->rt_bandwidth,
8678 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8679
8680 for_each_possible_cpu(i) {
8681 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8682 GFP_KERNEL, cpu_to_node(i));
8683 if (!rt_rq)
8684 goto err;
8685
8686 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8687 GFP_KERNEL, cpu_to_node(i));
8688 if (!rt_se)
8689 goto err_free_rq;
8690
8691 init_rt_rq(rt_rq, cpu_rq(i));
8692 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8693 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8694 }
8695
8696 return 1;
8697
8698err_free_rq:
8699 kfree(rt_rq);
8700err:
8701 return 0;
8702}
8703#else /* !CONFIG_RT_GROUP_SCHED */ 7033#else /* !CONFIG_RT_GROUP_SCHED */
8704static inline void free_rt_sched_group(struct task_group *tg)
8705{
8706}
8707
8708static inline
8709int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8710{
8711 return 1;
8712}
8713#endif /* CONFIG_RT_GROUP_SCHED */ 7034#endif /* CONFIG_RT_GROUP_SCHED */
8714 7035
8715#ifdef CONFIG_CGROUP_SCHED 7036#ifdef CONFIG_CGROUP_SCHED
7037/* task_group_lock serializes the addition/removal of task groups */
7038static DEFINE_SPINLOCK(task_group_lock);
7039
8716static void free_sched_group(struct task_group *tg) 7040static void free_sched_group(struct task_group *tg)
8717{ 7041{
8718 free_fair_sched_group(tg); 7042 free_fair_sched_group(tg);
@@ -8818,47 +7142,6 @@ void sched_move_task(struct task_struct *tsk)
8818#endif /* CONFIG_CGROUP_SCHED */ 7142#endif /* CONFIG_CGROUP_SCHED */
8819 7143
8820#ifdef CONFIG_FAIR_GROUP_SCHED 7144#ifdef CONFIG_FAIR_GROUP_SCHED
8821static DEFINE_MUTEX(shares_mutex);
8822
8823int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8824{
8825 int i;
8826 unsigned long flags;
8827
8828 /*
8829 * We can't change the weight of the root cgroup.
8830 */
8831 if (!tg->se[0])
8832 return -EINVAL;
8833
8834 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8835
8836 mutex_lock(&shares_mutex);
8837 if (tg->shares == shares)
8838 goto done;
8839
8840 tg->shares = shares;
8841 for_each_possible_cpu(i) {
8842 struct rq *rq = cpu_rq(i);
8843 struct sched_entity *se;
8844
8845 se = tg->se[i];
8846 /* Propagate contribution to hierarchy */
8847 raw_spin_lock_irqsave(&rq->lock, flags);
8848 for_each_sched_entity(se)
8849 update_cfs_shares(group_cfs_rq(se));
8850 raw_spin_unlock_irqrestore(&rq->lock, flags);
8851 }
8852
8853done:
8854 mutex_unlock(&shares_mutex);
8855 return 0;
8856}
8857
8858unsigned long sched_group_shares(struct task_group *tg)
8859{
8860 return tg->shares;
8861}
8862#endif 7145#endif
8863 7146
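sched_group_set_shares(), removed above, is the backend of the cgroup cpu.shares knob: it rejects the root group (whose weight is fixed), clamps the value to [MIN_SHARES, MAX_SHARES] after scale_load(), and re-propagates the new weight through each CPU's entity hierarchy under the runqueue lock. In this era's cpu cgroup subsystem the write handler feeding it looks roughly like the sketch below; the handler itself is not part of this hunk.

	/* Sketch of the cpu.shares write path ending in sched_group_set_shares(). */
	static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
					u64 shareval)
	{
		return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
	}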
8864#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7147#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8883,7 +7166,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8883 struct task_struct *g, *p; 7166 struct task_struct *g, *p;
8884 7167
8885 do_each_thread(g, p) { 7168 do_each_thread(g, p) {
8886 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7169 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8887 return 1; 7170 return 1;
8888 } while_each_thread(g, p); 7171 } while_each_thread(g, p);
8889 7172
@@ -9235,7 +7518,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9235static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7518static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9236{ 7519{
9237 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7520 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9238 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7521 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9239 7522
9240 if (tg == &root_task_group) 7523 if (tg == &root_task_group)
9241 return -EINVAL; 7524 return -EINVAL;
@@ -9264,7 +7547,6 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9264 runtime_enabled = quota != RUNTIME_INF; 7547 runtime_enabled = quota != RUNTIME_INF;
9265 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7548 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9266 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7549 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9267
9268 raw_spin_lock_irq(&cfs_b->lock); 7550 raw_spin_lock_irq(&cfs_b->lock);
9269 cfs_b->period = ns_to_ktime(period); 7551 cfs_b->period = ns_to_ktime(period);
9270 cfs_b->quota = quota; 7552 cfs_b->quota = quota;
@@ -9280,13 +7562,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9280 7562
9281 for_each_possible_cpu(i) { 7563 for_each_possible_cpu(i) {
9282 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7564 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9283 struct rq *rq = rq_of(cfs_rq); 7565 struct rq *rq = cfs_rq->rq;
9284 7566
9285 raw_spin_lock_irq(&rq->lock); 7567 raw_spin_lock_irq(&rq->lock);
9286 cfs_rq->runtime_enabled = runtime_enabled; 7568 cfs_rq->runtime_enabled = runtime_enabled;
9287 cfs_rq->runtime_remaining = 0; 7569 cfs_rq->runtime_remaining = 0;
9288 7570
9289 if (cfs_rq_throttled(cfs_rq)) 7571 if (cfs_rq->throttled)
9290 unthrottle_cfs_rq(cfs_rq); 7572 unthrottle_cfs_rq(cfs_rq);
9291 raw_spin_unlock_irq(&rq->lock); 7573 raw_spin_unlock_irq(&rq->lock);
9292 } 7574 }
@@ -9300,7 +7582,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9300{ 7582{
9301 u64 quota, period; 7583 u64 quota, period;
9302 7584
9303 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7585 period = ktime_to_ns(tg->cfs_bandwidth.period);
9304 if (cfs_quota_us < 0) 7586 if (cfs_quota_us < 0)
9305 quota = RUNTIME_INF; 7587 quota = RUNTIME_INF;
9306 else 7588 else
@@ -9313,10 +7595,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9313{ 7595{
9314 u64 quota_us; 7596 u64 quota_us;
9315 7597
9316 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7598 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9317 return -1; 7599 return -1;
9318 7600
9319 quota_us = tg_cfs_bandwidth(tg)->quota; 7601 quota_us = tg->cfs_bandwidth.quota;
9320 do_div(quota_us, NSEC_PER_USEC); 7602 do_div(quota_us, NSEC_PER_USEC);
9321 7603
9322 return quota_us; 7604 return quota_us;
@@ -9327,7 +7609,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9327 u64 quota, period; 7609 u64 quota, period;
9328 7610
9329 period = (u64)cfs_period_us * NSEC_PER_USEC; 7611 period = (u64)cfs_period_us * NSEC_PER_USEC;
9330 quota = tg_cfs_bandwidth(tg)->quota; 7612 quota = tg->cfs_bandwidth.quota;
9331 7613
9332 if (period <= 0) 7614 if (period <= 0)
9333 return -EINVAL; 7615 return -EINVAL;
@@ -9339,7 +7621,7 @@ long tg_get_cfs_period(struct task_group *tg)
9339{ 7621{
9340 u64 cfs_period_us; 7622 u64 cfs_period_us;
9341 7623
9342 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7624 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9343 do_div(cfs_period_us, NSEC_PER_USEC); 7625 do_div(cfs_period_us, NSEC_PER_USEC);
9344 7626
9345 return cfs_period_us; 7627 return cfs_period_us;
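The tg_set_cfs_quota()/tg_set_cfs_period() pairs above convert the cgroup's microsecond interface into the nanosecond values stored in cfs_bandwidth, with a negative cfs_quota_us mapping to RUNTIME_INF (no limit). A quick self-contained check of the arithmetic (userspace C, example values only):

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_USEC 1000ULL

	int main(void)
	{
		/* cpu.cfs_period_us = 100000, cpu.cfs_quota_us = 50000:
		 * the group may run 50 ms in every 100 ms window, i.e. half a CPU. */
		uint64_t period = 100000 * NSEC_PER_USEC;
		uint64_t quota  =  50000 * NSEC_PER_USEC;

		printf("period = %llu ns, quota = %llu ns -> %.0f%% of one CPU\n",
		       (unsigned long long)period, (unsigned long long)quota,
		       100.0 * quota / period);
		return 0;
	}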
@@ -9399,13 +7681,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9399static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7681static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9400{ 7682{
9401 struct cfs_schedulable_data *d = data; 7683 struct cfs_schedulable_data *d = data;
9402 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7684 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9403 s64 quota = 0, parent_quota = -1; 7685 s64 quota = 0, parent_quota = -1;
9404 7686
9405 if (!tg->parent) { 7687 if (!tg->parent) {
9406 quota = RUNTIME_INF; 7688 quota = RUNTIME_INF;
9407 } else { 7689 } else {
9408 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7690 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9409 7691
9410 quota = normalize_cfs_quota(tg, d); 7692 quota = normalize_cfs_quota(tg, d);
9411 parent_quota = parent_b->hierarchal_quota; 7693 parent_quota = parent_b->hierarchal_quota;
@@ -9449,7 +7731,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9449 struct cgroup_map_cb *cb) 7731 struct cgroup_map_cb *cb)
9450{ 7732{
9451 struct task_group *tg = cgroup_tg(cgrp); 7733 struct task_group *tg = cgroup_tg(cgrp);
9452 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7734 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9453 7735
9454 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7736 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9455 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7737 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
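cpu_stats_show() exports the bandwidth counters as a key/value map, so the cgroup's cpu.stat file reads back nr_periods, nr_throttled and the related throttling statistics. A minimal reader, assuming a conventional cgroup mount point (the path and group name are assumptions of the example, not something this diff defines):

	#include <stdio.h>

	int main(void)
	{
		char key[64];
		long long val;
		/* hypothetical mount point and group; adjust to the local setup */
		FILE *f = fopen("/sys/fs/cgroup/cpu/mygroup/cpu.stat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %lld", key, &val) == 2)
			printf("%s = %lld\n", key, val);
		fclose(f);
		return 0;
	}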
@@ -9748,7 +8030,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9748 * 8030 *
9749 * called with rq->lock held. 8031 * called with rq->lock held.
9750 */ 8032 */
9751static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8033void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9752{ 8034{
9753 struct cpuacct *ca; 8035 struct cpuacct *ca;
9754 int cpu; 8036 int cpu;
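cpuacct_charge() here, and cpuacct_update_stats() just below, lose their static qualifier because their callers now live in other translation units after the split; they therefore need declarations visible to those files, presumably in the new shared scheduler header. A sketch of what such declarations would look like (the actual header contents are not shown in this diff):

	/* Sketch: declarations the other sched*.c files would need; the inline
	 * stubs keep !CONFIG_CGROUP_CPUACCT builds working. */
	#ifdef CONFIG_CGROUP_CPUACCT
	extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
	extern void cpuacct_update_stats(struct task_struct *tsk,
					 enum cpuacct_stat_index idx, cputime_t val);
	#else
	static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { }
	static inline void cpuacct_update_stats(struct task_struct *tsk,
					 enum cpuacct_stat_index idx, cputime_t val) { }
	#endif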
@@ -9790,7 +8072,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9790/* 8072/*
9791 * Charge the system/user time to the task's accounting group. 8073 * Charge the system/user time to the task's accounting group.
9792 */ 8074 */
9793static void cpuacct_update_stats(struct task_struct *tsk, 8075void cpuacct_update_stats(struct task_struct *tsk,
9794 enum cpuacct_stat_index idx, cputime_t val) 8076 enum cpuacct_stat_index idx, cputime_t val)
9795{ 8077{
9796 struct cpuacct *ca; 8078 struct cpuacct *ca;