Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	1384
1 file changed, 1014 insertions, 370 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11ca6df3..524285e46fa7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
25 */ 27 */
26 28
27#include <linux/mm.h> 29#include <linux/mm.h>
@@ -63,6 +65,7 @@
63#include <linux/reciprocal_div.h> 65#include <linux/reciprocal_div.h>
64#include <linux/unistd.h> 66#include <linux/unistd.h>
65#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
66 69
67#include <asm/tlb.h> 70#include <asm/tlb.h>
68#include <asm/irq_regs.h> 71#include <asm/irq_regs.h>
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
96#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
97 100
98/* 101/*
99 * Some helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
100 */ 103 */
101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
102#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
103 105
104#define NICE_0_LOAD SCHED_LOAD_SCALE 106#define NICE_0_LOAD SCHED_LOAD_SCALE
105#define NICE_0_SHIFT SCHED_LOAD_SHIFT 107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
@@ -159,6 +161,8 @@ struct rt_prio_array {
159 161
160struct cfs_rq; 162struct cfs_rq;
161 163
164static LIST_HEAD(task_groups);
165
162/* task group related information */ 166/* task group related information */
163struct task_group { 167struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -168,10 +172,50 @@ struct task_group {
168 struct sched_entity **se; 172 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 * The shares assigned to a task group govern how much cpu bandwidth
182 * is allocated to the group: the more shares a group has, the more
183 * cpu bandwidth is allocated to it.
184 *
185 *
186 * For example, let's say there are three task groups, A, B and C,
187 * which have been assigned shares 1000, 2000 and 3000 respectively.
188 * The cpu bandwidth allocated by the scheduler to task groups A, B
189 * and C should then be:
190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 *
195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
197 * group's shares. For example, let's say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 *
202 * Note: it's not necessary that each of a task group's schedulable
203 * entities has the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be:
206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
208 * tg_A->se[1]->load.weight = 1/3 * 2000 = 667
209 *
210 * rebalance_shares() is responsible for distributing the shares of a
211 * task group like this among the group's schedulable entities across
212 * cpus.
213 *
214 */
171 unsigned long shares; 215 unsigned long shares;
172 /* spinlock to serialize modification to shares */ 216
173 spinlock_t lock;
174 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
175}; 219};
176 220
177/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
182static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
235/* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares.
237 */
238static DEFINE_MUTEX(task_group_mutex);
239
240/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex);
242
243#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task;
246static int load_balance_monitor(void *unused);
247#endif
248
249static void set_se_shares(struct sched_entity *se, unsigned long shares);
250
185/* Default task group. 251/* Default task group.
186 * Every task in system belong to this group at bootup. 252 * Every task in system belong to this group at bootup.
187 */ 253 */
188struct task_group init_task_group = { 254struct task_group init_task_group = {
189 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
191}; 260};
192 261
193#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD 263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
195#else 264#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD 265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
197#endif 266#endif
198 267
199static int init_task_group_load = INIT_TASK_GRP_LOAD; 268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
200 271
201/* return group to which a task belongs */ 272/* return group to which a task belongs */
202static inline struct task_group *task_group(struct task_struct *p) 273static inline struct task_group *task_group(struct task_struct *p)
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p)
215} 286}
216 287
217/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
218static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
219{ 290{
220 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
221 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
296}
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306}
307
308static inline void lock_doms_cur(void)
309{
310 mutex_lock(&doms_cur_mutex);
311}
312
313static inline void unlock_doms_cur(void)
314{
315 mutex_unlock(&doms_cur_mutex);
222} 316}
223 317
224#else 318#else
225 319
226static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { }
227 325
228#endif /* CONFIG_FAIR_GROUP_SCHED */ 326#endif /* CONFIG_FAIR_GROUP_SCHED */
229 327
@@ -264,10 +362,56 @@ struct cfs_rq {
264/* Real-Time classes' related field in a runqueue: */ 362/* Real-Time classes' related field in a runqueue: */
265struct rt_rq { 363struct rt_rq {
266 struct rt_prio_array active; 364 struct rt_prio_array active;
267 int rt_load_balance_idx; 365 unsigned long rt_nr_running;
268 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
369#ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory;
371 int overloaded;
372#endif
373 int rt_throttled;
374 u64 rt_time;
375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
269}; 382};
270 383
384#ifdef CONFIG_SMP
385
386/*
387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object.
392 *
393 */
394struct root_domain {
395 atomic_t refcount;
396 cpumask_t span;
397 cpumask_t online;
398
399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task.
402 */
403 cpumask_t rto_mask;
404 atomic_t rto_count;
405};
406
407/*
408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today).
410 */
411static struct root_domain def_root_domain;
412
413#endif
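
The rto_mask/rto_count pair above gives remote CPUs a cheap test for "does any runqueue in this root domain have pullable RT tasks?" before they go poking at other runqueues. As an illustration only (the real helper lives in the RT class code, sched_rt.c, and may differ in detail), marking a runqueue RT-overloaded would look roughly like:

	static void rt_set_overload(struct rq *rq)
	{
		/* advertise this CPU as a source of pullable RT tasks */
		cpu_set(rq->cpu, rq->rd->rto_mask);
		/*
		 * Publish the mask bit before bumping the count, so a CPU
		 * that sees rto_count != 0 also sees the bit in rto_mask.
		 */
		smp_wmb();
		atomic_inc(&rq->rd->rto_count);
	}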
414
271/* 415/*
272 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
273 * 417 *
@@ -296,11 +440,15 @@ struct rq {
296 u64 nr_switches; 440 u64 nr_switches;
297 441
298 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445 int rt_throttled;
446
299#ifdef CONFIG_FAIR_GROUP_SCHED 447#ifdef CONFIG_FAIR_GROUP_SCHED
300 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list;
302#endif 451#endif
303 struct rt_rq rt;
304 452
305 /* 453 /*
306 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
@@ -317,7 +465,7 @@ struct rq {
317 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
318 s64 clock_max_delta; 466 s64 clock_max_delta;
319 467
320 unsigned int clock_warps, clock_overflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
321 u64 idle_clock; 469 u64 idle_clock;
322 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
323 u64 tick_timestamp; 471 u64 tick_timestamp;
@@ -325,6 +473,7 @@ struct rq {
325 atomic_t nr_iowait; 473 atomic_t nr_iowait;
326 474
327#ifdef CONFIG_SMP 475#ifdef CONFIG_SMP
476 struct root_domain *rd;
328 struct sched_domain *sd; 477 struct sched_domain *sd;
329 478
330 /* For active balancing */ 479 /* For active balancing */
@@ -337,6 +486,12 @@ struct rq {
337 struct list_head migration_queue; 486 struct list_head migration_queue;
338#endif 487#endif
339 488
489#ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer;
493#endif
494
340#ifdef CONFIG_SCHEDSTATS 495#ifdef CONFIG_SCHEDSTATS
341 /* latency stats */ 496 /* latency stats */
342 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
@@ -363,7 +518,6 @@ struct rq {
363}; 518};
364 519
365static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
366static DEFINE_MUTEX(sched_hotcpu_mutex);
367 521
368static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
369{ 523{
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
441#define task_rq(p) cpu_rq(task_cpu(p)) 595#define task_rq(p) cpu_rq(task_cpu(p))
442#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
443 597
598unsigned long rt_needs_cpu(int cpu)
599{
600 struct rq *rq = cpu_rq(cpu);
601 u64 delta;
602
603 if (!rq->rt_throttled)
604 return 0;
605
606 if (rq->clock > rq->rt_period_expire)
607 return 1;
608
609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ);
611
612 return (unsigned long)delta;
613}
614
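A quick worked example of the helper above, assuming HZ=1000 so NSEC_PER_SEC/HZ is 1,000,000 ns per jiffy: if the runqueue is RT-throttled and rt_period_expire lies 4,000,000 ns ahead of rq->clock, rt_needs_cpu() returns 4 (jiffies until the period boundary); if the boundary has already passed it returns 1, and if the runqueue is not throttled it returns 0.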
444/* 615/*
445 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
446 */ 617 */
@@ -459,6 +630,8 @@ enum {
459 SCHED_FEAT_START_DEBIT = 4, 630 SCHED_FEAT_START_DEBIT = 4,
460 SCHED_FEAT_TREE_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
461 SCHED_FEAT_APPROX_AVG = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64,
462}; 635};
463 636
464const_debug unsigned int sysctl_sched_features = 637const_debug unsigned int sysctl_sched_features =
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features =
466 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
467 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0; 642 SCHED_FEAT_APPROX_AVG * 0 |
643 SCHED_FEAT_HRTICK * 1 |
644 SCHED_FEAT_DOUBLE_TICK * 0;
470 645
471#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
472 647
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features =
477const_debug unsigned int sysctl_sched_nr_migrate = 32; 652const_debug unsigned int sysctl_sched_nr_migrate = 32;
478 653
479/* 654/*
655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1000 (1s)
657 */
658const_debug unsigned int sysctl_sched_rt_period = 1000;
659
660#define SCHED_RT_FRAC_SHIFT 16
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662
663/*
664 * ratio of time -rt tasks may consume.
665 * default: 95%
666 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259;
668
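For reference, the default encodes 95% as a fixed-point fraction of SCHED_RT_FRAC (1 << 16 = 65536):

	65536 * 95 / 100 = 62259	(truncated from 62259.2)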
669/*
480 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
481 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
482 */ 672 */
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
668 struct rq *rq = cpu_rq(smp_processor_id()); 858 struct rq *rq = cpu_rq(smp_processor_id());
669 u64 now = sched_clock(); 859 u64 now = sched_clock();
670 860
671 touch_softlockup_watchdog();
672 rq->idle_clock += delta_ns; 861 rq->idle_clock += delta_ns;
673 /* 862 /*
674 * Override the previous timestamp and ignore all 863 * Override the previous timestamp and ignore all
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
680 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
681 rq->clock += delta_ns; 870 rq->clock += delta_ns;
682 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog();
683} 873}
684EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
685 875
876static void __resched_task(struct task_struct *p, int tif_bit);
877
878static inline void resched_task(struct task_struct *p)
879{
880 __resched_task(p, TIF_NEED_RESCHED);
881}
882
883#ifdef CONFIG_SCHED_HRTICK
884/*
885 * Use HR-timers to deliver accurate preemption points.
886 *
887 * It's all a bit involved since we cannot program an hrtimer while holding
888 * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
889 * reschedule event.
890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock.
893 */
894static inline void resched_hrt(struct task_struct *p)
895{
896 __resched_task(p, TIF_HRTICK_RESCHED);
897}
898
899static inline void resched_rq(struct rq *rq)
900{
901 unsigned long flags;
902
903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags);
906}
907
908enum {
909 HRTICK_SET, /* re-program hrtick_timer */
910 HRTICK_RESET, /* not a new slice */
911};
912
913/*
914 * Use hrtick when:
915 * - enabled by features
916 * - hrtimer is actually high res
917 */
918static inline int hrtick_enabled(struct rq *rq)
919{
920 if (!sched_feat(HRTICK))
921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923}
924
925/*
926 * Called to set the hrtick timer state.
927 *
928 * called with rq->lock held and irqs disabled
929 */
930static void hrtick_start(struct rq *rq, u64 delay, int reset)
931{
932 assert_spin_locked(&rq->lock);
933
934 /*
935 * preempt at: now + delay
936 */
937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /*
940 * indicate we need to program the timer
941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945
946 /*
947 * New slices are called from the schedule path and don't need a
948 * forced reschedule.
949 */
950 if (reset)
951 resched_hrt(rq->curr);
952}
953
954static void hrtick_clear(struct rq *rq)
955{
956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer);
958}
959
960/*
961 * Update the timer from the possible pending state.
962 */
963static void hrtick_set(struct rq *rq)
964{
965 ktime_t time;
966 int set, reset;
967 unsigned long flags;
968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970
971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags);
977
978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq);
982 } else
983 hrtick_clear(rq);
984}
985
986/*
987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled.
989 */
990static enum hrtimer_restart hrtick(struct hrtimer *timer)
991{
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995
996 spin_lock(&rq->lock);
997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock);
1000
1001 return HRTIMER_NORESTART;
1002}
1003
1004static inline void init_rq_hrtick(struct rq *rq)
1005{
1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010}
1011
1012void hrtick_resched(void)
1013{
1014 struct rq *rq;
1015 unsigned long flags;
1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return;
1019
1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq);
1023 local_irq_restore(flags);
1024}
1025#else
1026static inline void hrtick_clear(struct rq *rq)
1027{
1028}
1029
1030static inline void hrtick_set(struct rq *rq)
1031{
1032}
1033
1034static inline void init_rq_hrtick(struct rq *rq)
1035{
1036}
1037
1038void hrtick_resched(void)
1039{
1040}
1041#endif
1042
686/* 1043/*
687 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
688 * 1045 *
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
696#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
697#endif 1054#endif
698 1055
699static void resched_task(struct task_struct *p) 1056static void __resched_task(struct task_struct *p, int tif_bit)
700{ 1057{
701 int cpu; 1058 int cpu;
702 1059
703 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
704 1061
705 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
706 return; 1063 return;
707 1064
708 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1065 set_tsk_thread_flag(p, tif_bit);
709 1066
710 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
711 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu)
728 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
729} 1086}
730#else 1087#else
731static inline void resched_task(struct task_struct *p) 1088static void __resched_task(struct task_struct *p, int tif_bit)
732{ 1089{
733 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
734 set_tsk_need_resched(p); 1091 set_tsk_thread_flag(p, tif_bit);
735} 1092}
736#endif 1093#endif
737 1094
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
871static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1228static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
872#endif 1229#endif
873 1230
1231static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232{
1233 update_load_add(&rq->load, load);
1234}
1235
1236static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237{
1238 update_load_sub(&rq->load, load);
1239}
1240
1241#ifdef CONFIG_SMP
1242static unsigned long source_load(int cpu, int type);
1243static unsigned long target_load(int cpu, int type);
1244static unsigned long cpu_avg_load_per_task(int cpu);
1245static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246#endif /* CONFIG_SMP */
1247
874#include "sched_stats.h" 1248#include "sched_stats.h"
875#include "sched_idletask.c" 1249#include "sched_idletask.c"
876#include "sched_fair.c" 1250#include "sched_fair.c"
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
881 1255
882#define sched_class_highest (&rt_sched_class) 1256#define sched_class_highest (&rt_sched_class)
883 1257
884/*
885 * Update delta_exec, delta_fair fields for rq.
886 *
887 * delta_fair clock advances at a rate inversely proportional to
888 * total load (rq->load.weight) on the runqueue, while
889 * delta_exec advances at the same rate as wall-clock (provided
890 * cpu is not idle).
891 *
892 * delta_exec / delta_fair is a measure of the (smoothened) load on this
893 * runqueue over any given interval. This (smoothened) load is used
894 * during load balance.
895 *
896 * This function is called /before/ updating rq->load
897 * and when switching tasks.
898 */
899static inline void inc_load(struct rq *rq, const struct task_struct *p)
900{
901 update_load_add(&rq->load, p->se.load.weight);
902}
903
904static inline void dec_load(struct rq *rq, const struct task_struct *p)
905{
906 update_load_sub(&rq->load, p->se.load.weight);
907}
908
909static void inc_nr_running(struct task_struct *p, struct rq *rq) 1258static void inc_nr_running(struct task_struct *p, struct rq *rq)
910{ 1259{
911 rq->nr_running++; 1260 rq->nr_running++;
912 inc_load(rq, p);
913} 1261}
914 1262
915static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263static void dec_nr_running(struct task_struct *p, struct rq *rq)
916{ 1264{
917 rq->nr_running--; 1265 rq->nr_running--;
918 dec_load(rq, p);
919} 1266}
920 1267
921static void set_load_weight(struct task_struct *p) 1268static void set_load_weight(struct task_struct *p)
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu)
1039 1386
1040static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1041{ 1388{
1042 set_task_cfs_rq(p, cpu); 1389 set_task_rq(p, cpu);
1043#ifdef CONFIG_SMP 1390#ifdef CONFIG_SMP
1044 /* 1391 /*
1045 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1051#endif 1398#endif
1052} 1399}
1053 1400
1401static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class,
1403 int oldprio, int running)
1404{
1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running);
1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1411}
1412
1054#ifdef CONFIG_SMP 1413#ifdef CONFIG_SMP
1055 1414
1056/* 1415/*
1057 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1058 */ 1417 */
1059static inline int 1418static int
1060task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1061{ 1420{
1062 s64 delta; 1421 s64 delta;
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
1281/* 1640/*
1282 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1283 */ 1642 */
1284static inline unsigned long cpu_avg_load_per_task(int cpu) 1643static unsigned long cpu_avg_load_per_task(int cpu)
1285{ 1644{
1286 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1287 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
1438 1797
1439#endif /* CONFIG_SMP */ 1798#endif /* CONFIG_SMP */
1440 1799
1441/*
1442 * wake_idle() will wake a task on an idle cpu if task->cpu is
1443 * not idle and an idle cpu is available. The span of cpus to
1444 * search starts with cpus closest then further out as needed,
1445 * so we always favor a closer, idle cpu.
1446 *
1447 * Returns the CPU we should wake onto.
1448 */
1449#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1450static int wake_idle(int cpu, struct task_struct *p)
1451{
1452 cpumask_t tmp;
1453 struct sched_domain *sd;
1454 int i;
1455
1456 /*
1457 * If it is idle, then it is the best cpu to run this task.
1458 *
1459 * This cpu is also the best, if it has more than one task already.
1460 * Siblings must be also busy(in most cases) as they didn't already
1461 * pickup the extra load from this cpu and hence we need not check
1462 * sibling runqueue info. This will avoid the checks and cache miss
1463 * penalities associated with that.
1464 */
1465 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1466 return cpu;
1467
1468 for_each_domain(cpu, sd) {
1469 if (sd->flags & SD_WAKE_IDLE) {
1470 cpus_and(tmp, sd->span, p->cpus_allowed);
1471 for_each_cpu_mask(i, tmp) {
1472 if (idle_cpu(i)) {
1473 if (i != task_cpu(p)) {
1474 schedstat_inc(p,
1475 se.nr_wakeups_idle);
1476 }
1477 return i;
1478 }
1479 }
1480 } else {
1481 break;
1482 }
1483 }
1484 return cpu;
1485}
1486#else
1487static inline int wake_idle(int cpu, struct task_struct *p)
1488{
1489 return cpu;
1490}
1491#endif
1492
1493/*** 1800/***
1494 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1495 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1510 unsigned long flags; 1817 unsigned long flags;
1511 long old_state; 1818 long old_state;
1512 struct rq *rq; 1819 struct rq *rq;
1513#ifdef CONFIG_SMP
1514 struct sched_domain *sd, *this_sd = NULL;
1515 unsigned long load, this_load;
1516 int new_cpu;
1517#endif
1518 1820
1519 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1520 old_state = p->state; 1822 old_state = p->state;
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1532 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1533 goto out_activate; 1835 goto out_activate;
1534 1836
1535 new_cpu = cpu; 1837 cpu = p->sched_class->select_task_rq(p, sync);
1536 1838 if (cpu != orig_cpu) {
1537 schedstat_inc(rq, ttwu_count); 1839 set_task_cpu(p, cpu);
1538 if (cpu == this_cpu) {
1539 schedstat_inc(rq, ttwu_local);
1540 goto out_set_cpu;
1541 }
1542
1543 for_each_domain(this_cpu, sd) {
1544 if (cpu_isset(cpu, sd->span)) {
1545 schedstat_inc(sd, ttwu_wake_remote);
1546 this_sd = sd;
1547 break;
1548 }
1549 }
1550
1551 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1552 goto out_set_cpu;
1553
1554 /*
1555 * Check for affine wakeup and passive balancing possibilities.
1556 */
1557 if (this_sd) {
1558 int idx = this_sd->wake_idx;
1559 unsigned int imbalance;
1560
1561 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1562
1563 load = source_load(cpu, idx);
1564 this_load = target_load(this_cpu, idx);
1565
1566 new_cpu = this_cpu; /* Wake to this CPU if we can */
1567
1568 if (this_sd->flags & SD_WAKE_AFFINE) {
1569 unsigned long tl = this_load;
1570 unsigned long tl_per_task;
1571
1572 /*
1573 * Attract cache-cold tasks on sync wakeups:
1574 */
1575 if (sync && !task_hot(p, rq->clock, this_sd))
1576 goto out_set_cpu;
1577
1578 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1579 tl_per_task = cpu_avg_load_per_task(this_cpu);
1580
1581 /*
1582 * If sync wakeup then subtract the (maximum possible)
1583 * effect of the currently running task from the load
1584 * of the current CPU:
1585 */
1586 if (sync)
1587 tl -= current->se.load.weight;
1588
1589 if ((tl <= load &&
1590 tl + target_load(cpu, idx) <= tl_per_task) ||
1591 100*(tl + p->se.load.weight) <= imbalance*load) {
1592 /*
1593 * This domain has SD_WAKE_AFFINE and
1594 * p is cache cold in this domain, and
1595 * there is no bad imbalance.
1596 */
1597 schedstat_inc(this_sd, ttwu_move_affine);
1598 schedstat_inc(p, se.nr_wakeups_affine);
1599 goto out_set_cpu;
1600 }
1601 }
1602
1603 /*
1604 * Start passive balancing when half the imbalance_pct
1605 * limit is reached.
1606 */
1607 if (this_sd->flags & SD_WAKE_BALANCE) {
1608 if (imbalance*this_load <= 100*load) {
1609 schedstat_inc(this_sd, ttwu_move_balance);
1610 schedstat_inc(p, se.nr_wakeups_passive);
1611 goto out_set_cpu;
1612 }
1613 }
1614 }
1615
1616 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1617out_set_cpu:
1618 new_cpu = wake_idle(new_cpu, p);
1619 if (new_cpu != cpu) {
1620 set_task_cpu(p, new_cpu);
1621 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1622 /* might preempt at this point */ 1841 /* might preempt at this point */
1623 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
@@ -1631,6 +1850,21 @@ out_set_cpu:
1631 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1632 } 1851 }
1633 1852
1853#ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local);
1857 else {
1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break;
1863 }
1864 }
1865 }
1866#endif
1867
1634out_activate: 1868out_activate:
1635#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1636 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
@@ -1649,6 +1883,10 @@ out_activate:
1649 1883
1650out_running: 1884out_running:
1651 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886#ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p);
1889#endif
1652out: 1890out:
1653 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1654 1892
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p)
1691 p->se.wait_max = 0; 1929 p->se.wait_max = 0;
1692#endif 1930#endif
1693 1931
1694 INIT_LIST_HEAD(&p->run_list); 1932 INIT_LIST_HEAD(&p->rt.run_list);
1695 p->se.on_rq = 0; 1933 p->se.on_rq = 0;
1696 1934
1697#ifdef CONFIG_PREEMPT_NOTIFIERS 1935#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1771 inc_nr_running(p, rq); 2009 inc_nr_running(p, rq);
1772 } 2010 }
1773 check_preempt_curr(rq, p); 2011 check_preempt_curr(rq, p);
2012#ifdef CONFIG_SMP
2013 if (p->sched_class->task_wake_up)
2014 p->sched_class->task_wake_up(rq, p);
2015#endif
1774 task_rq_unlock(rq, &flags); 2016 task_rq_unlock(rq, &flags);
1775} 2017}
1776 2018
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1891 prev_state = prev->state; 2133 prev_state = prev->state;
1892 finish_arch_switch(prev); 2134 finish_arch_switch(prev);
1893 finish_lock_switch(rq, prev); 2135 finish_lock_switch(rq, prev);
2136#ifdef CONFIG_SMP
2137 if (current->sched_class->post_schedule)
2138 current->sched_class->post_schedule(rq);
2139#endif
2140
1894 fire_sched_in_preempt_notifiers(current); 2141 fire_sched_in_preempt_notifiers(current);
1895 if (mm) 2142 if (mm)
1896 mmdrop(mm); 2143 mmdrop(mm);
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2124/* 2371/*
2125 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2372 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2126 */ 2373 */
2127static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2374static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2128 __releases(this_rq->lock) 2375 __releases(this_rq->lock)
2129 __acquires(busiest->lock) 2376 __acquires(busiest->lock)
2130 __acquires(this_rq->lock) 2377 __acquires(this_rq->lock)
2131{ 2378{
2379 int ret = 0;
2380
2132 if (unlikely(!irqs_disabled())) { 2381 if (unlikely(!irqs_disabled())) {
2133 /* printk() doesn't work good under rq->lock */ 2382 /* printk() doesn't work good under rq->lock */
2134 spin_unlock(&this_rq->lock); 2383 spin_unlock(&this_rq->lock);
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2139 spin_unlock(&this_rq->lock); 2388 spin_unlock(&this_rq->lock);
2140 spin_lock(&busiest->lock); 2389 spin_lock(&busiest->lock);
2141 spin_lock(&this_rq->lock); 2390 spin_lock(&this_rq->lock);
2391 ret = 1;
2142 } else 2392 } else
2143 spin_lock(&busiest->lock); 2393 spin_lock(&busiest->lock);
2144 } 2394 }
2395 return ret;
2145} 2396}
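Note that the return value now records whether this_rq->lock had to be dropped and re-taken to respect the lock ordering (ret == 1 on that path), presumably so that callers which cached state under the old lock hold know they must revalidate it.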
2146 2397
2147/* 2398/*
@@ -3485,12 +3736,14 @@ void scheduler_tick(void)
3485 /* 3736 /*
3486 * Let rq->clock advance by at least TICK_NSEC: 3737 * Let rq->clock advance by at least TICK_NSEC:
3487 */ 3738 */
3488 if (unlikely(rq->clock < next_tick)) 3739 if (unlikely(rq->clock < next_tick)) {
3489 rq->clock = next_tick; 3740 rq->clock = next_tick;
3741 rq->clock_underflows++;
3742 }
3490 rq->tick_timestamp = rq->clock; 3743 rq->tick_timestamp = rq->clock;
3491 update_cpu_load(rq); 3744 update_cpu_load(rq);
3492 if (curr != rq->idle) /* FIXME: needed? */ 3745 curr->sched_class->task_tick(rq, curr, 0);
3493 curr->sched_class->task_tick(rq, curr); 3746 update_sched_rt_period(rq);
3494 spin_unlock(&rq->lock); 3747 spin_unlock(&rq->lock);
3495 3748
3496#ifdef CONFIG_SMP 3749#ifdef CONFIG_SMP
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible:
3636 3889
3637 schedule_debug(prev); 3890 schedule_debug(prev);
3638 3891
3892 hrtick_clear(rq);
3893
3639 /* 3894 /*
3640 * Do the rq-clock update outside the rq lock: 3895 * Do the rq-clock update outside the rq lock:
3641 */ 3896 */
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible:
3654 switch_count = &prev->nvcsw; 3909 switch_count = &prev->nvcsw;
3655 } 3910 }
3656 3911
3912#ifdef CONFIG_SMP
3913 if (prev->sched_class->pre_schedule)
3914 prev->sched_class->pre_schedule(rq, prev);
3915#endif
3916
3657 if (unlikely(!rq->nr_running)) 3917 if (unlikely(!rq->nr_running))
3658 idle_balance(cpu, rq); 3918 idle_balance(cpu, rq);
3659 3919
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible:
3668 ++*switch_count; 3928 ++*switch_count;
3669 3929
3670 context_switch(rq, prev, next); /* unlocks the rq */ 3930 context_switch(rq, prev, next); /* unlocks the rq */
3931 /*
3932 * the context switch might have flipped the stack from under
3933 * us, hence refresh the local variables.
3934 */
3935 cpu = smp_processor_id();
3936 rq = cpu_rq(cpu);
3671 } else 3937 } else
3672 spin_unlock_irq(&rq->lock); 3938 spin_unlock_irq(&rq->lock);
3673 3939
3674 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3940 hrtick_set(rq);
3675 cpu = smp_processor_id(); 3941
3676 rq = cpu_rq(cpu); 3942 if (unlikely(reacquire_kernel_lock(current) < 0))
3677 goto need_resched_nonpreemptible; 3943 goto need_resched_nonpreemptible;
3678 } 3944
3679 preempt_enable_no_resched(); 3945 preempt_enable_no_resched();
3680 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3946 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3681 goto need_resched; 3947 goto need_resched;
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule);
3691asmlinkage void __sched preempt_schedule(void) 3957asmlinkage void __sched preempt_schedule(void)
3692{ 3958{
3693 struct thread_info *ti = current_thread_info(); 3959 struct thread_info *ti = current_thread_info();
3694#ifdef CONFIG_PREEMPT_BKL
3695 struct task_struct *task = current; 3960 struct task_struct *task = current;
3696 int saved_lock_depth; 3961 int saved_lock_depth;
3697#endif 3962
3698 /* 3963 /*
3699 * If there is a non-zero preempt_count or interrupts are disabled, 3964 * If there is a non-zero preempt_count or interrupts are disabled,
3700 * we do not want to preempt the current task. Just return.. 3965 * we do not want to preempt the current task. Just return..
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void)
3710 * clear ->lock_depth so that schedule() doesnt 3975 * clear ->lock_depth so that schedule() doesnt
3711 * auto-release the semaphore: 3976 * auto-release the semaphore:
3712 */ 3977 */
3713#ifdef CONFIG_PREEMPT_BKL
3714 saved_lock_depth = task->lock_depth; 3978 saved_lock_depth = task->lock_depth;
3715 task->lock_depth = -1; 3979 task->lock_depth = -1;
3716#endif
3717 schedule(); 3980 schedule();
3718#ifdef CONFIG_PREEMPT_BKL
3719 task->lock_depth = saved_lock_depth; 3981 task->lock_depth = saved_lock_depth;
3720#endif
3721 sub_preempt_count(PREEMPT_ACTIVE); 3982 sub_preempt_count(PREEMPT_ACTIVE);
3722 3983
3723 /* 3984 /*
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule);
3738asmlinkage void __sched preempt_schedule_irq(void) 3999asmlinkage void __sched preempt_schedule_irq(void)
3739{ 4000{
3740 struct thread_info *ti = current_thread_info(); 4001 struct thread_info *ti = current_thread_info();
3741#ifdef CONFIG_PREEMPT_BKL
3742 struct task_struct *task = current; 4002 struct task_struct *task = current;
3743 int saved_lock_depth; 4003 int saved_lock_depth;
3744#endif 4004
3745 /* Catch callers which need to be fixed */ 4005 /* Catch callers which need to be fixed */
3746 BUG_ON(ti->preempt_count || !irqs_disabled()); 4006 BUG_ON(ti->preempt_count || !irqs_disabled());
3747 4007
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
3753 * clear ->lock_depth so that schedule() doesnt 4013 * clear ->lock_depth so that schedule() doesnt
3754 * auto-release the semaphore: 4014 * auto-release the semaphore:
3755 */ 4015 */
3756#ifdef CONFIG_PREEMPT_BKL
3757 saved_lock_depth = task->lock_depth; 4016 saved_lock_depth = task->lock_depth;
3758 task->lock_depth = -1; 4017 task->lock_depth = -1;
3759#endif
3760 local_irq_enable(); 4018 local_irq_enable();
3761 schedule(); 4019 schedule();
3762 local_irq_disable(); 4020 local_irq_disable();
3763#ifdef CONFIG_PREEMPT_BKL
3764 task->lock_depth = saved_lock_depth; 4021 task->lock_depth = saved_lock_depth;
3765#endif
3766 sub_preempt_count(PREEMPT_ACTIVE); 4022 sub_preempt_count(PREEMPT_ACTIVE);
3767 4023
3768 /* 4024 /*
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4019 unsigned long flags; 4275 unsigned long flags;
4020 int oldprio, on_rq, running; 4276 int oldprio, on_rq, running;
4021 struct rq *rq; 4277 struct rq *rq;
4278 const struct sched_class *prev_class = p->sched_class;
4022 4279
4023 BUG_ON(prio < 0 || prio > MAX_PRIO); 4280 BUG_ON(prio < 0 || prio > MAX_PRIO);
4024 4281
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4044 if (on_rq) { 4301 if (on_rq) {
4045 if (running) 4302 if (running)
4046 p->sched_class->set_curr_task(rq); 4303 p->sched_class->set_curr_task(rq);
4304
4047 enqueue_task(rq, p, 0); 4305 enqueue_task(rq, p, 0);
4048 /* 4306
4049 * Reschedule if we are currently running on this runqueue and 4307 check_class_changed(rq, p, prev_class, oldprio, running);
4050 * our priority decreased, or if we are not currently running on
4051 * this runqueue and our priority is higher than the current's
4052 */
4053 if (running) {
4054 if (p->prio > oldprio)
4055 resched_task(rq->curr);
4056 } else {
4057 check_preempt_curr(rq, p);
4058 }
4059 } 4308 }
4060 task_rq_unlock(rq, &flags); 4309 task_rq_unlock(rq, &flags);
4061} 4310}
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice)
4087 goto out_unlock; 4336 goto out_unlock;
4088 } 4337 }
4089 on_rq = p->se.on_rq; 4338 on_rq = p->se.on_rq;
4090 if (on_rq) { 4339 if (on_rq)
4091 dequeue_task(rq, p, 0); 4340 dequeue_task(rq, p, 0);
4092 dec_load(rq, p);
4093 }
4094 4341
4095 p->static_prio = NICE_TO_PRIO(nice); 4342 p->static_prio = NICE_TO_PRIO(nice);
4096 set_load_weight(p); 4343 set_load_weight(p);
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice)
4100 4347
4101 if (on_rq) { 4348 if (on_rq) {
4102 enqueue_task(rq, p, 0); 4349 enqueue_task(rq, p, 0);
4103 inc_load(rq, p);
4104 /* 4350 /*
4105 * If the task increased its priority or is running and 4351 * If the task increased its priority or is running and
4106 * lowered its priority, then reschedule its CPU: 4352 * lowered its priority, then reschedule its CPU:
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
4258{ 4504{
4259 int retval, oldprio, oldpolicy = -1, on_rq, running; 4505 int retval, oldprio, oldpolicy = -1, on_rq, running;
4260 unsigned long flags; 4506 unsigned long flags;
4507 const struct sched_class *prev_class = p->sched_class;
4261 struct rq *rq; 4508 struct rq *rq;
4262 4509
4263 /* may grab non-irq protected spin_locks */ 4510 /* may grab non-irq protected spin_locks */
@@ -4351,18 +4598,10 @@ recheck:
4351 if (on_rq) { 4598 if (on_rq) {
4352 if (running) 4599 if (running)
4353 p->sched_class->set_curr_task(rq); 4600 p->sched_class->set_curr_task(rq);
4601
4354 activate_task(rq, p, 0); 4602 activate_task(rq, p, 0);
4355 /* 4603
4356 * Reschedule if we are currently running on this runqueue and 4604 check_class_changed(rq, p, prev_class, oldprio, running);
4357 * our priority decreased, or if we are not currently running on
4358 * this runqueue and our priority is higher than the current's
4359 */
4360 if (running) {
4361 if (p->prio > oldprio)
4362 resched_task(rq->curr);
4363 } else {
4364 check_preempt_curr(rq, p);
4365 }
4366 } 4605 }
4367 __task_rq_unlock(rq); 4606 __task_rq_unlock(rq);
4368 spin_unlock_irqrestore(&p->pi_lock, flags); 4607 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4490 struct task_struct *p; 4729 struct task_struct *p;
4491 int retval; 4730 int retval;
4492 4731
4493 mutex_lock(&sched_hotcpu_mutex); 4732 get_online_cpus();
4494 read_lock(&tasklist_lock); 4733 read_lock(&tasklist_lock);
4495 4734
4496 p = find_process_by_pid(pid); 4735 p = find_process_by_pid(pid);
4497 if (!p) { 4736 if (!p) {
4498 read_unlock(&tasklist_lock); 4737 read_unlock(&tasklist_lock);
4499 mutex_unlock(&sched_hotcpu_mutex); 4738 put_online_cpus();
4500 return -ESRCH; 4739 return -ESRCH;
4501 } 4740 }
4502 4741
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4536 } 4775 }
4537out_unlock: 4776out_unlock:
4538 put_task_struct(p); 4777 put_task_struct(p);
4539 mutex_unlock(&sched_hotcpu_mutex); 4778 put_online_cpus();
4540 return retval; 4779 return retval;
4541} 4780}
4542 4781
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4593 struct task_struct *p; 4832 struct task_struct *p;
4594 int retval; 4833 int retval;
4595 4834
4596 mutex_lock(&sched_hotcpu_mutex); 4835 get_online_cpus();
4597 read_lock(&tasklist_lock); 4836 read_lock(&tasklist_lock);
4598 4837
4599 retval = -ESRCH; 4838 retval = -ESRCH;
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4609 4848
4610out_unlock: 4849out_unlock:
4611 read_unlock(&tasklist_lock); 4850 read_unlock(&tasklist_lock);
4612 mutex_unlock(&sched_hotcpu_mutex); 4851 put_online_cpus();
4613 4852
4614 return retval; 4853 return retval;
4615} 4854}
@@ -4683,7 +4922,8 @@ static void __cond_resched(void)
4683 } while (need_resched()); 4922 } while (need_resched());
4684} 4923}
4685 4924
4686int __sched cond_resched(void) 4925#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4926int __sched _cond_resched(void)
4687{ 4927{
4688 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4928 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4689 system_state == SYSTEM_RUNNING) { 4929 system_state == SYSTEM_RUNNING) {
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void)
4692 } 4932 }
4693 return 0; 4933 return 0;
4694} 4934}
4695EXPORT_SYMBOL(cond_resched); 4935EXPORT_SYMBOL(_cond_resched);
4936#endif
4696 4937
4697/* 4938/*
4698 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4890,7 +5131,7 @@ out_unlock:
4890 5131
4891static const char stat_nam[] = "RSDTtZX"; 5132static const char stat_nam[] = "RSDTtZX";
4892 5133
4893static void show_task(struct task_struct *p) 5134void sched_show_task(struct task_struct *p)
4894{ 5135{
4895 unsigned long free = 0; 5136 unsigned long free = 0;
4896 unsigned state; 5137 unsigned state;
@@ -4920,8 +5161,7 @@ static void show_task(struct task_struct *p)
4920 printk(KERN_CONT "%5lu %5d %6d\n", free, 5161 printk(KERN_CONT "%5lu %5d %6d\n", free,
4921 task_pid_nr(p), task_pid_nr(p->real_parent)); 5162 task_pid_nr(p), task_pid_nr(p->real_parent));
4922 5163
4923 if (state != TASK_RUNNING) 5164 show_stack(p, NULL);
4924 show_stack(p, NULL);
4925} 5165}
4926 5166
4927void show_state_filter(unsigned long state_filter) 5167void show_state_filter(unsigned long state_filter)
@@ -4943,7 +5183,7 @@ void show_state_filter(unsigned long state_filter)
4943 */ 5183 */
4944 touch_nmi_watchdog(); 5184 touch_nmi_watchdog();
4945 if (!state_filter || (p->state & state_filter)) 5185 if (!state_filter || (p->state & state_filter))
4946 show_task(p); 5186 sched_show_task(p);
4947 } while_each_thread(g, p); 5187 } while_each_thread(g, p);
4948 5188
4949 touch_all_softlockup_watchdogs(); 5189 touch_all_softlockup_watchdogs();
@@ -4992,11 +5232,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4992 spin_unlock_irqrestore(&rq->lock, flags); 5232 spin_unlock_irqrestore(&rq->lock, flags);
4993 5233
4994 /* Set the preempt count _outside_ the spinlocks! */ 5234 /* Set the preempt count _outside_ the spinlocks! */
4995#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4996 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4997#else
4998 task_thread_info(idle)->preempt_count = 0; 5235 task_thread_info(idle)->preempt_count = 0;
4999#endif 5236
5000 /* 5237 /*
5001 * The idle tasks have their own, simple scheduling class: 5238 * The idle tasks have their own, simple scheduling class:
5002 */ 5239 */
@@ -5077,7 +5314,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5077 goto out; 5314 goto out;
5078 } 5315 }
5079 5316
5080 p->cpus_allowed = new_mask; 5317 if (p->sched_class->set_cpus_allowed)
5318 p->sched_class->set_cpus_allowed(p, &new_mask);
5319 else {
5320 p->cpus_allowed = new_mask;
5321 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5322 }
5323
5081 /* Can the task run on the task's current CPU? If so, we're done */ 5324 /* Can the task run on the task's current CPU? If so, we're done */
5082 if (cpu_isset(task_cpu(p), new_mask)) 5325 if (cpu_isset(task_cpu(p), new_mask))
5083 goto out; 5326 goto out;
@@ -5569,9 +5812,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5569 struct rq *rq; 5812 struct rq *rq;
5570 5813
5571 switch (action) { 5814 switch (action) {
5572 case CPU_LOCK_ACQUIRE:
5573 mutex_lock(&sched_hotcpu_mutex);
5574 break;
5575 5815
5576 case CPU_UP_PREPARE: 5816 case CPU_UP_PREPARE:
5577 case CPU_UP_PREPARE_FROZEN: 5817 case CPU_UP_PREPARE_FROZEN:
@@ -5590,6 +5830,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5590 case CPU_ONLINE_FROZEN: 5830 case CPU_ONLINE_FROZEN:
5591 /* Strictly unnecessary, as first user will wake it. */ 5831 /* Strictly unnecessary, as first user will wake it. */
5592 wake_up_process(cpu_rq(cpu)->migration_thread); 5832 wake_up_process(cpu_rq(cpu)->migration_thread);
5833
5834 /* Update our root-domain */
5835 rq = cpu_rq(cpu);
5836 spin_lock_irqsave(&rq->lock, flags);
5837 if (rq->rd) {
5838 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5839 cpu_set(cpu, rq->rd->online);
5840 }
5841 spin_unlock_irqrestore(&rq->lock, flags);
5593 break; 5842 break;
5594 5843
5595#ifdef CONFIG_HOTPLUG_CPU 5844#ifdef CONFIG_HOTPLUG_CPU
@@ -5640,10 +5889,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5640 } 5889 }
5641 spin_unlock_irq(&rq->lock); 5890 spin_unlock_irq(&rq->lock);
5642 break; 5891 break;
5643#endif 5892
5644 case CPU_LOCK_RELEASE: 5893 case CPU_DOWN_PREPARE:
5645 mutex_unlock(&sched_hotcpu_mutex); 5894 /* Update our root-domain */
5895 rq = cpu_rq(cpu);
5896 spin_lock_irqsave(&rq->lock, flags);
5897 if (rq->rd) {
5898 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5899 cpu_clear(cpu, rq->rd->online);
5900 }
5901 spin_unlock_irqrestore(&rq->lock, flags);
5646 break; 5902 break;
5903#endif
5647 } 5904 }
5648 return NOTIFY_OK; 5905 return NOTIFY_OK;
5649} 5906}
@@ -5831,11 +6088,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5831 return 1; 6088 return 1;
5832} 6089}
5833 6090
6091static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6092{
6093 unsigned long flags;
6094 const struct sched_class *class;
6095
6096 spin_lock_irqsave(&rq->lock, flags);
6097
6098 if (rq->rd) {
6099 struct root_domain *old_rd = rq->rd;
6100
6101 for (class = sched_class_highest; class; class = class->next) {
6102 if (class->leave_domain)
6103 class->leave_domain(rq);
6104 }
6105
6106 cpu_clear(rq->cpu, old_rd->span);
6107 cpu_clear(rq->cpu, old_rd->online);
6108
6109 if (atomic_dec_and_test(&old_rd->refcount))
6110 kfree(old_rd);
6111 }
6112
6113 atomic_inc(&rd->refcount);
6114 rq->rd = rd;
6115
6116 cpu_set(rq->cpu, rd->span);
6117 if (cpu_isset(rq->cpu, cpu_online_map))
6118 cpu_set(rq->cpu, rd->online);
6119
6120 for (class = sched_class_highest; class; class = class->next) {
6121 if (class->join_domain)
6122 class->join_domain(rq);
6123 }
6124
6125 spin_unlock_irqrestore(&rq->lock, flags);
6126}
6127
6128static void init_rootdomain(struct root_domain *rd)
6129{
6130 memset(rd, 0, sizeof(*rd));
6131
6132 cpus_clear(rd->span);
6133 cpus_clear(rd->online);
6134}
6135
6136static void init_defrootdomain(void)
6137{
6138 init_rootdomain(&def_root_domain);
6139 atomic_set(&def_root_domain.refcount, 1);
6140}
6141
6142static struct root_domain *alloc_rootdomain(void)
6143{
6144 struct root_domain *rd;
6145
6146 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6147 if (!rd)
6148 return NULL;
6149
6150 init_rootdomain(rd);
6151
6152 return rd;
6153}
6154
5834/* 6155/*
5835 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6156 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5836 * hold the hotplug lock. 6157 * hold the hotplug lock.
5837 */ 6158 */
5838static void cpu_attach_domain(struct sched_domain *sd, int cpu) 6159static void
6160cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5839{ 6161{
5840 struct rq *rq = cpu_rq(cpu); 6162 struct rq *rq = cpu_rq(cpu);
5841 struct sched_domain *tmp; 6163 struct sched_domain *tmp;
@@ -5860,6 +6182,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5860 6182
5861 sched_domain_debug(sd, cpu); 6183 sched_domain_debug(sd, cpu);
5862 6184
6185 rq_attach_root(rq, rd);
5863 rcu_assign_pointer(rq->sd, sd); 6186 rcu_assign_pointer(rq->sd, sd);
5864} 6187}
5865 6188
@@ -6228,6 +6551,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6228static int build_sched_domains(const cpumask_t *cpu_map) 6551static int build_sched_domains(const cpumask_t *cpu_map)
6229{ 6552{
6230 int i; 6553 int i;
6554 struct root_domain *rd;
6231#ifdef CONFIG_NUMA 6555#ifdef CONFIG_NUMA
6232 struct sched_group **sched_group_nodes = NULL; 6556 struct sched_group **sched_group_nodes = NULL;
6233 int sd_allnodes = 0; 6557 int sd_allnodes = 0;
@@ -6244,6 +6568,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6244 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6568 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6245#endif 6569#endif
6246 6570
6571 rd = alloc_rootdomain();
6572 if (!rd) {
6573 printk(KERN_WARNING "Cannot alloc root domain\n");
6574 return -ENOMEM;
6575 }
6576
6247 /* 6577 /*
6248 * Set up domains for cpus specified by the cpu_map. 6578 * Set up domains for cpus specified by the cpu_map.
6249 */ 6579 */
@@ -6460,7 +6790,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6460#else 6790#else
6461 sd = &per_cpu(phys_domains, i); 6791 sd = &per_cpu(phys_domains, i);
6462#endif 6792#endif
6463 cpu_attach_domain(sd, i); 6793 cpu_attach_domain(sd, rd, i);
6464 } 6794 }
6465 6795
6466 return 0; 6796 return 0;
@@ -6518,7 +6848,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6518 unregister_sched_domain_sysctl(); 6848 unregister_sched_domain_sysctl();
6519 6849
6520 for_each_cpu_mask(i, *cpu_map) 6850 for_each_cpu_mask(i, *cpu_map)
6521 cpu_attach_domain(NULL, i); 6851 cpu_attach_domain(NULL, &def_root_domain, i);
6522 synchronize_sched(); 6852 synchronize_sched();
6523 arch_destroy_sched_domains(cpu_map); 6853 arch_destroy_sched_domains(cpu_map);
6524} 6854}
@@ -6548,6 +6878,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6548{ 6878{
6549 int i, j; 6879 int i, j;
6550 6880
6881 lock_doms_cur();
6882
6551 /* always unregister in case we don't destroy any domains */ 6883 /* always unregister in case we don't destroy any domains */
6552 unregister_sched_domain_sysctl(); 6884 unregister_sched_domain_sysctl();
6553 6885
@@ -6588,6 +6920,8 @@ match2:
6588 ndoms_cur = ndoms_new; 6920 ndoms_cur = ndoms_new;
6589 6921
6590 register_sched_domain_sysctl(); 6922 register_sched_domain_sysctl();
6923
6924 unlock_doms_cur();
6591} 6925}
6592 6926
6593#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6927#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6595,10 +6929,10 @@ static int arch_reinit_sched_domains(void)
6595{ 6929{
6596 int err; 6930 int err;
6597 6931
6598 mutex_lock(&sched_hotcpu_mutex); 6932 get_online_cpus();
6599 detach_destroy_domains(&cpu_online_map); 6933 detach_destroy_domains(&cpu_online_map);
6600 err = arch_init_sched_domains(&cpu_online_map); 6934 err = arch_init_sched_domains(&cpu_online_map);
6601 mutex_unlock(&sched_hotcpu_mutex); 6935 put_online_cpus();
6602 6936
6603 return err; 6937 return err;
6604} 6938}
@@ -6709,12 +7043,12 @@ void __init sched_init_smp(void)
6709{ 7043{
6710 cpumask_t non_isolated_cpus; 7044 cpumask_t non_isolated_cpus;
6711 7045
6712 mutex_lock(&sched_hotcpu_mutex); 7046 get_online_cpus();
6713 arch_init_sched_domains(&cpu_online_map); 7047 arch_init_sched_domains(&cpu_online_map);
6714 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7048 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6715 if (cpus_empty(non_isolated_cpus)) 7049 if (cpus_empty(non_isolated_cpus))
6716 cpu_set(smp_processor_id(), non_isolated_cpus); 7050 cpu_set(smp_processor_id(), non_isolated_cpus);
6717 mutex_unlock(&sched_hotcpu_mutex); 7051 put_online_cpus();
6718 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7052 /* XXX: Theoretical race here - CPU may be hotplugged now */
6719 hotcpu_notifier(update_sched_domains, 0); 7053 hotcpu_notifier(update_sched_domains, 0);
6720 7054
@@ -6722,6 +7056,21 @@ void __init sched_init_smp(void)
6722 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7056 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6723 BUG(); 7057 BUG();
6724 sched_init_granularity(); 7058 sched_init_granularity();
7059
7060#ifdef CONFIG_FAIR_GROUP_SCHED
7061 if (nr_cpu_ids == 1)
7062 return;
7063
7064 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7065 "group_balance");
7066 if (!IS_ERR(lb_monitor_task)) {
7067 lb_monitor_task->flags |= PF_NOFREEZE;
7068 wake_up_process(lb_monitor_task);
7069 } else {
 7070		printk(KERN_ERR "Could not create load balance monitor thread"
 7071		       " (error = %ld)\n", PTR_ERR(lb_monitor_task));
7072 }
7073#endif
6725} 7074}
6726#else 7075#else
6727void __init sched_init_smp(void) 7076void __init sched_init_smp(void)
@@ -6746,13 +7095,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6746 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7095 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6747} 7096}
6748 7097
7098static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7099{
7100 struct rt_prio_array *array;
7101 int i;
7102
7103 array = &rt_rq->active;
7104 for (i = 0; i < MAX_RT_PRIO; i++) {
7105 INIT_LIST_HEAD(array->queue + i);
7106 __clear_bit(i, array->bitmap);
7107 }
7108 /* delimiter for bitsearch: */
7109 __set_bit(MAX_RT_PRIO, array->bitmap);
7110
7111#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7112 rt_rq->highest_prio = MAX_RT_PRIO;
7113#endif
7114#ifdef CONFIG_SMP
7115 rt_rq->rt_nr_migratory = 0;
7116 rt_rq->overloaded = 0;
7117#endif
7118
7119 rt_rq->rt_time = 0;
7120 rt_rq->rt_throttled = 0;
7121
7122#ifdef CONFIG_FAIR_GROUP_SCHED
7123 rt_rq->rq = rq;
7124#endif
7125}
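The always-set bit installed at MAX_RT_PRIO above is a delimiter for the priority-bitmap search: a find-first-bit scan over the array is then guaranteed to terminate, and landing on that bit means no RT task is queued. Below is a minimal user-space sketch of the idea; the toy bitmap and the linear scan are illustrative stand-ins, not the kernel's find_first_bit().

#include <stdio.h>
#include <stdint.h>

#define MAX_RT_PRIO 100
#define BITS 64

int main(void)
{
	/* toy bitmap covering priorities 0..MAX_RT_PRIO, all clear initially */
	uint64_t bitmap[(MAX_RT_PRIO + 1 + BITS - 1) / BITS] = { 0 };
	int i, first = -1;

	/* delimiter: the bit at MAX_RT_PRIO is always set */
	bitmap[MAX_RT_PRIO / BITS] |= 1ULL << (MAX_RT_PRIO % BITS);

	/* a first-set-bit scan therefore always terminates;    */
	/* landing on the delimiter means no RT task is queued  */
	for (i = 0; i <= MAX_RT_PRIO; i++) {
		if (bitmap[i / BITS] & (1ULL << (i % BITS))) {
			first = i;
			break;
		}
	}

	printf("first set bit = %d (MAX_RT_PRIO means the queue is empty)\n", first);
	return 0;
}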
7126
7127#ifdef CONFIG_FAIR_GROUP_SCHED
7128static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7129 struct cfs_rq *cfs_rq, struct sched_entity *se,
7130 int cpu, int add)
7131{
7132 tg->cfs_rq[cpu] = cfs_rq;
7133 init_cfs_rq(cfs_rq, rq);
7134 cfs_rq->tg = tg;
7135 if (add)
7136 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7137
7138 tg->se[cpu] = se;
7139 se->cfs_rq = &rq->cfs;
7140 se->my_q = cfs_rq;
7141 se->load.weight = tg->shares;
7142 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7143 se->parent = NULL;
7144}
7145
7146static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7147 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7148 int cpu, int add)
7149{
7150 tg->rt_rq[cpu] = rt_rq;
7151 init_rt_rq(rt_rq, rq);
7152 rt_rq->tg = tg;
7153 rt_rq->rt_se = rt_se;
7154 if (add)
7155 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7156
7157 tg->rt_se[cpu] = rt_se;
7158 rt_se->rt_rq = &rq->rt;
7159 rt_se->my_q = rt_rq;
7160 rt_se->parent = NULL;
7161 INIT_LIST_HEAD(&rt_se->run_list);
7162}
7163#endif
7164
6749void __init sched_init(void) 7165void __init sched_init(void)
6750{ 7166{
6751 int highest_cpu = 0; 7167 int highest_cpu = 0;
6752 int i, j; 7168 int i, j;
6753 7169
7170#ifdef CONFIG_SMP
7171 init_defrootdomain();
7172#endif
7173
7174#ifdef CONFIG_FAIR_GROUP_SCHED
7175 list_add(&init_task_group.list, &task_groups);
7176#endif
7177
6754 for_each_possible_cpu(i) { 7178 for_each_possible_cpu(i) {
6755 struct rt_prio_array *array;
6756 struct rq *rq; 7179 struct rq *rq;
6757 7180
6758 rq = cpu_rq(i); 7181 rq = cpu_rq(i);
@@ -6761,52 +7184,39 @@ void __init sched_init(void)
6761 rq->nr_running = 0; 7184 rq->nr_running = 0;
6762 rq->clock = 1; 7185 rq->clock = 1;
6763 init_cfs_rq(&rq->cfs, rq); 7186 init_cfs_rq(&rq->cfs, rq);
7187 init_rt_rq(&rq->rt, rq);
6764#ifdef CONFIG_FAIR_GROUP_SCHED 7188#ifdef CONFIG_FAIR_GROUP_SCHED
6765 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6766 {
6767 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6768 struct sched_entity *se =
6769 &per_cpu(init_sched_entity, i);
6770
6771 init_cfs_rq_p[i] = cfs_rq;
6772 init_cfs_rq(cfs_rq, rq);
6773 cfs_rq->tg = &init_task_group;
6774 list_add(&cfs_rq->leaf_cfs_rq_list,
6775 &rq->leaf_cfs_rq_list);
6776
6777 init_sched_entity_p[i] = se;
6778 se->cfs_rq = &rq->cfs;
6779 se->my_q = cfs_rq;
6780 se->load.weight = init_task_group_load;
6781 se->load.inv_weight =
6782 div64_64(1ULL<<32, init_task_group_load);
6783 se->parent = NULL;
6784 }
6785 init_task_group.shares = init_task_group_load; 7189 init_task_group.shares = init_task_group_load;
6786 spin_lock_init(&init_task_group.lock); 7190 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7191 init_tg_cfs_entry(rq, &init_task_group,
7192 &per_cpu(init_cfs_rq, i),
7193 &per_cpu(init_sched_entity, i), i, 1);
7194
7195 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7196 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7197 init_tg_rt_entry(rq, &init_task_group,
7198 &per_cpu(init_rt_rq, i),
7199 &per_cpu(init_sched_rt_entity, i), i, 1);
6787#endif 7200#endif
7201 rq->rt_period_expire = 0;
7202 rq->rt_throttled = 0;
6788 7203
6789 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7204 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6790 rq->cpu_load[j] = 0; 7205 rq->cpu_load[j] = 0;
6791#ifdef CONFIG_SMP 7206#ifdef CONFIG_SMP
6792 rq->sd = NULL; 7207 rq->sd = NULL;
7208 rq->rd = NULL;
6793 rq->active_balance = 0; 7209 rq->active_balance = 0;
6794 rq->next_balance = jiffies; 7210 rq->next_balance = jiffies;
6795 rq->push_cpu = 0; 7211 rq->push_cpu = 0;
6796 rq->cpu = i; 7212 rq->cpu = i;
6797 rq->migration_thread = NULL; 7213 rq->migration_thread = NULL;
6798 INIT_LIST_HEAD(&rq->migration_queue); 7214 INIT_LIST_HEAD(&rq->migration_queue);
7215 rq_attach_root(rq, &def_root_domain);
6799#endif 7216#endif
7217 init_rq_hrtick(rq);
6800 atomic_set(&rq->nr_iowait, 0); 7218 atomic_set(&rq->nr_iowait, 0);
6801
6802 array = &rq->rt.active;
6803 for (j = 0; j < MAX_RT_PRIO; j++) {
6804 INIT_LIST_HEAD(array->queue + j);
6805 __clear_bit(j, array->bitmap);
6806 }
6807 highest_cpu = i; 7219 highest_cpu = i;
6808 /* delimiter for bitsearch: */
6809 __set_bit(MAX_RT_PRIO, array->bitmap);
6810 } 7220 }
6811 7221
6812 set_load_weight(&init_task); 7222 set_load_weight(&init_task);
@@ -6975,12 +7385,187 @@ void set_curr_task(int cpu, struct task_struct *p)
6975 7385
6976#ifdef CONFIG_FAIR_GROUP_SCHED 7386#ifdef CONFIG_FAIR_GROUP_SCHED
6977 7387
7388#ifdef CONFIG_SMP
7389/*
7390 * distribute shares of all task groups among their schedulable entities,
7391 * to reflect load distribution across cpus.
7392 */
7393static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7394{
7395 struct cfs_rq *cfs_rq;
7396 struct rq *rq = cpu_rq(this_cpu);
7397 cpumask_t sdspan = sd->span;
7398 int balanced = 1;
7399
 7400	/* Walk through all the task groups that we have */
7401 for_each_leaf_cfs_rq(rq, cfs_rq) {
7402 int i;
7403 unsigned long total_load = 0, total_shares;
7404 struct task_group *tg = cfs_rq->tg;
7405
7406 /* Gather total task load of this group across cpus */
7407 for_each_cpu_mask(i, sdspan)
7408 total_load += tg->cfs_rq[i]->load.weight;
7409
7410 /* Nothing to do if this group has no load */
7411 if (!total_load)
7412 continue;
7413
7414 /*
7415 * tg->shares represents the number of cpu shares the task group
7416 * is eligible to hold on a single cpu. On N cpus, it is
7417 * eligible to hold (N * tg->shares) number of cpu shares.
7418 */
7419 total_shares = tg->shares * cpus_weight(sdspan);
7420
7421 /*
7422 * redistribute total_shares across cpus as per the task load
7423 * distribution.
7424 */
7425 for_each_cpu_mask(i, sdspan) {
7426 unsigned long local_load, local_shares;
7427
7428 local_load = tg->cfs_rq[i]->load.weight;
7429 local_shares = (local_load * total_shares) / total_load;
7430 if (!local_shares)
7431 local_shares = MIN_GROUP_SHARES;
7432 if (local_shares == tg->se[i]->load.weight)
7433 continue;
7434
7435 spin_lock_irq(&cpu_rq(i)->lock);
7436 set_se_shares(tg->se[i], local_shares);
7437 spin_unlock_irq(&cpu_rq(i)->lock);
7438 balanced = 0;
7439 }
7440 }
7441
7442 return balanced;
7443}
7444
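rebalance_shares() above splits the group's aggregate entitlement proportionally: each cpu receives total_shares scaled by its fraction of the group's load, floored at MIN_GROUP_SHARES. A standalone sketch of that arithmetic follows; the two-cpu domain, the group's 1000 shares and the load figures are invented for illustration.

#include <stdio.h>

#define MIN_GROUP_SHARES 2	/* assumed floor, mirroring the patch's minimum */

int main(void)
{
	unsigned long tg_shares = 1000;		/* shares assigned to the group */
	unsigned long load[2] = { 3072, 1024 };	/* per-cpu weight of the group's cfs_rq */
	unsigned long total_load = 0, total_shares;
	int i, ncpus = 2;

	for (i = 0; i < ncpus; i++)
		total_load += load[i];

	/* on N cpus the group may hold N * tg->shares in total */
	total_shares = tg_shares * ncpus;

	for (i = 0; i < ncpus; i++) {
		unsigned long local = (load[i] * total_shares) / total_load;
		if (!local)
			local = MIN_GROUP_SHARES;
		printf("cpu%d: %lu shares\n", i, local);
	}
	return 0;
}

With a 3:1 load split, cpu0 ends up with 1500 of the 2000 total shares and cpu1 with the remaining 500.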
7445/*
7446 * How frequently should we rebalance_shares() across cpus?
7447 *
 7448	 * The more frequently we rebalance shares, the more accurate the fairness
 7449	 * of cpu bandwidth distribution between task groups. However, a higher
 7450	 * frequency also implies increased scheduling overhead.
7451 *
7452 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7453 * consecutive calls to rebalance_shares() in the same sched domain.
7454 *
7455 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7456 * consecutive calls to rebalance_shares() in the same sched domain.
7457 *
 7458	 * These settings allow for the appropriate trade-off between accuracy of
7459 * fairness and the associated overhead.
7460 *
7461 */
7462
7463/* default: 8ms, units: milliseconds */
7464const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7465
7466/* default: 128ms, units: milliseconds */
7467const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7468
7469/* kernel thread that runs rebalance_shares() periodically */
7470static int load_balance_monitor(void *unused)
7471{
7472 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7473 struct sched_param schedparm;
7474 int ret;
7475
7476 /*
7477 * We don't want this thread's execution to be limited by the shares
 7478	 * assigned to the default group (init_task_group). Hence make it run
7479 * as a SCHED_RR RT task at the lowest priority.
7480 */
7481 schedparm.sched_priority = 1;
7482 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7483 if (ret)
7484 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
 7485		       " monitor thread (error = %d)\n", ret);
7486
7487 while (!kthread_should_stop()) {
7488 int i, cpu, balanced = 1;
7489
7490 /* Prevent cpus going down or coming up */
7491 get_online_cpus();
7492 /* lockout changes to doms_cur[] array */
7493 lock_doms_cur();
7494 /*
 7495		 * Enter an RCU read-side critical section to safely walk rq->sd
7496 * chain on various cpus and to walk task group list
7497 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7498 */
7499 rcu_read_lock();
7500
7501 for (i = 0; i < ndoms_cur; i++) {
7502 cpumask_t cpumap = doms_cur[i];
7503 struct sched_domain *sd = NULL, *sd_prev = NULL;
7504
7505 cpu = first_cpu(cpumap);
7506
7507 /* Find the highest domain at which to balance shares */
7508 for_each_domain(cpu, sd) {
7509 if (!(sd->flags & SD_LOAD_BALANCE))
7510 continue;
7511 sd_prev = sd;
7512 }
7513
7514 sd = sd_prev;
 7515			/* sd == NULL? No load balance required in this domain */
7516 if (!sd)
7517 continue;
7518
7519 balanced &= rebalance_shares(sd, cpu);
7520 }
7521
7522 rcu_read_unlock();
7523
7524 unlock_doms_cur();
7525 put_online_cpus();
7526
7527 if (!balanced)
7528 timeout = sysctl_sched_min_bal_int_shares;
7529 else if (timeout < sysctl_sched_max_bal_int_shares)
7530 timeout *= 2;
7531
7532 msleep_interruptible(timeout);
7533 }
7534
7535 return 0;
7536}
7537#endif /* CONFIG_SMP */
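The monitor's sleep interval follows a simple exponential back-off between the two sysctls described above: any pass that had to move shares resets it to the minimum, while an idle pass doubles it up to the maximum. The helper below is a free-standing sketch of just that decision; it is not part of the patch, and the constants mirror the defaults above.

#include <stdio.h>

static unsigned int next_interval(unsigned int timeout, int balanced)
{
	const unsigned int min_int = 8;		/* sysctl_sched_min_bal_int_shares default */
	const unsigned int max_int = 128;	/* sysctl_sched_max_bal_int_shares default */

	if (!balanced)
		return min_int;		/* shares moved: check again soon */
	if (timeout < max_int)
		return timeout * 2;	/* everything balanced: back off */
	return timeout;			/* stay at the ceiling */
}

int main(void)
{
	unsigned int t = 8;
	int step;

	/* five balanced passes, then one that had to move shares */
	for (step = 0; step < 5; step++)
		t = next_interval(t, 1);
	printf("after 5 idle passes: %u ms\n", t);		/* 8 -> 16 -> 32 -> 64 -> 128 */
	printf("after an imbalance: %u ms\n", next_interval(t, 0));	/* back to 8 */
	return 0;
}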
7538
7539static void free_sched_group(struct task_group *tg)
7540{
7541 int i;
7542
7543 for_each_possible_cpu(i) {
7544 if (tg->cfs_rq)
7545 kfree(tg->cfs_rq[i]);
7546 if (tg->se)
7547 kfree(tg->se[i]);
7548 if (tg->rt_rq)
7549 kfree(tg->rt_rq[i]);
7550 if (tg->rt_se)
7551 kfree(tg->rt_se[i]);
7552 }
7553
7554 kfree(tg->cfs_rq);
7555 kfree(tg->se);
7556 kfree(tg->rt_rq);
7557 kfree(tg->rt_se);
7558 kfree(tg);
7559}
7560
6978/* allocate runqueue etc for a new task group */ 7561/* allocate runqueue etc for a new task group */
6979struct task_group *sched_create_group(void) 7562struct task_group *sched_create_group(void)
6980{ 7563{
6981 struct task_group *tg; 7564 struct task_group *tg;
6982 struct cfs_rq *cfs_rq; 7565 struct cfs_rq *cfs_rq;
6983 struct sched_entity *se; 7566 struct sched_entity *se;
7567 struct rt_rq *rt_rq;
7568 struct sched_rt_entity *rt_se;
6984 struct rq *rq; 7569 struct rq *rq;
6985 int i; 7570 int i;
6986 7571
@@ -6994,97 +7579,89 @@ struct task_group *sched_create_group(void)
6994 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7579 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6995 if (!tg->se) 7580 if (!tg->se)
6996 goto err; 7581 goto err;
7582 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7583 if (!tg->rt_rq)
7584 goto err;
7585 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7586 if (!tg->rt_se)
7587 goto err;
7588
7589 tg->shares = NICE_0_LOAD;
7590 tg->rt_ratio = 0; /* XXX */
6997 7591
6998 for_each_possible_cpu(i) { 7592 for_each_possible_cpu(i) {
6999 rq = cpu_rq(i); 7593 rq = cpu_rq(i);
7000 7594
7001 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7595 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7002 cpu_to_node(i)); 7596 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7003 if (!cfs_rq) 7597 if (!cfs_rq)
7004 goto err; 7598 goto err;
7005 7599
7006 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7600 se = kmalloc_node(sizeof(struct sched_entity),
7007 cpu_to_node(i)); 7601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7008 if (!se) 7602 if (!se)
7009 goto err; 7603 goto err;
7010 7604
7011 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7605 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7012 memset(se, 0, sizeof(struct sched_entity)); 7606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7607 if (!rt_rq)
7608 goto err;
7013 7609
7014 tg->cfs_rq[i] = cfs_rq; 7610 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7015 init_cfs_rq(cfs_rq, rq); 7611 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7016 cfs_rq->tg = tg; 7612 if (!rt_se)
7613 goto err;
7017 7614
7018 tg->se[i] = se; 7615 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7019 se->cfs_rq = &rq->cfs; 7616 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7020 se->my_q = cfs_rq;
7021 se->load.weight = NICE_0_LOAD;
7022 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7023 se->parent = NULL;
7024 } 7617 }
7025 7618
7619 lock_task_group_list();
7026 for_each_possible_cpu(i) { 7620 for_each_possible_cpu(i) {
7027 rq = cpu_rq(i); 7621 rq = cpu_rq(i);
7028 cfs_rq = tg->cfs_rq[i]; 7622 cfs_rq = tg->cfs_rq[i];
7029 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7623 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7624 rt_rq = tg->rt_rq[i];
7625 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7030 } 7626 }
7031 7627 list_add_rcu(&tg->list, &task_groups);
7032 tg->shares = NICE_0_LOAD; 7628 unlock_task_group_list();
7033 spin_lock_init(&tg->lock);
7034 7629
7035 return tg; 7630 return tg;
7036 7631
7037err: 7632err:
7038 for_each_possible_cpu(i) { 7633 free_sched_group(tg);
7039 if (tg->cfs_rq)
7040 kfree(tg->cfs_rq[i]);
7041 if (tg->se)
7042 kfree(tg->se[i]);
7043 }
7044 kfree(tg->cfs_rq);
7045 kfree(tg->se);
7046 kfree(tg);
7047
7048 return ERR_PTR(-ENOMEM); 7634 return ERR_PTR(-ENOMEM);
7049} 7635}
7050 7636
7051/* rcu callback to free various structures associated with a task group */ 7637/* rcu callback to free various structures associated with a task group */
7052static void free_sched_group(struct rcu_head *rhp) 7638static void free_sched_group_rcu(struct rcu_head *rhp)
7053{ 7639{
7054 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7055 struct cfs_rq *cfs_rq;
7056 struct sched_entity *se;
7057 int i;
7058
7059 /* now it should be safe to free those cfs_rqs */ 7640 /* now it should be safe to free those cfs_rqs */
7060 for_each_possible_cpu(i) { 7641 free_sched_group(container_of(rhp, struct task_group, rcu));
7061 cfs_rq = tg->cfs_rq[i];
7062 kfree(cfs_rq);
7063
7064 se = tg->se[i];
7065 kfree(se);
7066 }
7067
7068 kfree(tg->cfs_rq);
7069 kfree(tg->se);
7070 kfree(tg);
7071} 7642}
7072 7643
7073/* Destroy runqueue etc associated with a task group */ 7644/* Destroy runqueue etc associated with a task group */
7074void sched_destroy_group(struct task_group *tg) 7645void sched_destroy_group(struct task_group *tg)
7075{ 7646{
7076 struct cfs_rq *cfs_rq = NULL; 7647 struct cfs_rq *cfs_rq = NULL;
7648 struct rt_rq *rt_rq = NULL;
7077 int i; 7649 int i;
7078 7650
7651 lock_task_group_list();
7079 for_each_possible_cpu(i) { 7652 for_each_possible_cpu(i) {
7080 cfs_rq = tg->cfs_rq[i]; 7653 cfs_rq = tg->cfs_rq[i];
7081 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7654 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7655 rt_rq = tg->rt_rq[i];
7656 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7082 } 7657 }
7658 list_del_rcu(&tg->list);
7659 unlock_task_group_list();
7083 7660
7084 BUG_ON(!cfs_rq); 7661 BUG_ON(!cfs_rq);
7085 7662
7086 /* wait for possible concurrent references to cfs_rqs complete */ 7663 /* wait for possible concurrent references to cfs_rqs complete */
7087 call_rcu(&tg->rcu, free_sched_group); 7664 call_rcu(&tg->rcu, free_sched_group_rcu);
7088} 7665}
7089 7666
7090/* change task's runqueue when it moves between groups. 7667/* change task's runqueue when it moves between groups.
@@ -7100,11 +7677,6 @@ void sched_move_task(struct task_struct *tsk)
7100 7677
7101 rq = task_rq_lock(tsk, &flags); 7678 rq = task_rq_lock(tsk, &flags);
7102 7679
7103 if (tsk->sched_class != &fair_sched_class) {
7104 set_task_cfs_rq(tsk, task_cpu(tsk));
7105 goto done;
7106 }
7107
7108 update_rq_clock(rq); 7680 update_rq_clock(rq);
7109 7681
7110 running = task_current(rq, tsk); 7682 running = task_current(rq, tsk);
@@ -7116,7 +7688,7 @@ void sched_move_task(struct task_struct *tsk)
7116 tsk->sched_class->put_prev_task(rq, tsk); 7688 tsk->sched_class->put_prev_task(rq, tsk);
7117 } 7689 }
7118 7690
7119 set_task_cfs_rq(tsk, task_cpu(tsk)); 7691 set_task_rq(tsk, task_cpu(tsk));
7120 7692
7121 if (on_rq) { 7693 if (on_rq) {
7122 if (unlikely(running)) 7694 if (unlikely(running))
@@ -7124,53 +7696,82 @@ void sched_move_task(struct task_struct *tsk)
7124 enqueue_task(rq, tsk, 0); 7696 enqueue_task(rq, tsk, 0);
7125 } 7697 }
7126 7698
7127done:
7128 task_rq_unlock(rq, &flags); 7699 task_rq_unlock(rq, &flags);
7129} 7700}
7130 7701
7702/* rq->lock to be locked by caller */
7131static void set_se_shares(struct sched_entity *se, unsigned long shares) 7703static void set_se_shares(struct sched_entity *se, unsigned long shares)
7132{ 7704{
7133 struct cfs_rq *cfs_rq = se->cfs_rq; 7705 struct cfs_rq *cfs_rq = se->cfs_rq;
7134 struct rq *rq = cfs_rq->rq; 7706 struct rq *rq = cfs_rq->rq;
7135 int on_rq; 7707 int on_rq;
7136 7708
7137 spin_lock_irq(&rq->lock); 7709 if (!shares)
7710 shares = MIN_GROUP_SHARES;
7138 7711
7139 on_rq = se->on_rq; 7712 on_rq = se->on_rq;
7140 if (on_rq) 7713 if (on_rq) {
7141 dequeue_entity(cfs_rq, se, 0); 7714 dequeue_entity(cfs_rq, se, 0);
7715 dec_cpu_load(rq, se->load.weight);
7716 }
7142 7717
7143 se->load.weight = shares; 7718 se->load.weight = shares;
7144 se->load.inv_weight = div64_64((1ULL<<32), shares); 7719 se->load.inv_weight = div64_64((1ULL<<32), shares);
7145 7720
7146 if (on_rq) 7721 if (on_rq) {
7147 enqueue_entity(cfs_rq, se, 0); 7722 enqueue_entity(cfs_rq, se, 0);
7148 7723 inc_cpu_load(rq, se->load.weight);
7149 spin_unlock_irq(&rq->lock); 7724 }
7150} 7725}
7151 7726
7152int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7727int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7153{ 7728{
7154 int i; 7729 int i;
7730 struct cfs_rq *cfs_rq;
7731 struct rq *rq;
7732
7733 lock_task_group_list();
7734 if (tg->shares == shares)
7735 goto done;
7736
7737 if (shares < MIN_GROUP_SHARES)
7738 shares = MIN_GROUP_SHARES;
7155 7739
7156 /* 7740 /*
7157 * A weight of 0 or 1 can cause arithmetics problems. 7741 * Prevent any load balance activity (rebalance_shares,
7158 * (The default weight is 1024 - so there's no practical 7742 * load_balance_fair) from referring to this group first,
7159 * limitation from this.) 7743 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7160 */ 7744 */
7161 if (shares < 2) 7745 for_each_possible_cpu(i) {
7162 shares = 2; 7746 cfs_rq = tg->cfs_rq[i];
7747 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7748 }
7163 7749
7164 spin_lock(&tg->lock); 7750 /* wait for any ongoing reference to this group to finish */
7165 if (tg->shares == shares) 7751 synchronize_sched();
7166 goto done;
7167 7752
7753 /*
7754 * Now we are free to modify the group's share on each cpu
 7755	 * without tripping rebalance_shares or load_balance_fair.
7756 */
7168 tg->shares = shares; 7757 tg->shares = shares;
7169 for_each_possible_cpu(i) 7758 for_each_possible_cpu(i) {
7759 spin_lock_irq(&cpu_rq(i)->lock);
7170 set_se_shares(tg->se[i], shares); 7760 set_se_shares(tg->se[i], shares);
7761 spin_unlock_irq(&cpu_rq(i)->lock);
7762 }
7171 7763
7764 /*
7765 * Enable load balance activity on this group, by inserting it back on
7766 * each cpu's rq->leaf_cfs_rq_list.
7767 */
7768 for_each_possible_cpu(i) {
7769 rq = cpu_rq(i);
7770 cfs_rq = tg->cfs_rq[i];
7771 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7772 }
7172done: 7773done:
7173 spin_unlock(&tg->lock); 7774 unlock_task_group_list();
7174 return 0; 7775 return 0;
7175} 7776}
7176 7777
@@ -7179,6 +7780,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7179 return tg->shares; 7780 return tg->shares;
7180} 7781}
7181 7782
7783/*
7784 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7785 */
7786int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7787{
7788 struct task_group *tgi;
7789 unsigned long total = 0;
7790
7791 rcu_read_lock();
7792 list_for_each_entry_rcu(tgi, &task_groups, list)
7793 total += tgi->rt_ratio;
7794 rcu_read_unlock();
7795
7796 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7797 return -EINVAL;
7798
7799 tg->rt_ratio = rt_ratio;
7800 return 0;
7801}
7802
7803unsigned long sched_group_rt_ratio(struct task_group *tg)
7804{
7805 return tg->rt_ratio;
7806}
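sched_group_set_rt_ratio() is an admission test: the sum of every group's rt_ratio, with this group's old value swapped for the requested one, must stay within the global sysctl_sched_rt_ratio. A standalone sketch of that bookkeeping follows; the cap and the per-group numbers are made-up examples, not values taken from the patch.

#include <stdio.h>

int main(void)
{
	unsigned long cap = 62259;			/* stand-in for sysctl_sched_rt_ratio */
	unsigned long ratios[] = { 5000, 10000, 20000 };/* rt_ratio of the existing groups */
	unsigned long cur = ratios[0];			/* the group being changed */
	unsigned long requested = 40000;		/* proposed new rt_ratio */
	unsigned long total = 0;
	unsigned int i;

	for (i = 0; i < sizeof(ratios) / sizeof(ratios[0]); i++)
		total += ratios[i];

	/* same test as the patch: old value out, new value in, compare to the cap */
	if (total + requested - cur > cap)
		printf("rejected: %lu exceeds cap %lu\n", total + requested - cur, cap);
	else
		printf("accepted: new total %lu within cap %lu\n",
		       total + requested - cur, cap);
	return 0;
}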
7807
7182#endif /* CONFIG_FAIR_GROUP_SCHED */ 7808#endif /* CONFIG_FAIR_GROUP_SCHED */
7183 7809
7184#ifdef CONFIG_FAIR_CGROUP_SCHED 7810#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7254,12 +7880,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7254 return (u64) tg->shares; 7880 return (u64) tg->shares;
7255} 7881}
7256 7882
7883static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7884 u64 rt_ratio_val)
7885{
7886 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7887}
7888
7889static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7890{
7891 struct task_group *tg = cgroup_tg(cgrp);
7892
7893 return (u64) tg->rt_ratio;
7894}
7895
7257static struct cftype cpu_files[] = { 7896static struct cftype cpu_files[] = {
7258 { 7897 {
7259 .name = "shares", 7898 .name = "shares",
7260 .read_uint = cpu_shares_read_uint, 7899 .read_uint = cpu_shares_read_uint,
7261 .write_uint = cpu_shares_write_uint, 7900 .write_uint = cpu_shares_write_uint,
7262 }, 7901 },
7902 {
7903 .name = "rt_ratio",
7904 .read_uint = cpu_rt_ratio_read_uint,
7905 .write_uint = cpu_rt_ratio_write_uint,
7906 },
7263}; 7907};
7264 7908
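With the extra cftype entry, every cpu cgroup directory gains a cpu.rt_ratio file next to cpu.shares; an integer written to it reaches sched_group_set_rt_ratio() through cpu_rt_ratio_write_uint(). A usage sketch from user space, assuming the cpu controller is mounted at /dev/cgroup and a group named mygroup exists (both are assumptions, not something the patch mandates):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed layout: cpu controller mounted at /dev/cgroup, group "mygroup" */
	const char *path = "/dev/cgroup/mygroup/cpu.rt_ratio";
	const char *val = "2048\n";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open cpu.rt_ratio");
		return 1;
	}
	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
		perror("write cpu.rt_ratio");
	close(fd);
	return 0;
}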
7265static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7909static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)