Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1400
1 files changed, 1020 insertions, 380 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11ca6df3..ba4c88088f62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
25 */ 27 */
26 28
27#include <linux/mm.h> 29#include <linux/mm.h>
@@ -63,6 +65,7 @@
63#include <linux/reciprocal_div.h> 65#include <linux/reciprocal_div.h>
64#include <linux/unistd.h> 66#include <linux/unistd.h>
65#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
66 69
67#include <asm/tlb.h> 70#include <asm/tlb.h>
68#include <asm/irq_regs.h> 71#include <asm/irq_regs.h>
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
96#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
97 100
98/* 101/*
99 * Some helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
100 */ 103 */
101#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
102#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
103 105
104#define NICE_0_LOAD SCHED_LOAD_SCALE 106#define NICE_0_LOAD SCHED_LOAD_SCALE
105#define NICE_0_SHIFT SCHED_LOAD_SHIFT 107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
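The surviving NS_TO_JIFFIES() macro simply divides a nanosecond value by the nanoseconds per tick. A minimal user-space check of that conversion (assuming HZ = 1000; not part of the patch):

#include <stdio.h>

#define HZ		1000
#define NSEC_PER_SEC	1000000000L
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

int main(void)
{
	/* 2,500,000 ns / 1,000,000 ns-per-tick truncates to 2 jiffies */
	printf("%lu\n", NS_TO_JIFFIES(2500000L));
	return 0;
}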
@@ -159,6 +161,8 @@ struct rt_prio_array {
159 161
160struct cfs_rq; 162struct cfs_rq;
161 163
164static LIST_HEAD(task_groups);
165
162/* task group related information */ 166/* task group related information */
163struct task_group { 167struct task_group {
164#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -168,10 +172,50 @@ struct task_group {
168 struct sched_entity **se; 172 struct sched_entity **se;
169 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
170 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /*
182 * shares assigned to a task group governs how much of cpu bandwidth
183 * is allocated to the group. The more shares a group has, the more is
184 * the cpu bandwidth allocated to it.
185 *
186 * For ex, lets say that there are three task groups, A, B and C which
187 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
188 * cpu bandwidth allocated by the scheduler to task groups A, B and C
189 * should be:
190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 *
195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
197 * group's shares. For ex: lets say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 *
202 * Note: It's not necessary that each of a task's group schedulable
203 * entity have the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be:
206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
208 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
209 *
210 * rebalance_shares() is responsible for distributing the shares of a
211 * task groups like this among the group's schedulable entities across
212 * cpus.
213 *
214 */
171 unsigned long shares; 215 unsigned long shares;
172 /* spinlock to serialize modification to shares */ 216
173 spinlock_t lock;
174 struct rcu_head rcu; 217 struct rcu_head rcu;
218 struct list_head list;
175}; 219};
176 220
177/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
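The block comment above walks through how a group's shares map to a proportional split of CPU bandwidth. As a quick standalone check of that arithmetic (plain user-space C using the same 1000/2000/3000 example; not part of the patch), dividing each group's shares by the total reproduces the quoted 16.66%/33.33%/50% figures:

#include <stdio.h>

int main(void)
{
	unsigned long shares[] = { 1000, 2000, 3000 };	/* groups A, B, C */
	unsigned long total = 0;
	int i;

	for (i = 0; i < 3; i++)
		total += shares[i];

	for (i = 0; i < 3; i++)
		printf("Bw(%c) = %.2f%%\n", 'A' + i,
		       100.0 * shares[i] / total);
	return 0;
}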
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
182static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
235/* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares.
237 */
238static DEFINE_MUTEX(task_group_mutex);
239
240/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex);
242
243#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task;
246static int load_balance_monitor(void *unused);
247#endif
248
249static void set_se_shares(struct sched_entity *se, unsigned long shares);
250
185/* Default task group. 251/* Default task group.
186 * Every task in system belong to this group at bootup. 252 * Every task in system belong to this group at bootup.
187 */ 253 */
188struct task_group init_task_group = { 254struct task_group init_task_group = {
189 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
191}; 260};
192 261
193#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD 263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
195#else 264#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD 265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
197#endif 266#endif
198 267
199static int init_task_group_load = INIT_TASK_GRP_LOAD; 268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
200 271
201/* return group to which a task belongs */ 272/* return group to which a task belongs */
202static inline struct task_group *task_group(struct task_struct *p) 273static inline struct task_group *task_group(struct task_struct *p)
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p)
215} 286}
216 287
217/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
218static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
219{ 290{
220 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
221 p->se.parent = task_group(p)->se[cpu]; 292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
296}
297
298static inline void lock_task_group_list(void)
299{
300 mutex_lock(&task_group_mutex);
301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306}
307
308static inline void lock_doms_cur(void)
309{
310 mutex_lock(&doms_cur_mutex);
311}
312
313static inline void unlock_doms_cur(void)
314{
315 mutex_unlock(&doms_cur_mutex);
222} 316}
223 317
224#else 318#else
225 319
226static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { }
227 325
228#endif /* CONFIG_FAIR_GROUP_SCHED */ 326#endif /* CONFIG_FAIR_GROUP_SCHED */
229 327
@@ -264,11 +362,57 @@ struct cfs_rq {
264/* Real-Time classes' related field in a runqueue: */ 362/* Real-Time classes' related field in a runqueue: */
265struct rt_rq { 363struct rt_rq {
266 struct rt_prio_array active; 364 struct rt_prio_array active;
267 int rt_load_balance_idx; 365 unsigned long rt_nr_running;
268 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
369#ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory;
371 int overloaded;
372#endif
373 int rt_throttled;
374 u64 rt_time;
375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
382};
383
384#ifdef CONFIG_SMP
385
386/*
387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object.
392 *
393 */
394struct root_domain {
395 atomic_t refcount;
396 cpumask_t span;
397 cpumask_t online;
398
399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task.
402 */
403 cpumask_t rto_mask;
404 atomic_t rto_count;
269}; 405};
270 406
271/* 407/*
408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today).
410 */
411static struct root_domain def_root_domain;
412
413#endif
414
415/*
272 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
273 * 417 *
274 * Locking rule: those places that want to lock multiple runqueues 418 * Locking rule: those places that want to lock multiple runqueues
@@ -296,11 +440,15 @@ struct rq {
296 u64 nr_switches; 440 u64 nr_switches;
297 441
298 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445 int rt_throttled;
446
299#ifdef CONFIG_FAIR_GROUP_SCHED 447#ifdef CONFIG_FAIR_GROUP_SCHED
300 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
301 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list;
302#endif 451#endif
303 struct rt_rq rt;
304 452
305 /* 453 /*
306 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
@@ -317,7 +465,7 @@ struct rq {
317 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
318 s64 clock_max_delta; 466 s64 clock_max_delta;
319 467
320 unsigned int clock_warps, clock_overflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
321 u64 idle_clock; 469 u64 idle_clock;
322 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
323 u64 tick_timestamp; 471 u64 tick_timestamp;
@@ -325,6 +473,7 @@ struct rq {
325 atomic_t nr_iowait; 473 atomic_t nr_iowait;
326 474
327#ifdef CONFIG_SMP 475#ifdef CONFIG_SMP
476 struct root_domain *rd;
328 struct sched_domain *sd; 477 struct sched_domain *sd;
329 478
330 /* For active balancing */ 479 /* For active balancing */
@@ -337,6 +486,12 @@ struct rq {
337 struct list_head migration_queue; 486 struct list_head migration_queue;
338#endif 487#endif
339 488
489#ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer;
493#endif
494
340#ifdef CONFIG_SCHEDSTATS 495#ifdef CONFIG_SCHEDSTATS
341 /* latency stats */ 496 /* latency stats */
342 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
@@ -363,7 +518,6 @@ struct rq {
363}; 518};
364 519
365static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
366static DEFINE_MUTEX(sched_hotcpu_mutex);
367 521
368static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
369{ 523{
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
441#define task_rq(p) cpu_rq(task_cpu(p)) 595#define task_rq(p) cpu_rq(task_cpu(p))
442#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
443 597
598unsigned long rt_needs_cpu(int cpu)
599{
600 struct rq *rq = cpu_rq(cpu);
601 u64 delta;
602
603 if (!rq->rt_throttled)
604 return 0;
605
606 if (rq->clock > rq->rt_period_expire)
607 return 1;
608
609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ);
611
612 return (unsigned long)delta;
613}
614
444/* 615/*
445 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
446 */ 617 */
@@ -459,6 +630,8 @@ enum {
459 SCHED_FEAT_START_DEBIT = 4, 630 SCHED_FEAT_START_DEBIT = 4,
460 SCHED_FEAT_TREE_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
461 SCHED_FEAT_APPROX_AVG = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64,
462}; 635};
463 636
464const_debug unsigned int sysctl_sched_features = 637const_debug unsigned int sysctl_sched_features =
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features =
466 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
467 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
468 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
469 SCHED_FEAT_APPROX_AVG * 0; 642 SCHED_FEAT_APPROX_AVG * 0 |
643 SCHED_FEAT_HRTICK * 1 |
644 SCHED_FEAT_DOUBLE_TICK * 0;
470 645
471#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
472 647
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features =
477const_debug unsigned int sysctl_sched_nr_migrate = 32; 652const_debug unsigned int sysctl_sched_nr_migrate = 32;
478 653
479/* 654/*
655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1s
657 */
658const_debug unsigned int sysctl_sched_rt_period = 1000;
659
660#define SCHED_RT_FRAC_SHIFT 16
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662
663/*
664 * ratio of time -rt tasks may consume.
665 * default: 95%
666 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259;
668
669/*
480 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
481 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
482 */ 672 */
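sysctl_sched_rt_ratio is a fixed-point fraction scaled by SCHED_RT_FRAC (1 << 16), so the default of 62259 works out to just under 95% of SCHED_RT_FRAC. A standalone sketch of how such a ratio combines with the 1000 ms default period to give a per-period RT budget (user-space C; an illustration of the arithmetic only, not the in-kernel throttling code):

#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT	16
#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
#define NSEC_PER_MSEC		1000000ULL

int main(void)
{
	unsigned int rt_period_ms = 1000;	/* sysctl_sched_rt_period default */
	unsigned int rt_ratio = 62259;		/* sysctl_sched_rt_ratio default */
	unsigned long long period_ns = rt_period_ms * NSEC_PER_MSEC;
	unsigned long long budget_ns = (period_ns * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

	/* 62259 / 65536 ~= 95%, i.e. roughly 950 ms of RT time per 1 s period */
	printf("ratio  = %.3f%%\n", 100.0 * rt_ratio / SCHED_RT_FRAC);
	printf("budget = %llu ns per period\n", budget_ns);
	return 0;
}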
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
668 struct rq *rq = cpu_rq(smp_processor_id()); 858 struct rq *rq = cpu_rq(smp_processor_id());
669 u64 now = sched_clock(); 859 u64 now = sched_clock();
670 860
671 touch_softlockup_watchdog();
672 rq->idle_clock += delta_ns; 861 rq->idle_clock += delta_ns;
673 /* 862 /*
674 * Override the previous timestamp and ignore all 863 * Override the previous timestamp and ignore all
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
680 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
681 rq->clock += delta_ns; 870 rq->clock += delta_ns;
682 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog();
683} 873}
684EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
685 875
876static void __resched_task(struct task_struct *p, int tif_bit);
877
878static inline void resched_task(struct task_struct *p)
879{
880 __resched_task(p, TIF_NEED_RESCHED);
881}
882
883#ifdef CONFIG_SCHED_HRTICK
884/*
885 * Use HR-timers to deliver accurate preemption points.
886 *
887 * Its all a bit involved since we cannot program an hrt while holding the
888 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
889 * reschedule event.
890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock.
893 */
894static inline void resched_hrt(struct task_struct *p)
895{
896 __resched_task(p, TIF_HRTICK_RESCHED);
897}
898
899static inline void resched_rq(struct rq *rq)
900{
901 unsigned long flags;
902
903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags);
906}
907
908enum {
909 HRTICK_SET, /* re-programm hrtick_timer */
910 HRTICK_RESET, /* not a new slice */
911};
912
913/*
914 * Use hrtick when:
915 * - enabled by features
916 * - hrtimer is actually high res
917 */
918static inline int hrtick_enabled(struct rq *rq)
919{
920 if (!sched_feat(HRTICK))
921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923}
924
925/*
926 * Called to set the hrtick timer state.
927 *
928 * called with rq->lock held and irqs disabled
929 */
930static void hrtick_start(struct rq *rq, u64 delay, int reset)
931{
932 assert_spin_locked(&rq->lock);
933
934 /*
935 * preempt at: now + delay
936 */
937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /*
940 * indicate we need to program the timer
941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945
946 /*
947 * New slices are called from the schedule path and don't need a
948 * forced reschedule.
949 */
950 if (reset)
951 resched_hrt(rq->curr);
952}
953
954static void hrtick_clear(struct rq *rq)
955{
956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer);
958}
959
960/*
961 * Update the timer from the possible pending state.
962 */
963static void hrtick_set(struct rq *rq)
964{
965 ktime_t time;
966 int set, reset;
967 unsigned long flags;
968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970
971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags);
977
978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq);
982 } else
983 hrtick_clear(rq);
984}
985
986/*
987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled.
989 */
990static enum hrtimer_restart hrtick(struct hrtimer *timer)
991{
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995
996 spin_lock(&rq->lock);
997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock);
1000
1001 return HRTIMER_NORESTART;
1002}
1003
1004static inline void init_rq_hrtick(struct rq *rq)
1005{
1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010}
1011
1012void hrtick_resched(void)
1013{
1014 struct rq *rq;
1015 unsigned long flags;
1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return;
1019
1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq);
1023 local_irq_restore(flags);
1024}
1025#else
1026static inline void hrtick_clear(struct rq *rq)
1027{
1028}
1029
1030static inline void hrtick_set(struct rq *rq)
1031{
1032}
1033
1034static inline void init_rq_hrtick(struct rq *rq)
1035{
1036}
1037
1038void hrtick_resched(void)
1039{
1040}
1041#endif
1042
686/* 1043/*
687 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
688 * 1045 *
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
696#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
697#endif 1054#endif
698 1055
699static void resched_task(struct task_struct *p) 1056static void __resched_task(struct task_struct *p, int tif_bit)
700{ 1057{
701 int cpu; 1058 int cpu;
702 1059
703 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
704 1061
705 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
706 return; 1063 return;
707 1064
708 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1065 set_tsk_thread_flag(p, tif_bit);
709 1066
710 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
711 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu)
728 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
729} 1086}
730#else 1087#else
731static inline void resched_task(struct task_struct *p) 1088static void __resched_task(struct task_struct *p, int tif_bit)
732{ 1089{
733 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
734 set_tsk_need_resched(p); 1091 set_tsk_thread_flag(p, tif_bit);
735} 1092}
736#endif 1093#endif
737 1094
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
871static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1228static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
872#endif 1229#endif
873 1230
1231static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232{
1233 update_load_add(&rq->load, load);
1234}
1235
1236static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237{
1238 update_load_sub(&rq->load, load);
1239}
1240
1241#ifdef CONFIG_SMP
1242static unsigned long source_load(int cpu, int type);
1243static unsigned long target_load(int cpu, int type);
1244static unsigned long cpu_avg_load_per_task(int cpu);
1245static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246#endif /* CONFIG_SMP */
1247
874#include "sched_stats.h" 1248#include "sched_stats.h"
875#include "sched_idletask.c" 1249#include "sched_idletask.c"
876#include "sched_fair.c" 1250#include "sched_fair.c"
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
881 1255
882#define sched_class_highest (&rt_sched_class) 1256#define sched_class_highest (&rt_sched_class)
883 1257
884/*
885 * Update delta_exec, delta_fair fields for rq.
886 *
887 * delta_fair clock advances at a rate inversely proportional to
888 * total load (rq->load.weight) on the runqueue, while
889 * delta_exec advances at the same rate as wall-clock (provided
890 * cpu is not idle).
891 *
892 * delta_exec / delta_fair is a measure of the (smoothened) load on this
893 * runqueue over any given interval. This (smoothened) load is used
894 * during load balance.
895 *
896 * This function is called /before/ updating rq->load
897 * and when switching tasks.
898 */
899static inline void inc_load(struct rq *rq, const struct task_struct *p)
900{
901 update_load_add(&rq->load, p->se.load.weight);
902}
903
904static inline void dec_load(struct rq *rq, const struct task_struct *p)
905{
906 update_load_sub(&rq->load, p->se.load.weight);
907}
908
909static void inc_nr_running(struct task_struct *p, struct rq *rq) 1258static void inc_nr_running(struct task_struct *p, struct rq *rq)
910{ 1259{
911 rq->nr_running++; 1260 rq->nr_running++;
912 inc_load(rq, p);
913} 1261}
914 1262
915static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263static void dec_nr_running(struct task_struct *p, struct rq *rq)
916{ 1264{
917 rq->nr_running--; 1265 rq->nr_running--;
918 dec_load(rq, p);
919} 1266}
920 1267
921static void set_load_weight(struct task_struct *p) 1268static void set_load_weight(struct task_struct *p)
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu)
1039 1386
1040static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1041{ 1388{
1042 set_task_cfs_rq(p, cpu); 1389 set_task_rq(p, cpu);
1043#ifdef CONFIG_SMP 1390#ifdef CONFIG_SMP
1044 /* 1391 /*
1045 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1051#endif 1398#endif
1052} 1399}
1053 1400
1401static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class,
1403 int oldprio, int running)
1404{
1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running);
1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1411}
1412
1054#ifdef CONFIG_SMP 1413#ifdef CONFIG_SMP
1055 1414
1056/* 1415/*
1057 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1058 */ 1417 */
1059static inline int 1418static int
1060task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1061{ 1420{
1062 s64 delta; 1421 s64 delta;
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
1281/* 1640/*
1282 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1283 */ 1642 */
1284static inline unsigned long cpu_avg_load_per_task(int cpu) 1643static unsigned long cpu_avg_load_per_task(int cpu)
1285{ 1644{
1286 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1287 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
1438 1797
1439#endif /* CONFIG_SMP */ 1798#endif /* CONFIG_SMP */
1440 1799
1441/*
1442 * wake_idle() will wake a task on an idle cpu if task->cpu is
1443 * not idle and an idle cpu is available. The span of cpus to
1444 * search starts with cpus closest then further out as needed,
1445 * so we always favor a closer, idle cpu.
1446 *
1447 * Returns the CPU we should wake onto.
1448 */
1449#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1450static int wake_idle(int cpu, struct task_struct *p)
1451{
1452 cpumask_t tmp;
1453 struct sched_domain *sd;
1454 int i;
1455
1456 /*
1457 * If it is idle, then it is the best cpu to run this task.
1458 *
1459 * This cpu is also the best, if it has more than one task already.
1460 * Siblings must be also busy(in most cases) as they didn't already
1461 * pickup the extra load from this cpu and hence we need not check
1462 * sibling runqueue info. This will avoid the checks and cache miss
1463 * penalities associated with that.
1464 */
1465 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1466 return cpu;
1467
1468 for_each_domain(cpu, sd) {
1469 if (sd->flags & SD_WAKE_IDLE) {
1470 cpus_and(tmp, sd->span, p->cpus_allowed);
1471 for_each_cpu_mask(i, tmp) {
1472 if (idle_cpu(i)) {
1473 if (i != task_cpu(p)) {
1474 schedstat_inc(p,
1475 se.nr_wakeups_idle);
1476 }
1477 return i;
1478 }
1479 }
1480 } else {
1481 break;
1482 }
1483 }
1484 return cpu;
1485}
1486#else
1487static inline int wake_idle(int cpu, struct task_struct *p)
1488{
1489 return cpu;
1490}
1491#endif
1492
1493/*** 1800/***
1494 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1495 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1510 unsigned long flags; 1817 unsigned long flags;
1511 long old_state; 1818 long old_state;
1512 struct rq *rq; 1819 struct rq *rq;
1513#ifdef CONFIG_SMP
1514 struct sched_domain *sd, *this_sd = NULL;
1515 unsigned long load, this_load;
1516 int new_cpu;
1517#endif
1518 1820
1519 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1520 old_state = p->state; 1822 old_state = p->state;
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1532 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1533 goto out_activate; 1835 goto out_activate;
1534 1836
1535 new_cpu = cpu; 1837 cpu = p->sched_class->select_task_rq(p, sync);
1536 1838 if (cpu != orig_cpu) {
1537 schedstat_inc(rq, ttwu_count); 1839 set_task_cpu(p, cpu);
1538 if (cpu == this_cpu) {
1539 schedstat_inc(rq, ttwu_local);
1540 goto out_set_cpu;
1541 }
1542
1543 for_each_domain(this_cpu, sd) {
1544 if (cpu_isset(cpu, sd->span)) {
1545 schedstat_inc(sd, ttwu_wake_remote);
1546 this_sd = sd;
1547 break;
1548 }
1549 }
1550
1551 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1552 goto out_set_cpu;
1553
1554 /*
1555 * Check for affine wakeup and passive balancing possibilities.
1556 */
1557 if (this_sd) {
1558 int idx = this_sd->wake_idx;
1559 unsigned int imbalance;
1560
1561 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1562
1563 load = source_load(cpu, idx);
1564 this_load = target_load(this_cpu, idx);
1565
1566 new_cpu = this_cpu; /* Wake to this CPU if we can */
1567
1568 if (this_sd->flags & SD_WAKE_AFFINE) {
1569 unsigned long tl = this_load;
1570 unsigned long tl_per_task;
1571
1572 /*
1573 * Attract cache-cold tasks on sync wakeups:
1574 */
1575 if (sync && !task_hot(p, rq->clock, this_sd))
1576 goto out_set_cpu;
1577
1578 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1579 tl_per_task = cpu_avg_load_per_task(this_cpu);
1580
1581 /*
1582 * If sync wakeup then subtract the (maximum possible)
1583 * effect of the currently running task from the load
1584 * of the current CPU:
1585 */
1586 if (sync)
1587 tl -= current->se.load.weight;
1588
1589 if ((tl <= load &&
1590 tl + target_load(cpu, idx) <= tl_per_task) ||
1591 100*(tl + p->se.load.weight) <= imbalance*load) {
1592 /*
1593 * This domain has SD_WAKE_AFFINE and
1594 * p is cache cold in this domain, and
1595 * there is no bad imbalance.
1596 */
1597 schedstat_inc(this_sd, ttwu_move_affine);
1598 schedstat_inc(p, se.nr_wakeups_affine);
1599 goto out_set_cpu;
1600 }
1601 }
1602
1603 /*
1604 * Start passive balancing when half the imbalance_pct
1605 * limit is reached.
1606 */
1607 if (this_sd->flags & SD_WAKE_BALANCE) {
1608 if (imbalance*this_load <= 100*load) {
1609 schedstat_inc(this_sd, ttwu_move_balance);
1610 schedstat_inc(p, se.nr_wakeups_passive);
1611 goto out_set_cpu;
1612 }
1613 }
1614 }
1615
1616 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1617out_set_cpu:
1618 new_cpu = wake_idle(new_cpu, p);
1619 if (new_cpu != cpu) {
1620 set_task_cpu(p, new_cpu);
1621 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1622 /* might preempt at this point */ 1841 /* might preempt at this point */
1623 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
@@ -1631,6 +1850,21 @@ out_set_cpu:
1631 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1632 } 1851 }
1633 1852
1853#ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local);
1857 else {
1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break;
1863 }
1864 }
1865 }
1866#endif
1867
1634out_activate: 1868out_activate:
1635#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1636 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
@@ -1649,6 +1883,10 @@ out_activate:
1649 1883
1650out_running: 1884out_running:
1651 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886#ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p);
1889#endif
1652out: 1890out:
1653 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1654 1892
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p)
1691 p->se.wait_max = 0; 1929 p->se.wait_max = 0;
1692#endif 1930#endif
1693 1931
1694 INIT_LIST_HEAD(&p->run_list); 1932 INIT_LIST_HEAD(&p->rt.run_list);
1695 p->se.on_rq = 0; 1933 p->se.on_rq = 0;
1696 1934
1697#ifdef CONFIG_PREEMPT_NOTIFIERS 1935#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1771 inc_nr_running(p, rq); 2009 inc_nr_running(p, rq);
1772 } 2010 }
1773 check_preempt_curr(rq, p); 2011 check_preempt_curr(rq, p);
2012#ifdef CONFIG_SMP
2013 if (p->sched_class->task_wake_up)
2014 p->sched_class->task_wake_up(rq, p);
2015#endif
1774 task_rq_unlock(rq, &flags); 2016 task_rq_unlock(rq, &flags);
1775} 2017}
1776 2018
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1891 prev_state = prev->state; 2133 prev_state = prev->state;
1892 finish_arch_switch(prev); 2134 finish_arch_switch(prev);
1893 finish_lock_switch(rq, prev); 2135 finish_lock_switch(rq, prev);
2136#ifdef CONFIG_SMP
2137 if (current->sched_class->post_schedule)
2138 current->sched_class->post_schedule(rq);
2139#endif
2140
1894 fire_sched_in_preempt_notifiers(current); 2141 fire_sched_in_preempt_notifiers(current);
1895 if (mm) 2142 if (mm)
1896 mmdrop(mm); 2143 mmdrop(mm);
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2124/* 2371/*
2125 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2372 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2126 */ 2373 */
2127static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2374static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2128 __releases(this_rq->lock) 2375 __releases(this_rq->lock)
2129 __acquires(busiest->lock) 2376 __acquires(busiest->lock)
2130 __acquires(this_rq->lock) 2377 __acquires(this_rq->lock)
2131{ 2378{
2379 int ret = 0;
2380
2132 if (unlikely(!irqs_disabled())) { 2381 if (unlikely(!irqs_disabled())) {
2133 /* printk() doesn't work good under rq->lock */ 2382 /* printk() doesn't work good under rq->lock */
2134 spin_unlock(&this_rq->lock); 2383 spin_unlock(&this_rq->lock);
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2139 spin_unlock(&this_rq->lock); 2388 spin_unlock(&this_rq->lock);
2140 spin_lock(&busiest->lock); 2389 spin_lock(&busiest->lock);
2141 spin_lock(&this_rq->lock); 2390 spin_lock(&this_rq->lock);
2391 ret = 1;
2142 } else 2392 } else
2143 spin_lock(&busiest->lock); 2393 spin_lock(&busiest->lock);
2144 } 2394 }
2395 return ret;
2145} 2396}
2146 2397
2147/* 2398/*
@@ -3485,12 +3736,14 @@ void scheduler_tick(void)
3485 /* 3736 /*
3486 * Let rq->clock advance by at least TICK_NSEC: 3737 * Let rq->clock advance by at least TICK_NSEC:
3487 */ 3738 */
3488 if (unlikely(rq->clock < next_tick)) 3739 if (unlikely(rq->clock < next_tick)) {
3489 rq->clock = next_tick; 3740 rq->clock = next_tick;
3741 rq->clock_underflows++;
3742 }
3490 rq->tick_timestamp = rq->clock; 3743 rq->tick_timestamp = rq->clock;
3491 update_cpu_load(rq); 3744 update_cpu_load(rq);
3492 if (curr != rq->idle) /* FIXME: needed? */ 3745 curr->sched_class->task_tick(rq, curr, 0);
3493 curr->sched_class->task_tick(rq, curr); 3746 update_sched_rt_period(rq);
3494 spin_unlock(&rq->lock); 3747 spin_unlock(&rq->lock);
3495 3748
3496#ifdef CONFIG_SMP 3749#ifdef CONFIG_SMP
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible:
3636 3889
3637 schedule_debug(prev); 3890 schedule_debug(prev);
3638 3891
3892 hrtick_clear(rq);
3893
3639 /* 3894 /*
3640 * Do the rq-clock update outside the rq lock: 3895 * Do the rq-clock update outside the rq lock:
3641 */ 3896 */
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible:
3654 switch_count = &prev->nvcsw; 3909 switch_count = &prev->nvcsw;
3655 } 3910 }
3656 3911
3912#ifdef CONFIG_SMP
3913 if (prev->sched_class->pre_schedule)
3914 prev->sched_class->pre_schedule(rq, prev);
3915#endif
3916
3657 if (unlikely(!rq->nr_running)) 3917 if (unlikely(!rq->nr_running))
3658 idle_balance(cpu, rq); 3918 idle_balance(cpu, rq);
3659 3919
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible:
3668 ++*switch_count; 3928 ++*switch_count;
3669 3929
3670 context_switch(rq, prev, next); /* unlocks the rq */ 3930 context_switch(rq, prev, next); /* unlocks the rq */
3931 /*
3932 * the context switch might have flipped the stack from under
3933 * us, hence refresh the local variables.
3934 */
3935 cpu = smp_processor_id();
3936 rq = cpu_rq(cpu);
3671 } else 3937 } else
3672 spin_unlock_irq(&rq->lock); 3938 spin_unlock_irq(&rq->lock);
3673 3939
3674 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3940 hrtick_set(rq);
3675 cpu = smp_processor_id(); 3941
3676 rq = cpu_rq(cpu); 3942 if (unlikely(reacquire_kernel_lock(current) < 0))
3677 goto need_resched_nonpreemptible; 3943 goto need_resched_nonpreemptible;
3678 } 3944
3679 preempt_enable_no_resched(); 3945 preempt_enable_no_resched();
3680 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3946 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3681 goto need_resched; 3947 goto need_resched;
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule);
3691asmlinkage void __sched preempt_schedule(void) 3957asmlinkage void __sched preempt_schedule(void)
3692{ 3958{
3693 struct thread_info *ti = current_thread_info(); 3959 struct thread_info *ti = current_thread_info();
3694#ifdef CONFIG_PREEMPT_BKL
3695 struct task_struct *task = current; 3960 struct task_struct *task = current;
3696 int saved_lock_depth; 3961 int saved_lock_depth;
3697#endif 3962
3698 /* 3963 /*
3699 * If there is a non-zero preempt_count or interrupts are disabled, 3964 * If there is a non-zero preempt_count or interrupts are disabled,
3700 * we do not want to preempt the current task. Just return.. 3965 * we do not want to preempt the current task. Just return..
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void)
3710 * clear ->lock_depth so that schedule() doesnt 3975 * clear ->lock_depth so that schedule() doesnt
3711 * auto-release the semaphore: 3976 * auto-release the semaphore:
3712 */ 3977 */
3713#ifdef CONFIG_PREEMPT_BKL
3714 saved_lock_depth = task->lock_depth; 3978 saved_lock_depth = task->lock_depth;
3715 task->lock_depth = -1; 3979 task->lock_depth = -1;
3716#endif
3717 schedule(); 3980 schedule();
3718#ifdef CONFIG_PREEMPT_BKL
3719 task->lock_depth = saved_lock_depth; 3981 task->lock_depth = saved_lock_depth;
3720#endif
3721 sub_preempt_count(PREEMPT_ACTIVE); 3982 sub_preempt_count(PREEMPT_ACTIVE);
3722 3983
3723 /* 3984 /*
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule);
3738asmlinkage void __sched preempt_schedule_irq(void) 3999asmlinkage void __sched preempt_schedule_irq(void)
3739{ 4000{
3740 struct thread_info *ti = current_thread_info(); 4001 struct thread_info *ti = current_thread_info();
3741#ifdef CONFIG_PREEMPT_BKL
3742 struct task_struct *task = current; 4002 struct task_struct *task = current;
3743 int saved_lock_depth; 4003 int saved_lock_depth;
3744#endif 4004
3745 /* Catch callers which need to be fixed */ 4005 /* Catch callers which need to be fixed */
3746 BUG_ON(ti->preempt_count || !irqs_disabled()); 4006 BUG_ON(ti->preempt_count || !irqs_disabled());
3747 4007
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
3753 * clear ->lock_depth so that schedule() doesnt 4013 * clear ->lock_depth so that schedule() doesnt
3754 * auto-release the semaphore: 4014 * auto-release the semaphore:
3755 */ 4015 */
3756#ifdef CONFIG_PREEMPT_BKL
3757 saved_lock_depth = task->lock_depth; 4016 saved_lock_depth = task->lock_depth;
3758 task->lock_depth = -1; 4017 task->lock_depth = -1;
3759#endif
3760 local_irq_enable(); 4018 local_irq_enable();
3761 schedule(); 4019 schedule();
3762 local_irq_disable(); 4020 local_irq_disable();
3763#ifdef CONFIG_PREEMPT_BKL
3764 task->lock_depth = saved_lock_depth; 4021 task->lock_depth = saved_lock_depth;
3765#endif
3766 sub_preempt_count(PREEMPT_ACTIVE); 4022 sub_preempt_count(PREEMPT_ACTIVE);
3767 4023
3768 /* 4024 /*
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4019 unsigned long flags; 4275 unsigned long flags;
4020 int oldprio, on_rq, running; 4276 int oldprio, on_rq, running;
4021 struct rq *rq; 4277 struct rq *rq;
4278 const struct sched_class *prev_class = p->sched_class;
4022 4279
4023 BUG_ON(prio < 0 || prio > MAX_PRIO); 4280 BUG_ON(prio < 0 || prio > MAX_PRIO);
4024 4281
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4044 if (on_rq) { 4301 if (on_rq) {
4045 if (running) 4302 if (running)
4046 p->sched_class->set_curr_task(rq); 4303 p->sched_class->set_curr_task(rq);
4304
4047 enqueue_task(rq, p, 0); 4305 enqueue_task(rq, p, 0);
4048 /* 4306
4049 * Reschedule if we are currently running on this runqueue and 4307 check_class_changed(rq, p, prev_class, oldprio, running);
4050 * our priority decreased, or if we are not currently running on
4051 * this runqueue and our priority is higher than the current's
4052 */
4053 if (running) {
4054 if (p->prio > oldprio)
4055 resched_task(rq->curr);
4056 } else {
4057 check_preempt_curr(rq, p);
4058 }
4059 } 4308 }
4060 task_rq_unlock(rq, &flags); 4309 task_rq_unlock(rq, &flags);
4061} 4310}
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice)
4087 goto out_unlock; 4336 goto out_unlock;
4088 } 4337 }
4089 on_rq = p->se.on_rq; 4338 on_rq = p->se.on_rq;
4090 if (on_rq) { 4339 if (on_rq)
4091 dequeue_task(rq, p, 0); 4340 dequeue_task(rq, p, 0);
4092 dec_load(rq, p);
4093 }
4094 4341
4095 p->static_prio = NICE_TO_PRIO(nice); 4342 p->static_prio = NICE_TO_PRIO(nice);
4096 set_load_weight(p); 4343 set_load_weight(p);
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice)
4100 4347
4101 if (on_rq) { 4348 if (on_rq) {
4102 enqueue_task(rq, p, 0); 4349 enqueue_task(rq, p, 0);
4103 inc_load(rq, p);
4104 /* 4350 /*
4105 * If the task increased its priority or is running and 4351 * If the task increased its priority or is running and
4106 * lowered its priority, then reschedule its CPU: 4352 * lowered its priority, then reschedule its CPU:
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
4258{ 4504{
4259 int retval, oldprio, oldpolicy = -1, on_rq, running; 4505 int retval, oldprio, oldpolicy = -1, on_rq, running;
4260 unsigned long flags; 4506 unsigned long flags;
4507 const struct sched_class *prev_class = p->sched_class;
4261 struct rq *rq; 4508 struct rq *rq;
4262 4509
4263 /* may grab non-irq protected spin_locks */ 4510 /* may grab non-irq protected spin_locks */
@@ -4351,18 +4598,10 @@ recheck:
4351 if (on_rq) { 4598 if (on_rq) {
4352 if (running) 4599 if (running)
4353 p->sched_class->set_curr_task(rq); 4600 p->sched_class->set_curr_task(rq);
4601
4354 activate_task(rq, p, 0); 4602 activate_task(rq, p, 0);
4355 /* 4603
4356 * Reschedule if we are currently running on this runqueue and 4604 check_class_changed(rq, p, prev_class, oldprio, running);
4357 * our priority decreased, or if we are not currently running on
4358 * this runqueue and our priority is higher than the current's
4359 */
4360 if (running) {
4361 if (p->prio > oldprio)
4362 resched_task(rq->curr);
4363 } else {
4364 check_preempt_curr(rq, p);
4365 }
4366 } 4605 }
4367 __task_rq_unlock(rq); 4606 __task_rq_unlock(rq);
4368 spin_unlock_irqrestore(&p->pi_lock, flags); 4607 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4490 struct task_struct *p; 4729 struct task_struct *p;
4491 int retval; 4730 int retval;
4492 4731
4493 mutex_lock(&sched_hotcpu_mutex); 4732 get_online_cpus();
4494 read_lock(&tasklist_lock); 4733 read_lock(&tasklist_lock);
4495 4734
4496 p = find_process_by_pid(pid); 4735 p = find_process_by_pid(pid);
4497 if (!p) { 4736 if (!p) {
4498 read_unlock(&tasklist_lock); 4737 read_unlock(&tasklist_lock);
4499 mutex_unlock(&sched_hotcpu_mutex); 4738 put_online_cpus();
4500 return -ESRCH; 4739 return -ESRCH;
4501 } 4740 }
4502 4741
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4536 } 4775 }
4537out_unlock: 4776out_unlock:
4538 put_task_struct(p); 4777 put_task_struct(p);
4539 mutex_unlock(&sched_hotcpu_mutex); 4778 put_online_cpus();
4540 return retval; 4779 return retval;
4541} 4780}
4542 4781
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4593 struct task_struct *p; 4832 struct task_struct *p;
4594 int retval; 4833 int retval;
4595 4834
4596 mutex_lock(&sched_hotcpu_mutex); 4835 get_online_cpus();
4597 read_lock(&tasklist_lock); 4836 read_lock(&tasklist_lock);
4598 4837
4599 retval = -ESRCH; 4838 retval = -ESRCH;
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4609 4848
4610out_unlock: 4849out_unlock:
4611 read_unlock(&tasklist_lock); 4850 read_unlock(&tasklist_lock);
4612 mutex_unlock(&sched_hotcpu_mutex); 4851 put_online_cpus();
4613 4852
4614 return retval; 4853 return retval;
4615} 4854}
@@ -4683,7 +4922,8 @@ static void __cond_resched(void)
4683 } while (need_resched()); 4922 } while (need_resched());
4684} 4923}
4685 4924
4686int __sched cond_resched(void) 4925#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4926int __sched _cond_resched(void)
4687{ 4927{
4688 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4928 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4689 system_state == SYSTEM_RUNNING) { 4929 system_state == SYSTEM_RUNNING) {
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void)
4692 } 4932 }
4693 return 0; 4933 return 0;
4694} 4934}
4695EXPORT_SYMBOL(cond_resched); 4935EXPORT_SYMBOL(_cond_resched);
4936#endif
4696 4937
4697/* 4938/*
4698 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4704,19 +4945,15 @@ EXPORT_SYMBOL(cond_resched);
4704 */ 4945 */
4705int cond_resched_lock(spinlock_t *lock) 4946int cond_resched_lock(spinlock_t *lock)
4706{ 4947{
4948 int resched = need_resched() && system_state == SYSTEM_RUNNING;
4707 int ret = 0; 4949 int ret = 0;
4708 4950
4709 if (need_lockbreak(lock)) { 4951 if (spin_needbreak(lock) || resched) {
4710 spin_unlock(lock); 4952 spin_unlock(lock);
4711 cpu_relax(); 4953 if (resched && need_resched())
4712 ret = 1; 4954 __cond_resched();
4713 spin_lock(lock); 4955 else
4714 } 4956 cpu_relax();
4715 if (need_resched() && system_state == SYSTEM_RUNNING) {
4716 spin_release(&lock->dep_map, 1, _THIS_IP_);
4717 _raw_spin_unlock(lock);
4718 preempt_enable_no_resched();
4719 __cond_resched();
4720 ret = 1; 4957 ret = 1;
4721 spin_lock(lock); 4958 spin_lock(lock);
4722 } 4959 }
@@ -4890,7 +5127,7 @@ out_unlock:
4890 5127
4891static const char stat_nam[] = "RSDTtZX"; 5128static const char stat_nam[] = "RSDTtZX";
4892 5129
4893static void show_task(struct task_struct *p) 5130void sched_show_task(struct task_struct *p)
4894{ 5131{
4895 unsigned long free = 0; 5132 unsigned long free = 0;
4896 unsigned state; 5133 unsigned state;
@@ -4920,8 +5157,7 @@ static void show_task(struct task_struct *p)
4920 printk(KERN_CONT "%5lu %5d %6d\n", free, 5157 printk(KERN_CONT "%5lu %5d %6d\n", free,
4921 task_pid_nr(p), task_pid_nr(p->real_parent)); 5158 task_pid_nr(p), task_pid_nr(p->real_parent));
4922 5159
4923 if (state != TASK_RUNNING) 5160 show_stack(p, NULL);
4924 show_stack(p, NULL);
4925} 5161}
4926 5162
4927void show_state_filter(unsigned long state_filter) 5163void show_state_filter(unsigned long state_filter)
@@ -4943,7 +5179,7 @@ void show_state_filter(unsigned long state_filter)
4943 */ 5179 */
4944 touch_nmi_watchdog(); 5180 touch_nmi_watchdog();
4945 if (!state_filter || (p->state & state_filter)) 5181 if (!state_filter || (p->state & state_filter))
4946 show_task(p); 5182 sched_show_task(p);
4947 } while_each_thread(g, p); 5183 } while_each_thread(g, p);
4948 5184
4949 touch_all_softlockup_watchdogs(); 5185 touch_all_softlockup_watchdogs();
@@ -4992,11 +5228,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4992 spin_unlock_irqrestore(&rq->lock, flags); 5228 spin_unlock_irqrestore(&rq->lock, flags);
4993 5229
4994 /* Set the preempt count _outside_ the spinlocks! */ 5230 /* Set the preempt count _outside_ the spinlocks! */
4995#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4996 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4997#else
4998 task_thread_info(idle)->preempt_count = 0; 5231 task_thread_info(idle)->preempt_count = 0;
4999#endif 5232
5000 /* 5233 /*
5001 * The idle tasks have their own, simple scheduling class: 5234 * The idle tasks have their own, simple scheduling class:
5002 */ 5235 */
@@ -5077,7 +5310,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5077 goto out; 5310 goto out;
5078 } 5311 }
5079 5312
5080 p->cpus_allowed = new_mask; 5313 if (p->sched_class->set_cpus_allowed)
5314 p->sched_class->set_cpus_allowed(p, &new_mask);
5315 else {
5316 p->cpus_allowed = new_mask;
5317 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5318 }
5319
5081 /* Can the task run on the task's current CPU? If so, we're done */ 5320 /* Can the task run on the task's current CPU? If so, we're done */
5082 if (cpu_isset(task_cpu(p), new_mask)) 5321 if (cpu_isset(task_cpu(p), new_mask))
5083 goto out; 5322 goto out;
@@ -5569,9 +5808,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5569 struct rq *rq; 5808 struct rq *rq;
5570 5809
5571 switch (action) { 5810 switch (action) {
5572 case CPU_LOCK_ACQUIRE:
5573 mutex_lock(&sched_hotcpu_mutex);
5574 break;
5575 5811
5576 case CPU_UP_PREPARE: 5812 case CPU_UP_PREPARE:
5577 case CPU_UP_PREPARE_FROZEN: 5813 case CPU_UP_PREPARE_FROZEN:
@@ -5590,6 +5826,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5590 case CPU_ONLINE_FROZEN: 5826 case CPU_ONLINE_FROZEN:
5591 /* Strictly unnecessary, as first user will wake it. */ 5827 /* Strictly unnecessary, as first user will wake it. */
5592 wake_up_process(cpu_rq(cpu)->migration_thread); 5828 wake_up_process(cpu_rq(cpu)->migration_thread);
5829
5830 /* Update our root-domain */
5831 rq = cpu_rq(cpu);
5832 spin_lock_irqsave(&rq->lock, flags);
5833 if (rq->rd) {
5834 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5835 cpu_set(cpu, rq->rd->online);
5836 }
5837 spin_unlock_irqrestore(&rq->lock, flags);
5593 break; 5838 break;
5594 5839
5595#ifdef CONFIG_HOTPLUG_CPU 5840#ifdef CONFIG_HOTPLUG_CPU
@@ -5640,10 +5885,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5640 } 5885 }
5641 spin_unlock_irq(&rq->lock); 5886 spin_unlock_irq(&rq->lock);
5642 break; 5887 break;
5643#endif 5888
5644 case CPU_LOCK_RELEASE: 5889 case CPU_DOWN_PREPARE:
5645 mutex_unlock(&sched_hotcpu_mutex); 5890 /* Update our root-domain */
5891 rq = cpu_rq(cpu);
5892 spin_lock_irqsave(&rq->lock, flags);
5893 if (rq->rd) {
5894 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5895 cpu_clear(cpu, rq->rd->online);
5896 }
5897 spin_unlock_irqrestore(&rq->lock, flags);
5646 break; 5898 break;
5899#endif
5647 } 5900 }
5648 return NOTIFY_OK; 5901 return NOTIFY_OK;
5649} 5902}
@@ -5831,11 +6084,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5831 return 1; 6084 return 1;
5832} 6085}
5833 6086
6087static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6088{
6089 unsigned long flags;
6090 const struct sched_class *class;
6091
6092 spin_lock_irqsave(&rq->lock, flags);
6093
6094 if (rq->rd) {
6095 struct root_domain *old_rd = rq->rd;
6096
6097 for (class = sched_class_highest; class; class = class->next) {
6098 if (class->leave_domain)
6099 class->leave_domain(rq);
6100 }
6101
6102 cpu_clear(rq->cpu, old_rd->span);
6103 cpu_clear(rq->cpu, old_rd->online);
6104
6105 if (atomic_dec_and_test(&old_rd->refcount))
6106 kfree(old_rd);
6107 }
6108
6109 atomic_inc(&rd->refcount);
6110 rq->rd = rd;
6111
6112 cpu_set(rq->cpu, rd->span);
6113 if (cpu_isset(rq->cpu, cpu_online_map))
6114 cpu_set(rq->cpu, rd->online);
6115
6116 for (class = sched_class_highest; class; class = class->next) {
6117 if (class->join_domain)
6118 class->join_domain(rq);
6119 }
6120
6121 spin_unlock_irqrestore(&rq->lock, flags);
6122}
6123
6124static void init_rootdomain(struct root_domain *rd)
6125{
6126 memset(rd, 0, sizeof(*rd));
6127
6128 cpus_clear(rd->span);
6129 cpus_clear(rd->online);
6130}
6131
6132static void init_defrootdomain(void)
6133{
6134 init_rootdomain(&def_root_domain);
6135 atomic_set(&def_root_domain.refcount, 1);
6136}
6137
6138static struct root_domain *alloc_rootdomain(void)
6139{
6140 struct root_domain *rd;
6141
6142 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6143 if (!rd)
6144 return NULL;
6145
6146 init_rootdomain(rd);
6147
6148 return rd;
6149}
6150
5834/* 6151/*
5835 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6152 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5836 * hold the hotplug lock. 6153 * hold the hotplug lock.
5837 */ 6154 */
5838static void cpu_attach_domain(struct sched_domain *sd, int cpu) 6155static void
6156cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5839{ 6157{
5840 struct rq *rq = cpu_rq(cpu); 6158 struct rq *rq = cpu_rq(cpu);
5841 struct sched_domain *tmp; 6159 struct sched_domain *tmp;
@@ -5860,6 +6178,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5860 6178
5861 sched_domain_debug(sd, cpu); 6179 sched_domain_debug(sd, cpu);
5862 6180
6181 rq_attach_root(rq, rd);
5863 rcu_assign_pointer(rq->sd, sd); 6182 rcu_assign_pointer(rq->sd, sd);
5864} 6183}
5865 6184
@@ -6228,6 +6547,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6228static int build_sched_domains(const cpumask_t *cpu_map) 6547static int build_sched_domains(const cpumask_t *cpu_map)
6229{ 6548{
6230 int i; 6549 int i;
6550 struct root_domain *rd;
6231#ifdef CONFIG_NUMA 6551#ifdef CONFIG_NUMA
6232 struct sched_group **sched_group_nodes = NULL; 6552 struct sched_group **sched_group_nodes = NULL;
6233 int sd_allnodes = 0; 6553 int sd_allnodes = 0;
@@ -6244,6 +6564,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6244 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6564 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6245#endif 6565#endif
6246 6566
6567 rd = alloc_rootdomain();
6568 if (!rd) {
6569 printk(KERN_WARNING "Cannot alloc root domain\n");
6570 return -ENOMEM;
6571 }
6572
6247 /* 6573 /*
6248 * Set up domains for cpus specified by the cpu_map. 6574 * Set up domains for cpus specified by the cpu_map.
6249 */ 6575 */
@@ -6460,7 +6786,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6460#else 6786#else
6461 sd = &per_cpu(phys_domains, i); 6787 sd = &per_cpu(phys_domains, i);
6462#endif 6788#endif
6463 cpu_attach_domain(sd, i); 6789 cpu_attach_domain(sd, rd, i);
6464 } 6790 }
6465 6791
6466 return 0; 6792 return 0;
@@ -6518,7 +6844,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6518 unregister_sched_domain_sysctl(); 6844 unregister_sched_domain_sysctl();
6519 6845
6520 for_each_cpu_mask(i, *cpu_map) 6846 for_each_cpu_mask(i, *cpu_map)
6521 cpu_attach_domain(NULL, i); 6847 cpu_attach_domain(NULL, &def_root_domain, i);
6522 synchronize_sched(); 6848 synchronize_sched();
6523 arch_destroy_sched_domains(cpu_map); 6849 arch_destroy_sched_domains(cpu_map);
6524} 6850}
@@ -6548,6 +6874,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6548{ 6874{
6549 int i, j; 6875 int i, j;
6550 6876
6877 lock_doms_cur();
6878
6551 /* always unregister in case we don't destroy any domains */ 6879 /* always unregister in case we don't destroy any domains */
6552 unregister_sched_domain_sysctl(); 6880 unregister_sched_domain_sysctl();
6553 6881
@@ -6588,6 +6916,8 @@ match2:
6588 ndoms_cur = ndoms_new; 6916 ndoms_cur = ndoms_new;
6589 6917
6590 register_sched_domain_sysctl(); 6918 register_sched_domain_sysctl();
6919
6920 unlock_doms_cur();
6591} 6921}
6592 6922
6593#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6923#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6595,10 +6925,10 @@ static int arch_reinit_sched_domains(void)
6595{ 6925{
6596 int err; 6926 int err;
6597 6927
6598 mutex_lock(&sched_hotcpu_mutex); 6928 get_online_cpus();
6599 detach_destroy_domains(&cpu_online_map); 6929 detach_destroy_domains(&cpu_online_map);
6600 err = arch_init_sched_domains(&cpu_online_map); 6930 err = arch_init_sched_domains(&cpu_online_map);
6601 mutex_unlock(&sched_hotcpu_mutex); 6931 put_online_cpus();
6602 6932
6603 return err; 6933 return err;
6604} 6934}
@@ -6709,12 +7039,12 @@ void __init sched_init_smp(void)
6709{ 7039{
6710 cpumask_t non_isolated_cpus; 7040 cpumask_t non_isolated_cpus;
6711 7041
6712 mutex_lock(&sched_hotcpu_mutex); 7042 get_online_cpus();
6713 arch_init_sched_domains(&cpu_online_map); 7043 arch_init_sched_domains(&cpu_online_map);
6714 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7044 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6715 if (cpus_empty(non_isolated_cpus)) 7045 if (cpus_empty(non_isolated_cpus))
6716 cpu_set(smp_processor_id(), non_isolated_cpus); 7046 cpu_set(smp_processor_id(), non_isolated_cpus);
6717 mutex_unlock(&sched_hotcpu_mutex); 7047 put_online_cpus();
6718 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7048 /* XXX: Theoretical race here - CPU may be hotplugged now */
6719 hotcpu_notifier(update_sched_domains, 0); 7049 hotcpu_notifier(update_sched_domains, 0);
6720 7050
@@ -6722,6 +7052,21 @@ void __init sched_init_smp(void)
6722 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7052 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6723 BUG(); 7053 BUG();
6724 sched_init_granularity(); 7054 sched_init_granularity();
7055
7056#ifdef CONFIG_FAIR_GROUP_SCHED
7057 if (nr_cpu_ids == 1)
7058 return;
7059
7060 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7061 "group_balance");
7062 if (!IS_ERR(lb_monitor_task)) {
7063 lb_monitor_task->flags |= PF_NOFREEZE;
7064 wake_up_process(lb_monitor_task);
7065 } else {
7066 printk(KERN_ERR "Could not create load balance monitor thread"
7067 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7068 }
7069#endif
6725} 7070}
6726#else 7071#else
6727void __init sched_init_smp(void) 7072void __init sched_init_smp(void)
@@ -6746,13 +7091,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6746 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7091 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6747} 7092}
6748 7093
7094static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7095{
7096 struct rt_prio_array *array;
7097 int i;
7098
7099 array = &rt_rq->active;
7100 for (i = 0; i < MAX_RT_PRIO; i++) {
7101 INIT_LIST_HEAD(array->queue + i);
7102 __clear_bit(i, array->bitmap);
7103 }
7104 /* delimiter for bitsearch: */
7105 __set_bit(MAX_RT_PRIO, array->bitmap);
7106
7107#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7108 rt_rq->highest_prio = MAX_RT_PRIO;
7109#endif
7110#ifdef CONFIG_SMP
7111 rt_rq->rt_nr_migratory = 0;
7112 rt_rq->overloaded = 0;
7113#endif
7114
7115 rt_rq->rt_time = 0;
7116 rt_rq->rt_throttled = 0;
7117
7118#ifdef CONFIG_FAIR_GROUP_SCHED
7119 rt_rq->rq = rq;
7120#endif
7121}
7122
7123#ifdef CONFIG_FAIR_GROUP_SCHED
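/*
 * init_tg_cfs_entry() wires up one cpu's slice of a task group: tg->cfs_rq[cpu]
 * is the group's per-cpu runqueue, tg->se[cpu] the entity that represents that
 * runqueue on the cpu's top-level rq->cfs. 'add' controls whether the runqueue
 * is linked onto rq->leaf_cfs_rq_list immediately (boot-time init below) or
 * left for the caller (sched_create_group(), which links it under the task
 * group list lock). init_tg_rt_entry() does the same wiring for the RT side.
 */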
7124static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7125 struct cfs_rq *cfs_rq, struct sched_entity *se,
7126 int cpu, int add)
7127{
7128 tg->cfs_rq[cpu] = cfs_rq;
7129 init_cfs_rq(cfs_rq, rq);
7130 cfs_rq->tg = tg;
7131 if (add)
7132 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7133
7134 tg->se[cpu] = se;
7135 se->cfs_rq = &rq->cfs;
7136 se->my_q = cfs_rq;
7137 se->load.weight = tg->shares;
7138 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7139 se->parent = NULL;
7140}
7141
7142static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7143 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7144 int cpu, int add)
7145{
7146 tg->rt_rq[cpu] = rt_rq;
7147 init_rt_rq(rt_rq, rq);
7148 rt_rq->tg = tg;
7149 rt_rq->rt_se = rt_se;
7150 if (add)
7151 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7152
7153 tg->rt_se[cpu] = rt_se;
7154 rt_se->rt_rq = &rq->rt;
7155 rt_se->my_q = rt_rq;
7156 rt_se->parent = NULL;
7157 INIT_LIST_HEAD(&rt_se->run_list);
7158}
7159#endif
7160
6749void __init sched_init(void) 7161void __init sched_init(void)
6750{ 7162{
6751 int highest_cpu = 0; 7163 int highest_cpu = 0;
6752 int i, j; 7164 int i, j;
6753 7165
7166#ifdef CONFIG_SMP
7167 init_defrootdomain();
7168#endif
7169
7170#ifdef CONFIG_FAIR_GROUP_SCHED
7171 list_add(&init_task_group.list, &task_groups);
7172#endif
7173
6754 for_each_possible_cpu(i) { 7174 for_each_possible_cpu(i) {
6755 struct rt_prio_array *array;
6756 struct rq *rq; 7175 struct rq *rq;
6757 7176
6758 rq = cpu_rq(i); 7177 rq = cpu_rq(i);
@@ -6761,52 +7180,39 @@ void __init sched_init(void)
6761 rq->nr_running = 0; 7180 rq->nr_running = 0;
6762 rq->clock = 1; 7181 rq->clock = 1;
6763 init_cfs_rq(&rq->cfs, rq); 7182 init_cfs_rq(&rq->cfs, rq);
7183 init_rt_rq(&rq->rt, rq);
6764#ifdef CONFIG_FAIR_GROUP_SCHED 7184#ifdef CONFIG_FAIR_GROUP_SCHED
6765 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6766 {
6767 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6768 struct sched_entity *se =
6769 &per_cpu(init_sched_entity, i);
6770
6771 init_cfs_rq_p[i] = cfs_rq;
6772 init_cfs_rq(cfs_rq, rq);
6773 cfs_rq->tg = &init_task_group;
6774 list_add(&cfs_rq->leaf_cfs_rq_list,
6775 &rq->leaf_cfs_rq_list);
6776
6777 init_sched_entity_p[i] = se;
6778 se->cfs_rq = &rq->cfs;
6779 se->my_q = cfs_rq;
6780 se->load.weight = init_task_group_load;
6781 se->load.inv_weight =
6782 div64_64(1ULL<<32, init_task_group_load);
6783 se->parent = NULL;
6784 }
6785 init_task_group.shares = init_task_group_load; 7185 init_task_group.shares = init_task_group_load;
6786 spin_lock_init(&init_task_group.lock); 7186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7187 init_tg_cfs_entry(rq, &init_task_group,
7188 &per_cpu(init_cfs_rq, i),
7189 &per_cpu(init_sched_entity, i), i, 1);
7190
7191 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7192 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7193 init_tg_rt_entry(rq, &init_task_group,
7194 &per_cpu(init_rt_rq, i),
7195 &per_cpu(init_sched_rt_entity, i), i, 1);
6787#endif 7196#endif
7197 rq->rt_period_expire = 0;
7198 rq->rt_throttled = 0;
6788 7199
6789 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7200 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6790 rq->cpu_load[j] = 0; 7201 rq->cpu_load[j] = 0;
6791#ifdef CONFIG_SMP 7202#ifdef CONFIG_SMP
6792 rq->sd = NULL; 7203 rq->sd = NULL;
7204 rq->rd = NULL;
6793 rq->active_balance = 0; 7205 rq->active_balance = 0;
6794 rq->next_balance = jiffies; 7206 rq->next_balance = jiffies;
6795 rq->push_cpu = 0; 7207 rq->push_cpu = 0;
6796 rq->cpu = i; 7208 rq->cpu = i;
6797 rq->migration_thread = NULL; 7209 rq->migration_thread = NULL;
6798 INIT_LIST_HEAD(&rq->migration_queue); 7210 INIT_LIST_HEAD(&rq->migration_queue);
7211 rq_attach_root(rq, &def_root_domain);
6799#endif 7212#endif
7213 init_rq_hrtick(rq);
6800 atomic_set(&rq->nr_iowait, 0); 7214 atomic_set(&rq->nr_iowait, 0);
6801
6802 array = &rq->rt.active;
6803 for (j = 0; j < MAX_RT_PRIO; j++) {
6804 INIT_LIST_HEAD(array->queue + j);
6805 __clear_bit(j, array->bitmap);
6806 }
6807 highest_cpu = i; 7215 highest_cpu = i;
6808 /* delimiter for bitsearch: */
6809 __set_bit(MAX_RT_PRIO, array->bitmap);
6810 } 7216 }
6811 7217
6812 set_load_weight(&init_task); 7218 set_load_weight(&init_task);
@@ -6975,12 +7381,187 @@ void set_curr_task(int cpu, struct task_struct *p)
6975 7381
6976#ifdef CONFIG_FAIR_GROUP_SCHED 7382#ifdef CONFIG_FAIR_GROUP_SCHED
6977 7383
7384#ifdef CONFIG_SMP
7385/*
7386 * distribute shares of all task groups among their schedulable entities,
7387 * to reflect load distribution across cpus.
7388 */
7389static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7390{
7391 struct cfs_rq *cfs_rq;
7392 struct rq *rq = cpu_rq(this_cpu);
7393 cpumask_t sdspan = sd->span;
7394 int balanced = 1;
7395
7396 /* Walk through all the task groups that we have */
7397 for_each_leaf_cfs_rq(rq, cfs_rq) {
7398 int i;
7399 unsigned long total_load = 0, total_shares;
7400 struct task_group *tg = cfs_rq->tg;
7401
7402 /* Gather total task load of this group across cpus */
7403 for_each_cpu_mask(i, sdspan)
7404 total_load += tg->cfs_rq[i]->load.weight;
7405
7406 /* Nothing to do if this group has no load */
7407 if (!total_load)
7408 continue;
7409
7410 /*
7411 * tg->shares represents the number of cpu shares the task group
7412 * is eligible to hold on a single cpu. On N cpus, it is
7413 * eligible to hold (N * tg->shares) cpu shares.
7414 */
7415 total_shares = tg->shares * cpus_weight(sdspan);
7416
7417 /*
7418 * redistribute total_shares across cpus as per the task load
7419 * distribution.
7420 */
7421 for_each_cpu_mask(i, sdspan) {
7422 unsigned long local_load, local_shares;
7423
7424 local_load = tg->cfs_rq[i]->load.weight;
7425 local_shares = (local_load * total_shares) / total_load;
7426 if (!local_shares)
7427 local_shares = MIN_GROUP_SHARES;
7428 if (local_shares == tg->se[i]->load.weight)
7429 continue;
7430
7431 spin_lock_irq(&cpu_rq(i)->lock);
7432 set_se_shares(tg->se[i], local_shares);
7433 spin_unlock_irq(&cpu_rq(i)->lock);
7434 balanced = 0;
7435 }
7436 }
7437
7438 return balanced;
7439}
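
/*
 * A worked example of the redistribution above (figures are illustrative,
 * not taken from the patch): a group with tg->shares = 1024 on a 2-cpu
 * domain gets total_shares = 2048. If its queued load is 300 on cpu0 and
 * 100 on cpu1 (total_load = 400), then:
 *
 *	local_shares(cpu0) = 300 * 2048 / 400 = 1536
 *	local_shares(cpu1) = 100 * 2048 / 400 =  512
 *
 * i.e. the busier cpu's group entity ends up with the larger weight; a cpu
 * whose computed share would drop to 0 is clamped to MIN_GROUP_SHARES.
 */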
7440
7441/*
7442 * How frequently should we rebalance_shares() across cpus?
7443 *
7444 * The more frequently we rebalance shares, the more accurate the fairness
7445 * of cpu bandwidth distribution between task groups becomes. However, a
7446 * higher frequency also implies increased scheduling overhead.
7447 *
7448 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7449 * consecutive calls to rebalance_shares() in the same sched domain.
7450 *
7451 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7452 * consecutive calls to rebalance_shares() in the same sched domain.
7453 *
7454 * These settings allow for the appropriate trade-off between accuracy of
7455 * fairness and the associated overhead.
7456 *
7457 */
7458
7459/* default: 8ms, units: milliseconds */
7460const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7461
7462/* default: 128ms, units: milliseconds */
7463const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
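
/*
 * Behaviour with the defaults above, as a rough timeline: as long as a
 * pass of rebalance_shares() still has to adjust something, the monitor
 * below keeps polling every 8ms; once a pass finds everything balanced,
 * the interval doubles on each subsequent balanced pass
 * (8 -> 16 -> 32 -> 64 -> 128ms) and then stays at 128ms until an
 * imbalance is seen again, which drops it back to 8ms.
 */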
7464
7465/* kernel thread that runs rebalance_shares() periodically */
7466static int load_balance_monitor(void *unused)
7467{
7468 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7469 struct sched_param schedparm;
7470 int ret;
7471
7472 /*
7473 * We don't want this thread's execution to be limited by the shares
7474 * assigned to the default group (init_task_group). Hence make it run
7475 * as a SCHED_RR RT task at the lowest priority.
7476 */
7477 schedparm.sched_priority = 1;
7478 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7479 if (ret)
7480 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7481 " monitor thread (error = %d) \n", ret);
7482
7483 while (!kthread_should_stop()) {
7484 int i, cpu, balanced = 1;
7485
7486 /* Prevent cpus going down or coming up */
7487 get_online_cpus();
7488 /* lockout changes to doms_cur[] array */
7489 lock_doms_cur();
7490 /*
7491 * Enter an RCU read-side critical section to safely walk the rq->sd
7492 * chain on various cpus and the task group list
7493 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7494 */
7495 rcu_read_lock();
7496
7497 for (i = 0; i < ndoms_cur; i++) {
7498 cpumask_t cpumap = doms_cur[i];
7499 struct sched_domain *sd = NULL, *sd_prev = NULL;
7500
7501 cpu = first_cpu(cpumap);
7502
7503 /* Find the highest domain at which to balance shares */
7504 for_each_domain(cpu, sd) {
7505 if (!(sd->flags & SD_LOAD_BALANCE))
7506 continue;
7507 sd_prev = sd;
7508 }
7509
7510 sd = sd_prev;
7511 /* sd == NULL? No load balancing required in this domain */
7512 if (!sd)
7513 continue;
7514
7515 balanced &= rebalance_shares(sd, cpu);
7516 }
7517
7518 rcu_read_unlock();
7519
7520 unlock_doms_cur();
7521 put_online_cpus();
7522
7523 if (!balanced)
7524 timeout = sysctl_sched_min_bal_int_shares;
7525 else if (timeout < sysctl_sched_max_bal_int_shares)
7526 timeout *= 2;
7527
7528 msleep_interruptible(timeout);
7529 }
7530
7531 return 0;
7532}
7533#endif /* CONFIG_SMP */
7534
7535static void free_sched_group(struct task_group *tg)
7536{
7537 int i;
7538
7539 for_each_possible_cpu(i) {
7540 if (tg->cfs_rq)
7541 kfree(tg->cfs_rq[i]);
7542 if (tg->se)
7543 kfree(tg->se[i]);
7544 if (tg->rt_rq)
7545 kfree(tg->rt_rq[i]);
7546 if (tg->rt_se)
7547 kfree(tg->rt_se[i]);
7548 }
7549
7550 kfree(tg->cfs_rq);
7551 kfree(tg->se);
7552 kfree(tg->rt_rq);
7553 kfree(tg->rt_se);
7554 kfree(tg);
7555}
7556
6978/* allocate runqueue etc for a new task group */ 7557/* allocate runqueue etc for a new task group */
6979struct task_group *sched_create_group(void) 7558struct task_group *sched_create_group(void)
6980{ 7559{
6981 struct task_group *tg; 7560 struct task_group *tg;
6982 struct cfs_rq *cfs_rq; 7561 struct cfs_rq *cfs_rq;
6983 struct sched_entity *se; 7562 struct sched_entity *se;
7563 struct rt_rq *rt_rq;
7564 struct sched_rt_entity *rt_se;
6984 struct rq *rq; 7565 struct rq *rq;
6985 int i; 7566 int i;
6986 7567
@@ -6994,97 +7575,89 @@ struct task_group *sched_create_group(void)
6994 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7575 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6995 if (!tg->se) 7576 if (!tg->se)
6996 goto err; 7577 goto err;
7578 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7579 if (!tg->rt_rq)
7580 goto err;
7581 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7582 if (!tg->rt_se)
7583 goto err;
7584
7585 tg->shares = NICE_0_LOAD;
7586 tg->rt_ratio = 0; /* XXX */
6997 7587
6998 for_each_possible_cpu(i) { 7588 for_each_possible_cpu(i) {
6999 rq = cpu_rq(i); 7589 rq = cpu_rq(i);
7000 7590
7001 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7591 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
7002 cpu_to_node(i)); 7592 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7003 if (!cfs_rq) 7593 if (!cfs_rq)
7004 goto err; 7594 goto err;
7005 7595
7006 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7596 se = kmalloc_node(sizeof(struct sched_entity),
7007 cpu_to_node(i)); 7597 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7008 if (!se) 7598 if (!se)
7009 goto err; 7599 goto err;
7010 7600
7011 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7601 rt_rq = kmalloc_node(sizeof(struct rt_rq),
7012 memset(se, 0, sizeof(struct sched_entity)); 7602 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7603 if (!rt_rq)
7604 goto err;
7013 7605
7014 tg->cfs_rq[i] = cfs_rq; 7606 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
7015 init_cfs_rq(cfs_rq, rq); 7607 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7016 cfs_rq->tg = tg; 7608 if (!rt_se)
7609 goto err;
7017 7610
7018 tg->se[i] = se; 7611 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
7019 se->cfs_rq = &rq->cfs; 7612 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
7020 se->my_q = cfs_rq;
7021 se->load.weight = NICE_0_LOAD;
7022 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
7023 se->parent = NULL;
7024 } 7613 }
7025 7614
7615 lock_task_group_list();
7026 for_each_possible_cpu(i) { 7616 for_each_possible_cpu(i) {
7027 rq = cpu_rq(i); 7617 rq = cpu_rq(i);
7028 cfs_rq = tg->cfs_rq[i]; 7618 cfs_rq = tg->cfs_rq[i];
7029 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7619 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7620 rt_rq = tg->rt_rq[i];
7621 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7030 } 7622 }
7031 7623 list_add_rcu(&tg->list, &task_groups);
7032 tg->shares = NICE_0_LOAD; 7624 unlock_task_group_list();
7033 spin_lock_init(&tg->lock);
7034 7625
7035 return tg; 7626 return tg;
7036 7627
7037err: 7628err:
7038 for_each_possible_cpu(i) { 7629 free_sched_group(tg);
7039 if (tg->cfs_rq)
7040 kfree(tg->cfs_rq[i]);
7041 if (tg->se)
7042 kfree(tg->se[i]);
7043 }
7044 kfree(tg->cfs_rq);
7045 kfree(tg->se);
7046 kfree(tg);
7047
7048 return ERR_PTR(-ENOMEM); 7630 return ERR_PTR(-ENOMEM);
7049} 7631}
7050 7632
7051/* rcu callback to free various structures associated with a task group */ 7633/* rcu callback to free various structures associated with a task group */
7052static void free_sched_group(struct rcu_head *rhp) 7634static void free_sched_group_rcu(struct rcu_head *rhp)
7053{ 7635{
7054 struct task_group *tg = container_of(rhp, struct task_group, rcu);
7055 struct cfs_rq *cfs_rq;
7056 struct sched_entity *se;
7057 int i;
7058
7059 /* now it should be safe to free those cfs_rqs */ 7636 /* now it should be safe to free those cfs_rqs */
7060 for_each_possible_cpu(i) { 7637 free_sched_group(container_of(rhp, struct task_group, rcu));
7061 cfs_rq = tg->cfs_rq[i];
7062 kfree(cfs_rq);
7063
7064 se = tg->se[i];
7065 kfree(se);
7066 }
7067
7068 kfree(tg->cfs_rq);
7069 kfree(tg->se);
7070 kfree(tg);
7071} 7638}
7072 7639
7073/* Destroy runqueue etc associated with a task group */ 7640/* Destroy runqueue etc associated with a task group */
7074void sched_destroy_group(struct task_group *tg) 7641void sched_destroy_group(struct task_group *tg)
7075{ 7642{
7076 struct cfs_rq *cfs_rq = NULL; 7643 struct cfs_rq *cfs_rq = NULL;
7644 struct rt_rq *rt_rq = NULL;
7077 int i; 7645 int i;
7078 7646
7647 lock_task_group_list();
7079 for_each_possible_cpu(i) { 7648 for_each_possible_cpu(i) {
7080 cfs_rq = tg->cfs_rq[i]; 7649 cfs_rq = tg->cfs_rq[i];
7081 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7650 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7651 rt_rq = tg->rt_rq[i];
7652 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7082 } 7653 }
7654 list_del_rcu(&tg->list);
7655 unlock_task_group_list();
7083 7656
7084 BUG_ON(!cfs_rq); 7657 BUG_ON(!cfs_rq);
7085 7658
7086 /* wait for possible concurrent references to cfs_rqs to complete */ 7659 /* wait for possible concurrent references to cfs_rqs to complete */
7087 call_rcu(&tg->rcu, free_sched_group); 7660 call_rcu(&tg->rcu, free_sched_group_rcu);
7088} 7661}
7089 7662
7090/* change task's runqueue when it moves between groups. 7663/* change task's runqueue when it moves between groups.
@@ -7100,11 +7673,6 @@ void sched_move_task(struct task_struct *tsk)
7100 7673
7101 rq = task_rq_lock(tsk, &flags); 7674 rq = task_rq_lock(tsk, &flags);
7102 7675
7103 if (tsk->sched_class != &fair_sched_class) {
7104 set_task_cfs_rq(tsk, task_cpu(tsk));
7105 goto done;
7106 }
7107
7108 update_rq_clock(rq); 7676 update_rq_clock(rq);
7109 7677
7110 running = task_current(rq, tsk); 7678 running = task_current(rq, tsk);
@@ -7116,7 +7684,7 @@ void sched_move_task(struct task_struct *tsk)
7116 tsk->sched_class->put_prev_task(rq, tsk); 7684 tsk->sched_class->put_prev_task(rq, tsk);
7117 } 7685 }
7118 7686
7119 set_task_cfs_rq(tsk, task_cpu(tsk)); 7687 set_task_rq(tsk, task_cpu(tsk));
7120 7688
7121 if (on_rq) { 7689 if (on_rq) {
7122 if (unlikely(running)) 7690 if (unlikely(running))
@@ -7124,53 +7692,82 @@ void sched_move_task(struct task_struct *tsk)
7124 enqueue_task(rq, tsk, 0); 7692 enqueue_task(rq, tsk, 0);
7125 } 7693 }
7126 7694
7127done:
7128 task_rq_unlock(rq, &flags); 7695 task_rq_unlock(rq, &flags);
7129} 7696}
7130 7697
7698/* rq->lock to be locked by caller */
7131static void set_se_shares(struct sched_entity *se, unsigned long shares) 7699static void set_se_shares(struct sched_entity *se, unsigned long shares)
7132{ 7700{
7133 struct cfs_rq *cfs_rq = se->cfs_rq; 7701 struct cfs_rq *cfs_rq = se->cfs_rq;
7134 struct rq *rq = cfs_rq->rq; 7702 struct rq *rq = cfs_rq->rq;
7135 int on_rq; 7703 int on_rq;
7136 7704
7137 spin_lock_irq(&rq->lock); 7705 if (!shares)
7706 shares = MIN_GROUP_SHARES;
7138 7707
7139 on_rq = se->on_rq; 7708 on_rq = se->on_rq;
7140 if (on_rq) 7709 if (on_rq) {
7141 dequeue_entity(cfs_rq, se, 0); 7710 dequeue_entity(cfs_rq, se, 0);
7711 dec_cpu_load(rq, se->load.weight);
7712 }
7142 7713
7143 se->load.weight = shares; 7714 se->load.weight = shares;
7144 se->load.inv_weight = div64_64((1ULL<<32), shares); 7715 se->load.inv_weight = div64_64((1ULL<<32), shares);
7145 7716
7146 if (on_rq) 7717 if (on_rq) {
7147 enqueue_entity(cfs_rq, se, 0); 7718 enqueue_entity(cfs_rq, se, 0);
7148 7719 inc_cpu_load(rq, se->load.weight);
7149 spin_unlock_irq(&rq->lock); 7720 }
7150} 7721}
7151 7722
7152int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7723int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7153{ 7724{
7154 int i; 7725 int i;
7726 struct cfs_rq *cfs_rq;
7727 struct rq *rq;
7728
7729 lock_task_group_list();
7730 if (tg->shares == shares)
7731 goto done;
7732
7733 if (shares < MIN_GROUP_SHARES)
7734 shares = MIN_GROUP_SHARES;
7155 7735
7156 /* 7736 /*
7157 * A weight of 0 or 1 can cause arithmetics problems. 7737 * Prevent any load balance activity (rebalance_shares,
7158 * (The default weight is 1024 - so there's no practical 7738 * load_balance_fair) from referring to this group first,
7159 * limitation from this.) 7739 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7160 */ 7740 */
7161 if (shares < 2) 7741 for_each_possible_cpu(i) {
7162 shares = 2; 7742 cfs_rq = tg->cfs_rq[i];
7743 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7744 }
7163 7745
7164 spin_lock(&tg->lock); 7746 /* wait for any ongoing reference to this group to finish */
7165 if (tg->shares == shares) 7747 synchronize_sched();
7166 goto done;
7167 7748
7749 /*
7750 * Now we are free to modify the group's share on each cpu
7751 * without tripping rebalance_shares or load_balance_fair.
7752 */
7168 tg->shares = shares; 7753 tg->shares = shares;
7169 for_each_possible_cpu(i) 7754 for_each_possible_cpu(i) {
7755 spin_lock_irq(&cpu_rq(i)->lock);
7170 set_se_shares(tg->se[i], shares); 7756 set_se_shares(tg->se[i], shares);
7757 spin_unlock_irq(&cpu_rq(i)->lock);
7758 }
7171 7759
7760 /*
7761 * Enable load balance activity on this group, by inserting it back on
7762 * each cpu's rq->leaf_cfs_rq_list.
7763 */
7764 for_each_possible_cpu(i) {
7765 rq = cpu_rq(i);
7766 cfs_rq = tg->cfs_rq[i];
7767 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7768 }
7172done: 7769done:
7173 spin_unlock(&tg->lock); 7770 unlock_task_group_list();
7174 return 0; 7771 return 0;
7175} 7772}
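
/*
 * A minimal sketch of a hypothetical in-kernel caller of the interface
 * above (illustration only, not part of this file's call graph; the
 * function name is made up and error handling is trimmed):
 */
#if 0
static int example_create_boosted_group(void)
{
	struct task_group *tg;

	tg = sched_create_group();
	if (IS_ERR(tg))
		return PTR_ERR(tg);

	/* give the new group twice the default (NICE_0_LOAD) weight */
	return sched_group_set_shares(tg, 2 * NICE_0_LOAD);
}
#endif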
7176 7773
@@ -7179,6 +7776,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7179 return tg->shares; 7776 return tg->shares;
7180} 7777}
7181 7778
7779/*
7780 * Ensure the total rt_ratio across all task groups stays <= sysctl_sched_rt_ratio
7781 */
7782int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7783{
7784 struct task_group *tgi;
7785 unsigned long total = 0;
7786
7787 rcu_read_lock();
7788 list_for_each_entry_rcu(tgi, &task_groups, list)
7789 total += tgi->rt_ratio;
7790 rcu_read_unlock();
7791
7792 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7793 return -EINVAL;
7794
7795 tg->rt_ratio = rt_ratio;
7796 return 0;
7797}
7798
7799unsigned long sched_group_rt_ratio(struct task_group *tg)
7800{
7801 return tg->rt_ratio;
7802}
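
/*
 * Illustration of the admission check above, with made-up numbers: if
 * sysctl_sched_rt_ratio were 1000 and three groups currently held ratios
 * of 300, 200 and 100 (total = 600), then raising the 200 group to 500
 * would be allowed (600 + 500 - 200 = 900 <= 1000), while raising it to
 * 700 would be rejected with -EINVAL (600 + 700 - 200 = 1100 > 1000).
 */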
7803
7182#endif /* CONFIG_FAIR_GROUP_SCHED */ 7804#endif /* CONFIG_FAIR_GROUP_SCHED */
7183 7805
7184#ifdef CONFIG_FAIR_CGROUP_SCHED 7806#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7254,12 +7876,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7254 return (u64) tg->shares; 7876 return (u64) tg->shares;
7255} 7877}
7256 7878
7879static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7880 u64 rt_ratio_val)
7881{
7882 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7883}
7884
7885static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7886{
7887 struct task_group *tg = cgroup_tg(cgrp);
7888
7889 return (u64) tg->rt_ratio;
7890}
7891
7257static struct cftype cpu_files[] = { 7892static struct cftype cpu_files[] = {
7258 { 7893 {
7259 .name = "shares", 7894 .name = "shares",
7260 .read_uint = cpu_shares_read_uint, 7895 .read_uint = cpu_shares_read_uint,
7261 .write_uint = cpu_shares_write_uint, 7896 .write_uint = cpu_shares_write_uint,
7262 }, 7897 },
7898 {
7899 .name = "rt_ratio",
7900 .read_uint = cpu_rt_ratio_read_uint,
7901 .write_uint = cpu_rt_ratio_write_uint,
7902 },
7263}; 7903};
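
/*
 * Once the cpu cgroup subsystem is mounted, the entries above appear as
 * per-group control files. A plausible session (mount point, group name
 * and values are only an example):
 *
 *	# mount -t cgroup -o cpu none /dev/cpuctl
 *	# mkdir /dev/cpuctl/browsers
 *	# echo 2048 > /dev/cpuctl/browsers/cpu.shares
 *	# cat /dev/cpuctl/browsers/cpu.rt_ratio
 */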
7264 7904
7265static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7905static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)