Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	1400
1 file changed, 1020 insertions, 380 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11ca6df3..ba4c88088f62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@ | |||
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
25 | */ | 27 | */ |
26 | 28 | ||
27 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
@@ -63,6 +65,7 @@ | |||
63 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
64 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
65 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | ||
66 | 69 | ||
67 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
68 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
96 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
97 | 100 | ||
98 | /* | 101 | /* |
99 | * Some helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
100 | */ | 103 | */ |
101 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
102 | #define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) | ||
103 | 105 | ||
104 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
105 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
@@ -159,6 +161,8 @@ struct rt_prio_array { | |||
159 | 161 | ||
160 | struct cfs_rq; | 162 | struct cfs_rq; |
161 | 163 | ||
164 | static LIST_HEAD(task_groups); | ||
165 | |||
162 | /* task group related information */ | 166 | /* task group related information */ |
163 | struct task_group { | 167 | struct task_group { |
164 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -168,10 +172,50 @@ struct task_group { | |||
168 | struct sched_entity **se; | 172 | struct sched_entity **se; |
169 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
170 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
175 | |||
176 | struct sched_rt_entity **rt_se; | ||
177 | struct rt_rq **rt_rq; | ||
178 | |||
179 | unsigned int rt_ratio; | ||
180 | |||
181 | /* | ||
182 | * The shares assigned to a task group govern how much of the cpu | ||
183 | * bandwidth is allocated to the group: the more shares a group has, | ||
184 | * the more cpu bandwidth is allocated to it. | ||
185 | * | ||
186 | * For example, let's say that there are three task groups, A, B and C, which | ||
187 | * have been assigned shares 1000, 2000 and 3000 respectively. Then, | ||
188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
189 | * should be: | ||
190 | * | ||
191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
194 | * | ||
195 | * The weight assigned to a task group's schedulable entities on every | ||
196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
197 | * group's shares. For example, let's say that task group A has been | ||
198 | * assigned shares of 1000 and there are two CPUs in the system. Then, | ||
199 | * | ||
200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
201 | * | ||
202 | * Note: It's not necessary that each of a task group's schedulable | ||
203 | * entities has the same weight on all CPUs. If the group | ||
204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
205 | * better distribution of weight could be: | ||
206 | * | ||
207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
208 | * tg_A->se[1]->load.weight = 1/3 * 2000 = 667 | ||
209 | * | ||
210 | * rebalance_shares() is responsible for distributing the shares of a | ||
211 | * task group like this among the group's schedulable entities across | ||
212 | * cpus. | ||
213 | * | ||
214 | */ | ||
171 | unsigned long shares; | 215 | unsigned long shares; |
172 | /* spinlock to serialize modification to shares */ | 216 | |
173 | spinlock_t lock; | ||
174 | struct rcu_head rcu; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | ||
175 | }; | 219 | }; |
176 | 220 | ||
177 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
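
A quick aside on the share-to-weight arithmetic documented in the comment above. The following is a minimal user-space sketch, not part of the patch: it only reproduces the 2-CPU example (group A with 1000 shares, 2 tasks on CPU0 and 1 task on CPU1) to show where the 1333/667 split comes from. All names in it are invented for the illustration.

#include <stdio.h>

int main(void)
{
	unsigned long shares = 1000;              /* group A's shares            */
	unsigned long nr_cpus = 2;
	unsigned long tasks_on_cpu[2] = { 2, 1 }; /* 2 tasks on CPU0, 1 on CPU1  */
	unsigned long total_weight = nr_cpus * shares; /* 2000 spread over CPUs  */
	unsigned long total_tasks = tasks_on_cpu[0] + tasks_on_cpu[1];
	unsigned long cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++)
		printf("tg_A->se[%lu]->load.weight = %lu\n", cpu,
		       total_weight * tasks_on_cpu[cpu] / total_tasks);
	/* prints 1333 and 666 (the comment above rounds the latter up to 667) */
	return 0;
}
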
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
179 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
180 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
181 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
228 | |||
182 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
183 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
184 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
234 | |||
235 | /* task_group_mutex serializes add/remove of task groups and also changes to | ||
236 | * a task group's cpu shares. | ||
237 | */ | ||
238 | static DEFINE_MUTEX(task_group_mutex); | ||
239 | |||
240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | ||
241 | static DEFINE_MUTEX(doms_cur_mutex); | ||
242 | |||
243 | #ifdef CONFIG_SMP | ||
244 | /* kernel thread that runs rebalance_shares() periodically */ | ||
245 | static struct task_struct *lb_monitor_task; | ||
246 | static int load_balance_monitor(void *unused); | ||
247 | #endif | ||
248 | |||
249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
250 | |||
185 | /* Default task group. | 251 | /* Default task group. |
186 | * Every task in system belong to this group at bootup. | 252 | * Every task in system belong to this group at bootup. |
187 | */ | 253 | */ |
188 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
189 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
190 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | |||
258 | .rt_se = init_sched_rt_entity_p, | ||
259 | .rt_rq = init_rt_rq_p, | ||
191 | }; | 260 | }; |
192 | 261 | ||
193 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
194 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
195 | #else | 264 | #else |
196 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
197 | #endif | 266 | #endif |
198 | 267 | ||
199 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | 268 | #define MIN_GROUP_SHARES 2 |
269 | |||
270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | ||
200 | 271 | ||
201 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
202 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
215 | } | 286 | } |
216 | 287 | ||
217 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
218 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
219 | { | 290 | { |
220 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | 291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; |
221 | p->se.parent = task_group(p)->se[cpu]; | 292 | p->se.parent = task_group(p)->se[cpu]; |
293 | |||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
296 | } | ||
297 | |||
298 | static inline void lock_task_group_list(void) | ||
299 | { | ||
300 | mutex_lock(&task_group_mutex); | ||
301 | } | ||
302 | |||
303 | static inline void unlock_task_group_list(void) | ||
304 | { | ||
305 | mutex_unlock(&task_group_mutex); | ||
306 | } | ||
307 | |||
308 | static inline void lock_doms_cur(void) | ||
309 | { | ||
310 | mutex_lock(&doms_cur_mutex); | ||
311 | } | ||
312 | |||
313 | static inline void unlock_doms_cur(void) | ||
314 | { | ||
315 | mutex_unlock(&doms_cur_mutex); | ||
222 | } | 316 | } |
223 | 317 | ||
224 | #else | 318 | #else |
225 | 319 | ||
226 | static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
321 | static inline void lock_task_group_list(void) { } | ||
322 | static inline void unlock_task_group_list(void) { } | ||
323 | static inline void lock_doms_cur(void) { } | ||
324 | static inline void unlock_doms_cur(void) { } | ||
227 | 325 | ||
228 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
229 | 327 | ||
@@ -264,11 +362,57 @@ struct cfs_rq { | |||
264 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
265 | struct rt_rq { | 363 | struct rt_rq { |
266 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
267 | int rt_load_balance_idx; | 365 | unsigned long rt_nr_running; |
268 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
367 | int highest_prio; /* highest queued rt task prio */ | ||
368 | #endif | ||
369 | #ifdef CONFIG_SMP | ||
370 | unsigned long rt_nr_migratory; | ||
371 | int overloaded; | ||
372 | #endif | ||
373 | int rt_throttled; | ||
374 | u64 rt_time; | ||
375 | |||
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
377 | struct rq *rq; | ||
378 | struct list_head leaf_rt_rq_list; | ||
379 | struct task_group *tg; | ||
380 | struct sched_rt_entity *rt_se; | ||
381 | #endif | ||
382 | }; | ||
383 | |||
384 | #ifdef CONFIG_SMP | ||
385 | |||
386 | /* | ||
387 | * We add the notion of a root-domain which will be used to define per-domain | ||
388 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
390 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
391 | * object. | ||
392 | * | ||
393 | */ | ||
394 | struct root_domain { | ||
395 | atomic_t refcount; | ||
396 | cpumask_t span; | ||
397 | cpumask_t online; | ||
398 | |||
399 | /* | ||
400 | * The "RT overload" flag: it gets set if a CPU has more than | ||
401 | * one runnable RT task. | ||
402 | */ | ||
403 | cpumask_t rto_mask; | ||
404 | atomic_t rto_count; | ||
269 | }; | 405 | }; |
270 | 406 | ||
271 | /* | 407 | /* |
408 | * By default the system creates a single root-domain with all cpus as | ||
409 | * members (mimicking the global state we have today). | ||
410 | */ | ||
411 | static struct root_domain def_root_domain; | ||
412 | |||
413 | #endif | ||
414 | |||
415 | /* | ||
272 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
273 | * | 417 | * |
274 | * Locking rule: those places that want to lock multiple runqueues | 418 | * Locking rule: those places that want to lock multiple runqueues |
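
An editorial aside on the rto_mask/rto_count pair introduced above: they record which CPUs in the root domain currently have more than one runnable RT task, so the RT balancing code knows where pushable tasks live. The real accounting lives in sched_rt.c and is not shown in this diff; below is only a toy user-space sketch of that bookkeeping, with cpumask_t and the atomics replaced by a plain bitmask and an int for brevity.

#include <stdio.h>

static unsigned long rto_mask;	/* stand-in for the cpumask_t          */
static int rto_count;		/* how many CPUs are RT-overloaded     */

static void set_overload(int cpu)
{
	if (!(rto_mask & (1UL << cpu))) {
		rto_mask |= 1UL << cpu;
		rto_count++;
	}
}

static void clear_overload(int cpu)
{
	if (rto_mask & (1UL << cpu)) {
		rto_mask &= ~(1UL << cpu);
		rto_count--;
	}
}

int main(void)
{
	set_overload(2);	/* CPU 2 picked up a second runnable RT task */
	set_overload(5);
	clear_overload(2);
	printf("overloaded CPUs: %d (mask %#lx)\n", rto_count, rto_mask);
	return 0;
}
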
@@ -296,11 +440,15 @@ struct rq { | |||
296 | u64 nr_switches; | 440 | u64 nr_switches; |
297 | 441 | ||
298 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | ||
444 | u64 rt_period_expire; | ||
445 | int rt_throttled; | ||
446 | |||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
300 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
301 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
450 | struct list_head leaf_rt_rq_list; | ||
302 | #endif | 451 | #endif |
303 | struct rt_rq rt; | ||
304 | 452 | ||
305 | /* | 453 | /* |
306 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
@@ -317,7 +465,7 @@ struct rq { | |||
317 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
318 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
319 | 467 | ||
320 | unsigned int clock_warps, clock_overflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
321 | u64 idle_clock; | 469 | u64 idle_clock; |
322 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
323 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
@@ -325,6 +473,7 @@ struct rq { | |||
325 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
326 | 474 | ||
327 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
476 | struct root_domain *rd; | ||
328 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
329 | 478 | ||
330 | /* For active balancing */ | 479 | /* For active balancing */ |
@@ -337,6 +486,12 @@ struct rq { | |||
337 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
338 | #endif | 487 | #endif |
339 | 488 | ||
489 | #ifdef CONFIG_SCHED_HRTICK | ||
490 | unsigned long hrtick_flags; | ||
491 | ktime_t hrtick_expire; | ||
492 | struct hrtimer hrtick_timer; | ||
493 | #endif | ||
494 | |||
340 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
341 | /* latency stats */ | 496 | /* latency stats */ |
342 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
@@ -363,7 +518,6 @@ struct rq { | |||
363 | }; | 518 | }; |
364 | 519 | ||
365 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
366 | static DEFINE_MUTEX(sched_hotcpu_mutex); | ||
367 | 521 | ||
368 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
369 | { | 523 | { |
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq) | |||
441 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
442 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
443 | 597 | ||
598 | unsigned long rt_needs_cpu(int cpu) | ||
599 | { | ||
600 | struct rq *rq = cpu_rq(cpu); | ||
601 | u64 delta; | ||
602 | |||
603 | if (!rq->rt_throttled) | ||
604 | return 0; | ||
605 | |||
606 | if (rq->clock > rq->rt_period_expire) | ||
607 | return 1; | ||
608 | |||
609 | delta = rq->rt_period_expire - rq->clock; | ||
610 | do_div(delta, NSEC_PER_SEC / HZ); | ||
611 | |||
612 | return (unsigned long)delta; | ||
613 | } | ||
614 | |||
444 | /* | 615 | /* |
445 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
446 | */ | 617 | */ |
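
A short aside on rt_needs_cpu() in the hunk above: it returns 0 when the runqueue is not RT-throttled, 1 when the RT period has already expired, and otherwise the number of jiffies (rounded down) left until rt_period_expire. A user-space sketch of just the ns-to-jiffies conversion follows; it is not part of the patch, and HZ is assumed to be 1000 purely for the example.

#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		1000		/* assumption for this sketch */

int main(void)
{
	unsigned long long clock = 5 * NSEC_PER_SEC;            /* "now"         */
	unsigned long long rt_period_expire = clock + 42500000; /* 42.5 ms ahead */
	unsigned long long delta = rt_period_expire - clock;

	delta /= NSEC_PER_SEC / HZ;	/* the kernel version uses do_div() here */
	printf("%llu jiffies until the RT period expires\n", delta);	/* 42 */
	return 0;
}
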
@@ -459,6 +630,8 @@ enum { | |||
459 | SCHED_FEAT_START_DEBIT = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
460 | SCHED_FEAT_TREE_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
461 | SCHED_FEAT_APPROX_AVG = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
633 | SCHED_FEAT_HRTICK = 32, | ||
634 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
462 | }; | 635 | }; |
463 | 636 | ||
464 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = | |||
466 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | |
467 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
468 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
469 | SCHED_FEAT_APPROX_AVG * 0; | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
643 | SCHED_FEAT_HRTICK * 1 | | ||
644 | SCHED_FEAT_DOUBLE_TICK * 0; | ||
470 | 645 | ||
471 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
472 | 647 | ||
@@ -477,6 +652,21 @@ const_debug unsigned int sysctl_sched_features = | |||
477 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
478 | 653 | ||
479 | /* | 654 | /* |
655 | * period over which we measure -rt task cpu usage in ms. | ||
656 | * default: 1s | ||
657 | */ | ||
658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | ||
659 | |||
660 | #define SCHED_RT_FRAC_SHIFT 16 | ||
661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | ||
662 | |||
663 | /* | ||
664 | * ratio of time -rt tasks may consume. | ||
665 | * default: 95% | ||
666 | */ | ||
667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | ||
668 | |||
669 | /* | ||
480 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
481 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
482 | */ | 672 | */ |
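
An aside on the default above: sysctl_sched_rt_ratio is a fixed-point fraction with SCHED_RT_FRAC_SHIFT = 16, so 62259 / 65536 is roughly 0.95, i.e. RT tasks may consume roughly 95% of each sysctl_sched_rt_period. A minimal sketch of that arithmetic (editorial illustration only, not the kernel's throttling code):

#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT	16
#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)

int main(void)
{
	unsigned int rt_period_ms = 1000;	/* sysctl_sched_rt_period */
	unsigned int rt_ratio = 62259;		/* sysctl_sched_rt_ratio  */
	unsigned long budget_ms =
		((unsigned long)rt_period_ms * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

	printf("rt budget: %lu ms of every %u ms (%.2f%%)\n",
	       budget_ms, rt_period_ms, 100.0 * rt_ratio / SCHED_RT_FRAC);
	/* prints: rt budget: 949 ms of every 1000 ms (95.00%) */
	return 0;
}
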
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
668 | struct rq *rq = cpu_rq(smp_processor_id()); | 858 | struct rq *rq = cpu_rq(smp_processor_id()); |
669 | u64 now = sched_clock(); | 859 | u64 now = sched_clock(); |
670 | 860 | ||
671 | touch_softlockup_watchdog(); | ||
672 | rq->idle_clock += delta_ns; | 861 | rq->idle_clock += delta_ns; |
673 | /* | 862 | /* |
674 | * Override the previous timestamp and ignore all | 863 | * Override the previous timestamp and ignore all |
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
680 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
681 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
682 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
872 | touch_softlockup_watchdog(); | ||
683 | } | 873 | } |
684 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
685 | 875 | ||
876 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
877 | |||
878 | static inline void resched_task(struct task_struct *p) | ||
879 | { | ||
880 | __resched_task(p, TIF_NEED_RESCHED); | ||
881 | } | ||
882 | |||
883 | #ifdef CONFIG_SCHED_HRTICK | ||
884 | /* | ||
885 | * Use HR-timers to deliver accurate preemption points. | ||
886 | * | ||
887 | * It's all a bit involved since we cannot program an hrtimer while holding | ||
888 | * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a | ||
889 | * reschedule event. | ||
890 | * | ||
891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
892 | * rq->lock. | ||
893 | */ | ||
894 | static inline void resched_hrt(struct task_struct *p) | ||
895 | { | ||
896 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
897 | } | ||
898 | |||
899 | static inline void resched_rq(struct rq *rq) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | |||
903 | spin_lock_irqsave(&rq->lock, flags); | ||
904 | resched_task(rq->curr); | ||
905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
906 | } | ||
907 | |||
908 | enum { | ||
909 | HRTICK_SET, /* re-program hrtick_timer */ | ||
910 | HRTICK_RESET, /* not a new slice */ | ||
911 | }; | ||
912 | |||
913 | /* | ||
914 | * Use hrtick when: | ||
915 | * - enabled by features | ||
916 | * - hrtimer is actually high res | ||
917 | */ | ||
918 | static inline int hrtick_enabled(struct rq *rq) | ||
919 | { | ||
920 | if (!sched_feat(HRTICK)) | ||
921 | return 0; | ||
922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Called to set the hrtick timer state. | ||
927 | * | ||
928 | * called with rq->lock held and irqs disabled | ||
929 | */ | ||
930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
931 | { | ||
932 | assert_spin_locked(&rq->lock); | ||
933 | |||
934 | /* | ||
935 | * preempt at: now + delay | ||
936 | */ | ||
937 | rq->hrtick_expire = | ||
938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
939 | /* | ||
940 | * indicate we need to program the timer | ||
941 | */ | ||
942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
943 | if (reset) | ||
944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
945 | |||
946 | /* | ||
947 | * New slices are called from the schedule path and don't need a | ||
948 | * forced reschedule. | ||
949 | */ | ||
950 | if (reset) | ||
951 | resched_hrt(rq->curr); | ||
952 | } | ||
953 | |||
954 | static void hrtick_clear(struct rq *rq) | ||
955 | { | ||
956 | if (hrtimer_active(&rq->hrtick_timer)) | ||
957 | hrtimer_cancel(&rq->hrtick_timer); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Update the timer from the possible pending state. | ||
962 | */ | ||
963 | static void hrtick_set(struct rq *rq) | ||
964 | { | ||
965 | ktime_t time; | ||
966 | int set, reset; | ||
967 | unsigned long flags; | ||
968 | |||
969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
970 | |||
971 | spin_lock_irqsave(&rq->lock, flags); | ||
972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
974 | time = rq->hrtick_expire; | ||
975 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
976 | spin_unlock_irqrestore(&rq->lock, flags); | ||
977 | |||
978 | if (set) { | ||
979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
981 | resched_rq(rq); | ||
982 | } else | ||
983 | hrtick_clear(rq); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * High-resolution timer tick. | ||
988 | * Runs from hardirq context with interrupts disabled. | ||
989 | */ | ||
990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | ||
991 | { | ||
992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | ||
993 | |||
994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
995 | |||
996 | spin_lock(&rq->lock); | ||
997 | __update_rq_clock(rq); | ||
998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | ||
999 | spin_unlock(&rq->lock); | ||
1000 | |||
1001 | return HRTIMER_NORESTART; | ||
1002 | } | ||
1003 | |||
1004 | static inline void init_rq_hrtick(struct rq *rq) | ||
1005 | { | ||
1006 | rq->hrtick_flags = 0; | ||
1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1008 | rq->hrtick_timer.function = hrtick; | ||
1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
1010 | } | ||
1011 | |||
1012 | void hrtick_resched(void) | ||
1013 | { | ||
1014 | struct rq *rq; | ||
1015 | unsigned long flags; | ||
1016 | |||
1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | ||
1018 | return; | ||
1019 | |||
1020 | local_irq_save(flags); | ||
1021 | rq = cpu_rq(smp_processor_id()); | ||
1022 | hrtick_set(rq); | ||
1023 | local_irq_restore(flags); | ||
1024 | } | ||
1025 | #else | ||
1026 | static inline void hrtick_clear(struct rq *rq) | ||
1027 | { | ||
1028 | } | ||
1029 | |||
1030 | static inline void hrtick_set(struct rq *rq) | ||
1031 | { | ||
1032 | } | ||
1033 | |||
1034 | static inline void init_rq_hrtick(struct rq *rq) | ||
1035 | { | ||
1036 | } | ||
1037 | |||
1038 | void hrtick_resched(void) | ||
1039 | { | ||
1040 | } | ||
1041 | #endif | ||
1042 | |||
686 | /* | 1043 | /* |
687 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
688 | * | 1045 | * |
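
A brief aside on the hrtick plumbing above: the request is recorded in rq->hrtick_flags and rq->hrtick_expire while rq->lock is held, and the hrtimer itself is only (re)armed later from hrtick_set(), after the lock has been dropped. A toy user-space sketch of that "record under the lock, program the timer afterwards" pattern follows; it is purely illustrative, with a pthread mutex and a printf standing in for the real rq lock and hrtimer.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static int tick_pending;		/* plays the role of HRTICK_SET          */
static unsigned long long tick_expire;	/* plays the role of rq->hrtick_expire   */

/* called with rq_lock held: only record the request */
static void hrtick_start_sketch(unsigned long long now, unsigned long long delay)
{
	tick_pending = 1;
	tick_expire = now + delay;
}

/* called later: take and drop the lock, then arm the (pretend) timer */
static void hrtick_set_sketch(void)
{
	unsigned long long expire;
	int pending;

	pthread_mutex_lock(&rq_lock);
	pending = tick_pending;
	tick_pending = 0;
	expire = tick_expire;
	pthread_mutex_unlock(&rq_lock);

	if (pending)
		printf("arming preemption tick at t=%llu\n", expire);
}

int main(void)
{
	pthread_mutex_lock(&rq_lock);
	hrtick_start_sketch(100, 42);	/* scheduler path, lock held          */
	pthread_mutex_unlock(&rq_lock);

	hrtick_set_sketch();		/* lock dropped: safe to program timer */
	return 0;
}
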
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
696 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
697 | #endif | 1054 | #endif |
698 | 1055 | ||
699 | static void resched_task(struct task_struct *p) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
700 | { | 1057 | { |
701 | int cpu; | 1058 | int cpu; |
702 | 1059 | ||
703 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
704 | 1061 | ||
705 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
706 | return; | 1063 | return; |
707 | 1064 | ||
708 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1065 | set_tsk_thread_flag(p, tif_bit); |
709 | 1066 | ||
710 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
711 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu) | |||
728 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
729 | } | 1086 | } |
730 | #else | 1087 | #else |
731 | static inline void resched_task(struct task_struct *p) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
732 | { | 1089 | { |
733 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
734 | set_tsk_need_resched(p); | 1091 | set_tsk_thread_flag(p, tif_bit); |
735 | } | 1092 | } |
736 | #endif | 1093 | #endif |
737 | 1094 | ||
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
871 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
872 | #endif | 1229 | #endif |
873 | 1230 | ||
1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1232 | { | ||
1233 | update_load_add(&rq->load, load); | ||
1234 | } | ||
1235 | |||
1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1237 | { | ||
1238 | update_load_sub(&rq->load, load); | ||
1239 | } | ||
1240 | |||
1241 | #ifdef CONFIG_SMP | ||
1242 | static unsigned long source_load(int cpu, int type); | ||
1243 | static unsigned long target_load(int cpu, int type); | ||
1244 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1246 | #endif /* CONFIG_SMP */ | ||
1247 | |||
874 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
875 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
876 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | |||
881 | 1255 | ||
882 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
883 | 1257 | ||
884 | /* | ||
885 | * Update delta_exec, delta_fair fields for rq. | ||
886 | * | ||
887 | * delta_fair clock advances at a rate inversely proportional to | ||
888 | * total load (rq->load.weight) on the runqueue, while | ||
889 | * delta_exec advances at the same rate as wall-clock (provided | ||
890 | * cpu is not idle). | ||
891 | * | ||
892 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
893 | * runqueue over any given interval. This (smoothened) load is used | ||
894 | * during load balance. | ||
895 | * | ||
896 | * This function is called /before/ updating rq->load | ||
897 | * and when switching tasks. | ||
898 | */ | ||
899 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
900 | { | ||
901 | update_load_add(&rq->load, p->se.load.weight); | ||
902 | } | ||
903 | |||
904 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
905 | { | ||
906 | update_load_sub(&rq->load, p->se.load.weight); | ||
907 | } | ||
908 | |||
909 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | 1258 | static void inc_nr_running(struct task_struct *p, struct rq *rq) |
910 | { | 1259 | { |
911 | rq->nr_running++; | 1260 | rq->nr_running++; |
912 | inc_load(rq, p); | ||
913 | } | 1261 | } |
914 | 1262 | ||
915 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct task_struct *p, struct rq *rq) |
916 | { | 1264 | { |
917 | rq->nr_running--; | 1265 | rq->nr_running--; |
918 | dec_load(rq, p); | ||
919 | } | 1266 | } |
920 | 1267 | ||
921 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) | |||
1039 | 1386 | ||
1040 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1041 | { | 1388 | { |
1042 | set_task_cfs_rq(p, cpu); | 1389 | set_task_rq(p, cpu); |
1043 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
1044 | /* | 1391 | /* |
1045 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | 1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be |
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1051 | #endif | 1398 | #endif |
1052 | } | 1399 | } |
1053 | 1400 | ||
1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | ||
1402 | const struct sched_class *prev_class, | ||
1403 | int oldprio, int running) | ||
1404 | { | ||
1405 | if (prev_class != p->sched_class) { | ||
1406 | if (prev_class->switched_from) | ||
1407 | prev_class->switched_from(rq, p, running); | ||
1408 | p->sched_class->switched_to(rq, p, running); | ||
1409 | } else | ||
1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | ||
1411 | } | ||
1412 | |||
1054 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
1055 | 1414 | ||
1056 | /* | 1415 | /* |
1057 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
1058 | */ | 1417 | */ |
1059 | static inline int | 1418 | static int |
1060 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1061 | { | 1420 | { |
1062 | s64 delta; | 1421 | s64 delta; |
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type) | |||
1281 | /* | 1640 | /* |
1282 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
1283 | */ | 1642 | */ |
1284 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
1285 | { | 1644 | { |
1286 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
1287 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag) | |||
1438 | 1797 | ||
1439 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
1440 | 1799 | ||
1441 | /* | ||
1442 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1443 | * not idle and an idle cpu is available. The span of cpus to | ||
1444 | * search starts with cpus closest then further out as needed, | ||
1445 | * so we always favor a closer, idle cpu. | ||
1446 | * | ||
1447 | * Returns the CPU we should wake onto. | ||
1448 | */ | ||
1449 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1450 | static int wake_idle(int cpu, struct task_struct *p) | ||
1451 | { | ||
1452 | cpumask_t tmp; | ||
1453 | struct sched_domain *sd; | ||
1454 | int i; | ||
1455 | |||
1456 | /* | ||
1457 | * If it is idle, then it is the best cpu to run this task. | ||
1458 | * | ||
1459 | * This cpu is also the best, if it has more than one task already. | ||
1460 | * Siblings must be also busy(in most cases) as they didn't already | ||
1461 | * pickup the extra load from this cpu and hence we need not check | ||
1462 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1463 | * penalities associated with that. | ||
1464 | */ | ||
1465 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1466 | return cpu; | ||
1467 | |||
1468 | for_each_domain(cpu, sd) { | ||
1469 | if (sd->flags & SD_WAKE_IDLE) { | ||
1470 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
1471 | for_each_cpu_mask(i, tmp) { | ||
1472 | if (idle_cpu(i)) { | ||
1473 | if (i != task_cpu(p)) { | ||
1474 | schedstat_inc(p, | ||
1475 | se.nr_wakeups_idle); | ||
1476 | } | ||
1477 | return i; | ||
1478 | } | ||
1479 | } | ||
1480 | } else { | ||
1481 | break; | ||
1482 | } | ||
1483 | } | ||
1484 | return cpu; | ||
1485 | } | ||
1486 | #else | ||
1487 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1488 | { | ||
1489 | return cpu; | ||
1490 | } | ||
1491 | #endif | ||
1492 | |||
1493 | /*** | 1800 | /*** |
1494 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
1495 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1510 | unsigned long flags; | 1817 | unsigned long flags; |
1511 | long old_state; | 1818 | long old_state; |
1512 | struct rq *rq; | 1819 | struct rq *rq; |
1513 | #ifdef CONFIG_SMP | ||
1514 | struct sched_domain *sd, *this_sd = NULL; | ||
1515 | unsigned long load, this_load; | ||
1516 | int new_cpu; | ||
1517 | #endif | ||
1518 | 1820 | ||
1519 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
1520 | old_state = p->state; | 1822 | old_state = p->state; |
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1532 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
1533 | goto out_activate; | 1835 | goto out_activate; |
1534 | 1836 | ||
1535 | new_cpu = cpu; | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
1536 | 1838 | if (cpu != orig_cpu) { | |
1537 | schedstat_inc(rq, ttwu_count); | 1839 | set_task_cpu(p, cpu); |
1538 | if (cpu == this_cpu) { | ||
1539 | schedstat_inc(rq, ttwu_local); | ||
1540 | goto out_set_cpu; | ||
1541 | } | ||
1542 | |||
1543 | for_each_domain(this_cpu, sd) { | ||
1544 | if (cpu_isset(cpu, sd->span)) { | ||
1545 | schedstat_inc(sd, ttwu_wake_remote); | ||
1546 | this_sd = sd; | ||
1547 | break; | ||
1548 | } | ||
1549 | } | ||
1550 | |||
1551 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1552 | goto out_set_cpu; | ||
1553 | |||
1554 | /* | ||
1555 | * Check for affine wakeup and passive balancing possibilities. | ||
1556 | */ | ||
1557 | if (this_sd) { | ||
1558 | int idx = this_sd->wake_idx; | ||
1559 | unsigned int imbalance; | ||
1560 | |||
1561 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1562 | |||
1563 | load = source_load(cpu, idx); | ||
1564 | this_load = target_load(this_cpu, idx); | ||
1565 | |||
1566 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1567 | |||
1568 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1569 | unsigned long tl = this_load; | ||
1570 | unsigned long tl_per_task; | ||
1571 | |||
1572 | /* | ||
1573 | * Attract cache-cold tasks on sync wakeups: | ||
1574 | */ | ||
1575 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1576 | goto out_set_cpu; | ||
1577 | |||
1578 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1579 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1580 | |||
1581 | /* | ||
1582 | * If sync wakeup then subtract the (maximum possible) | ||
1583 | * effect of the currently running task from the load | ||
1584 | * of the current CPU: | ||
1585 | */ | ||
1586 | if (sync) | ||
1587 | tl -= current->se.load.weight; | ||
1588 | |||
1589 | if ((tl <= load && | ||
1590 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1591 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1592 | /* | ||
1593 | * This domain has SD_WAKE_AFFINE and | ||
1594 | * p is cache cold in this domain, and | ||
1595 | * there is no bad imbalance. | ||
1596 | */ | ||
1597 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1598 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1599 | goto out_set_cpu; | ||
1600 | } | ||
1601 | } | ||
1602 | |||
1603 | /* | ||
1604 | * Start passive balancing when half the imbalance_pct | ||
1605 | * limit is reached. | ||
1606 | */ | ||
1607 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1608 | if (imbalance*this_load <= 100*load) { | ||
1609 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1610 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1611 | goto out_set_cpu; | ||
1612 | } | ||
1613 | } | ||
1614 | } | ||
1615 | |||
1616 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1617 | out_set_cpu: | ||
1618 | new_cpu = wake_idle(new_cpu, p); | ||
1619 | if (new_cpu != cpu) { | ||
1620 | set_task_cpu(p, new_cpu); | ||
1621 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
1622 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
1623 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
@@ -1631,6 +1850,21 @@ out_set_cpu: | |||
1631 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
1632 | } | 1851 | } |
1633 | 1852 | ||
1853 | #ifdef CONFIG_SCHEDSTATS | ||
1854 | schedstat_inc(rq, ttwu_count); | ||
1855 | if (cpu == this_cpu) | ||
1856 | schedstat_inc(rq, ttwu_local); | ||
1857 | else { | ||
1858 | struct sched_domain *sd; | ||
1859 | for_each_domain(this_cpu, sd) { | ||
1860 | if (cpu_isset(cpu, sd->span)) { | ||
1861 | schedstat_inc(sd, ttwu_wake_remote); | ||
1862 | break; | ||
1863 | } | ||
1864 | } | ||
1865 | } | ||
1866 | #endif | ||
1867 | |||
1634 | out_activate: | 1868 | out_activate: |
1635 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1636 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
@@ -1649,6 +1883,10 @@ out_activate: | |||
1649 | 1883 | ||
1650 | out_running: | 1884 | out_running: |
1651 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
1886 | #ifdef CONFIG_SMP | ||
1887 | if (p->sched_class->task_wake_up) | ||
1888 | p->sched_class->task_wake_up(rq, p); | ||
1889 | #endif | ||
1652 | out: | 1890 | out: |
1653 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
1654 | 1892 | ||
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p) | |||
1691 | p->se.wait_max = 0; | 1929 | p->se.wait_max = 0; |
1692 | #endif | 1930 | #endif |
1693 | 1931 | ||
1694 | INIT_LIST_HEAD(&p->run_list); | 1932 | INIT_LIST_HEAD(&p->rt.run_list); |
1695 | p->se.on_rq = 0; | 1933 | p->se.on_rq = 0; |
1696 | 1934 | ||
1697 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1935 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1771 | inc_nr_running(p, rq); | 2009 | inc_nr_running(p, rq); |
1772 | } | 2010 | } |
1773 | check_preempt_curr(rq, p); | 2011 | check_preempt_curr(rq, p); |
2012 | #ifdef CONFIG_SMP | ||
2013 | if (p->sched_class->task_wake_up) | ||
2014 | p->sched_class->task_wake_up(rq, p); | ||
2015 | #endif | ||
1774 | task_rq_unlock(rq, &flags); | 2016 | task_rq_unlock(rq, &flags); |
1775 | } | 2017 | } |
1776 | 2018 | ||
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1891 | prev_state = prev->state; | 2133 | prev_state = prev->state; |
1892 | finish_arch_switch(prev); | 2134 | finish_arch_switch(prev); |
1893 | finish_lock_switch(rq, prev); | 2135 | finish_lock_switch(rq, prev); |
2136 | #ifdef CONFIG_SMP | ||
2137 | if (current->sched_class->post_schedule) | ||
2138 | current->sched_class->post_schedule(rq); | ||
2139 | #endif | ||
2140 | |||
1894 | fire_sched_in_preempt_notifiers(current); | 2141 | fire_sched_in_preempt_notifiers(current); |
1895 | if (mm) | 2142 | if (mm) |
1896 | mmdrop(mm); | 2143 | mmdrop(mm); |
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
2124 | /* | 2371 | /* |
2125 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2372 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2126 | */ | 2373 | */ |
2127 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2374 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2128 | __releases(this_rq->lock) | 2375 | __releases(this_rq->lock) |
2129 | __acquires(busiest->lock) | 2376 | __acquires(busiest->lock) |
2130 | __acquires(this_rq->lock) | 2377 | __acquires(this_rq->lock) |
2131 | { | 2378 | { |
2379 | int ret = 0; | ||
2380 | |||
2132 | if (unlikely(!irqs_disabled())) { | 2381 | if (unlikely(!irqs_disabled())) { |
2133 | /* printk() doesn't work good under rq->lock */ | 2382 | /* printk() doesn't work good under rq->lock */ |
2134 | spin_unlock(&this_rq->lock); | 2383 | spin_unlock(&this_rq->lock); |
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2139 | spin_unlock(&this_rq->lock); | 2388 | spin_unlock(&this_rq->lock); |
2140 | spin_lock(&busiest->lock); | 2389 | spin_lock(&busiest->lock); |
2141 | spin_lock(&this_rq->lock); | 2390 | spin_lock(&this_rq->lock); |
2391 | ret = 1; | ||
2142 | } else | 2392 | } else |
2143 | spin_lock(&busiest->lock); | 2393 | spin_lock(&busiest->lock); |
2144 | } | 2394 | } |
2395 | return ret; | ||
2145 | } | 2396 | } |
2146 | 2397 | ||
2147 | /* | 2398 | /* |
@@ -3485,12 +3736,14 @@ void scheduler_tick(void) | |||
3485 | /* | 3736 | /* |
3486 | * Let rq->clock advance by at least TICK_NSEC: | 3737 | * Let rq->clock advance by at least TICK_NSEC: |
3487 | */ | 3738 | */ |
3488 | if (unlikely(rq->clock < next_tick)) | 3739 | if (unlikely(rq->clock < next_tick)) { |
3489 | rq->clock = next_tick; | 3740 | rq->clock = next_tick; |
3741 | rq->clock_underflows++; | ||
3742 | } | ||
3490 | rq->tick_timestamp = rq->clock; | 3743 | rq->tick_timestamp = rq->clock; |
3491 | update_cpu_load(rq); | 3744 | update_cpu_load(rq); |
3492 | if (curr != rq->idle) /* FIXME: needed? */ | 3745 | curr->sched_class->task_tick(rq, curr, 0); |
3493 | curr->sched_class->task_tick(rq, curr); | 3746 | update_sched_rt_period(rq); |
3494 | spin_unlock(&rq->lock); | 3747 | spin_unlock(&rq->lock); |
3495 | 3748 | ||
3496 | #ifdef CONFIG_SMP | 3749 | #ifdef CONFIG_SMP |
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible: | |||
3636 | 3889 | ||
3637 | schedule_debug(prev); | 3890 | schedule_debug(prev); |
3638 | 3891 | ||
3892 | hrtick_clear(rq); | ||
3893 | |||
3639 | /* | 3894 | /* |
3640 | * Do the rq-clock update outside the rq lock: | 3895 | * Do the rq-clock update outside the rq lock: |
3641 | */ | 3896 | */ |
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible: | |||
3654 | switch_count = &prev->nvcsw; | 3909 | switch_count = &prev->nvcsw; |
3655 | } | 3910 | } |
3656 | 3911 | ||
3912 | #ifdef CONFIG_SMP | ||
3913 | if (prev->sched_class->pre_schedule) | ||
3914 | prev->sched_class->pre_schedule(rq, prev); | ||
3915 | #endif | ||
3916 | |||
3657 | if (unlikely(!rq->nr_running)) | 3917 | if (unlikely(!rq->nr_running)) |
3658 | idle_balance(cpu, rq); | 3918 | idle_balance(cpu, rq); |
3659 | 3919 | ||
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible: | |||
3668 | ++*switch_count; | 3928 | ++*switch_count; |
3669 | 3929 | ||
3670 | context_switch(rq, prev, next); /* unlocks the rq */ | 3930 | context_switch(rq, prev, next); /* unlocks the rq */ |
3931 | /* | ||
3932 | * the context switch might have flipped the stack from under | ||
3933 | * us, hence refresh the local variables. | ||
3934 | */ | ||
3935 | cpu = smp_processor_id(); | ||
3936 | rq = cpu_rq(cpu); | ||
3671 | } else | 3937 | } else |
3672 | spin_unlock_irq(&rq->lock); | 3938 | spin_unlock_irq(&rq->lock); |
3673 | 3939 | ||
3674 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3940 | hrtick_set(rq); |
3675 | cpu = smp_processor_id(); | 3941 | |
3676 | rq = cpu_rq(cpu); | 3942 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
3677 | goto need_resched_nonpreemptible; | 3943 | goto need_resched_nonpreemptible; |
3678 | } | 3944 | |
3679 | preempt_enable_no_resched(); | 3945 | preempt_enable_no_resched(); |
3680 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3946 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3681 | goto need_resched; | 3947 | goto need_resched; |
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule); | |||
3691 | asmlinkage void __sched preempt_schedule(void) | 3957 | asmlinkage void __sched preempt_schedule(void) |
3692 | { | 3958 | { |
3693 | struct thread_info *ti = current_thread_info(); | 3959 | struct thread_info *ti = current_thread_info(); |
3694 | #ifdef CONFIG_PREEMPT_BKL | ||
3695 | struct task_struct *task = current; | 3960 | struct task_struct *task = current; |
3696 | int saved_lock_depth; | 3961 | int saved_lock_depth; |
3697 | #endif | 3962 | |
3698 | /* | 3963 | /* |
3699 | * If there is a non-zero preempt_count or interrupts are disabled, | 3964 | * If there is a non-zero preempt_count or interrupts are disabled, |
3700 | * we do not want to preempt the current task. Just return.. | 3965 | * we do not want to preempt the current task. Just return.. |
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void) | |||
3710 | * clear ->lock_depth so that schedule() doesnt | 3975 | * clear ->lock_depth so that schedule() doesnt |
3711 | * auto-release the semaphore: | 3976 | * auto-release the semaphore: |
3712 | */ | 3977 | */ |
3713 | #ifdef CONFIG_PREEMPT_BKL | ||
3714 | saved_lock_depth = task->lock_depth; | 3978 | saved_lock_depth = task->lock_depth; |
3715 | task->lock_depth = -1; | 3979 | task->lock_depth = -1; |
3716 | #endif | ||
3717 | schedule(); | 3980 | schedule(); |
3718 | #ifdef CONFIG_PREEMPT_BKL | ||
3719 | task->lock_depth = saved_lock_depth; | 3981 | task->lock_depth = saved_lock_depth; |
3720 | #endif | ||
3721 | sub_preempt_count(PREEMPT_ACTIVE); | 3982 | sub_preempt_count(PREEMPT_ACTIVE); |
3722 | 3983 | ||
3723 | /* | 3984 | /* |
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3738 | asmlinkage void __sched preempt_schedule_irq(void) | 3999 | asmlinkage void __sched preempt_schedule_irq(void) |
3739 | { | 4000 | { |
3740 | struct thread_info *ti = current_thread_info(); | 4001 | struct thread_info *ti = current_thread_info(); |
3741 | #ifdef CONFIG_PREEMPT_BKL | ||
3742 | struct task_struct *task = current; | 4002 | struct task_struct *task = current; |
3743 | int saved_lock_depth; | 4003 | int saved_lock_depth; |
3744 | #endif | 4004 | |
3745 | /* Catch callers which need to be fixed */ | 4005 | /* Catch callers which need to be fixed */ |
3746 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4006 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3747 | 4007 | ||
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3753 | * clear ->lock_depth so that schedule() doesnt | 4013 | * clear ->lock_depth so that schedule() doesnt |
3754 | * auto-release the semaphore: | 4014 | * auto-release the semaphore: |
3755 | */ | 4015 | */ |
3756 | #ifdef CONFIG_PREEMPT_BKL | ||
3757 | saved_lock_depth = task->lock_depth; | 4016 | saved_lock_depth = task->lock_depth; |
3758 | task->lock_depth = -1; | 4017 | task->lock_depth = -1; |
3759 | #endif | ||
3760 | local_irq_enable(); | 4018 | local_irq_enable(); |
3761 | schedule(); | 4019 | schedule(); |
3762 | local_irq_disable(); | 4020 | local_irq_disable(); |
3763 | #ifdef CONFIG_PREEMPT_BKL | ||
3764 | task->lock_depth = saved_lock_depth; | 4021 | task->lock_depth = saved_lock_depth; |
3765 | #endif | ||
3766 | sub_preempt_count(PREEMPT_ACTIVE); | 4022 | sub_preempt_count(PREEMPT_ACTIVE); |
3767 | 4023 | ||
3768 | /* | 4024 | /* |
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4019 | unsigned long flags; | 4275 | unsigned long flags; |
4020 | int oldprio, on_rq, running; | 4276 | int oldprio, on_rq, running; |
4021 | struct rq *rq; | 4277 | struct rq *rq; |
4278 | const struct sched_class *prev_class = p->sched_class; | ||
4022 | 4279 | ||
4023 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4280 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4024 | 4281 | ||
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4044 | if (on_rq) { | 4301 | if (on_rq) { |
4045 | if (running) | 4302 | if (running) |
4046 | p->sched_class->set_curr_task(rq); | 4303 | p->sched_class->set_curr_task(rq); |
4304 | |||
4047 | enqueue_task(rq, p, 0); | 4305 | enqueue_task(rq, p, 0); |
4048 | /* | 4306 | |
4049 | * Reschedule if we are currently running on this runqueue and | 4307 | check_class_changed(rq, p, prev_class, oldprio, running); |
4050 | * our priority decreased, or if we are not currently running on | ||
4051 | * this runqueue and our priority is higher than the current's | ||
4052 | */ | ||
4053 | if (running) { | ||
4054 | if (p->prio > oldprio) | ||
4055 | resched_task(rq->curr); | ||
4056 | } else { | ||
4057 | check_preempt_curr(rq, p); | ||
4058 | } | ||
4059 | } | 4308 | } |
4060 | task_rq_unlock(rq, &flags); | 4309 | task_rq_unlock(rq, &flags); |
4061 | } | 4310 | } |
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4087 | goto out_unlock; | 4336 | goto out_unlock; |
4088 | } | 4337 | } |
4089 | on_rq = p->se.on_rq; | 4338 | on_rq = p->se.on_rq; |
4090 | if (on_rq) { | 4339 | if (on_rq) |
4091 | dequeue_task(rq, p, 0); | 4340 | dequeue_task(rq, p, 0); |
4092 | dec_load(rq, p); | ||
4093 | } | ||
4094 | 4341 | ||
4095 | p->static_prio = NICE_TO_PRIO(nice); | 4342 | p->static_prio = NICE_TO_PRIO(nice); |
4096 | set_load_weight(p); | 4343 | set_load_weight(p); |
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4100 | 4347 | ||
4101 | if (on_rq) { | 4348 | if (on_rq) { |
4102 | enqueue_task(rq, p, 0); | 4349 | enqueue_task(rq, p, 0); |
4103 | inc_load(rq, p); | ||
4104 | /* | 4350 | /* |
4105 | * If the task increased its priority or is running and | 4351 | * If the task increased its priority or is running and |
4106 | * lowered its priority, then reschedule its CPU: | 4352 | * lowered its priority, then reschedule its CPU: |
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
4258 | { | 4504 | { |
4259 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4505 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4260 | unsigned long flags; | 4506 | unsigned long flags; |
4507 | const struct sched_class *prev_class = p->sched_class; | ||
4261 | struct rq *rq; | 4508 | struct rq *rq; |
4262 | 4509 | ||
4263 | /* may grab non-irq protected spin_locks */ | 4510 | /* may grab non-irq protected spin_locks */ |
@@ -4351,18 +4598,10 @@ recheck: | |||
4351 | if (on_rq) { | 4598 | if (on_rq) { |
4352 | if (running) | 4599 | if (running) |
4353 | p->sched_class->set_curr_task(rq); | 4600 | p->sched_class->set_curr_task(rq); |
4601 | |||
4354 | activate_task(rq, p, 0); | 4602 | activate_task(rq, p, 0); |
4355 | /* | 4603 | |
4356 | * Reschedule if we are currently running on this runqueue and | 4604 | check_class_changed(rq, p, prev_class, oldprio, running); |
4357 | * our priority decreased, or if we are not currently running on | ||
4358 | * this runqueue and our priority is higher than the current's | ||
4359 | */ | ||
4360 | if (running) { | ||
4361 | if (p->prio > oldprio) | ||
4362 | resched_task(rq->curr); | ||
4363 | } else { | ||
4364 | check_preempt_curr(rq, p); | ||
4365 | } | ||
4366 | } | 4605 | } |
4367 | __task_rq_unlock(rq); | 4606 | __task_rq_unlock(rq); |
4368 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4607 | spin_unlock_irqrestore(&p->pi_lock, flags); |
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4490 | struct task_struct *p; | 4729 | struct task_struct *p; |
4491 | int retval; | 4730 | int retval; |
4492 | 4731 | ||
4493 | mutex_lock(&sched_hotcpu_mutex); | 4732 | get_online_cpus(); |
4494 | read_lock(&tasklist_lock); | 4733 | read_lock(&tasklist_lock); |
4495 | 4734 | ||
4496 | p = find_process_by_pid(pid); | 4735 | p = find_process_by_pid(pid); |
4497 | if (!p) { | 4736 | if (!p) { |
4498 | read_unlock(&tasklist_lock); | 4737 | read_unlock(&tasklist_lock); |
4499 | mutex_unlock(&sched_hotcpu_mutex); | 4738 | put_online_cpus(); |
4500 | return -ESRCH; | 4739 | return -ESRCH; |
4501 | } | 4740 | } |
4502 | 4741 | ||
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4536 | } | 4775 | } |
4537 | out_unlock: | 4776 | out_unlock: |
4538 | put_task_struct(p); | 4777 | put_task_struct(p); |
4539 | mutex_unlock(&sched_hotcpu_mutex); | 4778 | put_online_cpus(); |
4540 | return retval; | 4779 | return retval; |
4541 | } | 4780 | } |
4542 | 4781 | ||
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4593 | struct task_struct *p; | 4832 | struct task_struct *p; |
4594 | int retval; | 4833 | int retval; |
4595 | 4834 | ||
4596 | mutex_lock(&sched_hotcpu_mutex); | 4835 | get_online_cpus(); |
4597 | read_lock(&tasklist_lock); | 4836 | read_lock(&tasklist_lock); |
4598 | 4837 | ||
4599 | retval = -ESRCH; | 4838 | retval = -ESRCH; |
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4609 | 4848 | ||
4610 | out_unlock: | 4849 | out_unlock: |
4611 | read_unlock(&tasklist_lock); | 4850 | read_unlock(&tasklist_lock); |
4612 | mutex_unlock(&sched_hotcpu_mutex); | 4851 | put_online_cpus(); |
4613 | 4852 | ||
4614 | return retval; | 4853 | return retval; |
4615 | } | 4854 | } |
@@ -4683,7 +4922,8 @@ static void __cond_resched(void) | |||
4683 | } while (need_resched()); | 4922 | } while (need_resched()); |
4684 | } | 4923 | } |
4685 | 4924 | ||
4686 | int __sched cond_resched(void) | 4925 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
4926 | int __sched _cond_resched(void) | ||
4687 | { | 4927 | { |
4688 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4928 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4689 | system_state == SYSTEM_RUNNING) { | 4929 | system_state == SYSTEM_RUNNING) { |
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void) | |||
4692 | } | 4932 | } |
4693 | return 0; | 4933 | return 0; |
4694 | } | 4934 | } |
4695 | EXPORT_SYMBOL(cond_resched); | 4935 | EXPORT_SYMBOL(_cond_resched); |
4936 | #endif | ||
4696 | 4937 | ||
4697 | /* | 4938 | /* |
4698 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4939 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
@@ -4704,19 +4945,15 @@ EXPORT_SYMBOL(cond_resched); | |||
4704 | */ | 4945 | */ |
4705 | int cond_resched_lock(spinlock_t *lock) | 4946 | int cond_resched_lock(spinlock_t *lock) |
4706 | { | 4947 | { |
4948 | int resched = need_resched() && system_state == SYSTEM_RUNNING; | ||
4707 | int ret = 0; | 4949 | int ret = 0; |
4708 | 4950 | ||
4709 | if (need_lockbreak(lock)) { | 4951 | if (spin_needbreak(lock) || resched) { |
4710 | spin_unlock(lock); | 4952 | spin_unlock(lock); |
4711 | cpu_relax(); | 4953 | if (resched && need_resched()) |
4712 | ret = 1; | 4954 | __cond_resched(); |
4713 | spin_lock(lock); | 4955 | else |
4714 | } | 4956 | cpu_relax(); |
4715 | if (need_resched() && system_state == SYSTEM_RUNNING) { | ||
4716 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4717 | _raw_spin_unlock(lock); | ||
4718 | preempt_enable_no_resched(); | ||
4719 | __cond_resched(); | ||
4720 | ret = 1; | 4957 | ret = 1; |
4721 | spin_lock(lock); | 4958 | spin_lock(lock); |
4722 | } | 4959 | } |
@@ -4890,7 +5127,7 @@ out_unlock: | |||
4890 | 5127 | ||
4891 | static const char stat_nam[] = "RSDTtZX"; | 5128 | static const char stat_nam[] = "RSDTtZX"; |
4892 | 5129 | ||
4893 | static void show_task(struct task_struct *p) | 5130 | void sched_show_task(struct task_struct *p) |
4894 | { | 5131 | { |
4895 | unsigned long free = 0; | 5132 | unsigned long free = 0; |
4896 | unsigned state; | 5133 | unsigned state; |
@@ -4920,8 +5157,7 @@ static void show_task(struct task_struct *p) | |||
4920 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5157 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
4921 | task_pid_nr(p), task_pid_nr(p->real_parent)); | 5158 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
4922 | 5159 | ||
4923 | if (state != TASK_RUNNING) | 5160 | show_stack(p, NULL); |
4924 | show_stack(p, NULL); | ||
4925 | } | 5161 | } |
4926 | 5162 | ||
4927 | void show_state_filter(unsigned long state_filter) | 5163 | void show_state_filter(unsigned long state_filter) |
@@ -4943,7 +5179,7 @@ void show_state_filter(unsigned long state_filter) | |||
4943 | */ | 5179 | */ |
4944 | touch_nmi_watchdog(); | 5180 | touch_nmi_watchdog(); |
4945 | if (!state_filter || (p->state & state_filter)) | 5181 | if (!state_filter || (p->state & state_filter)) |
4946 | show_task(p); | 5182 | sched_show_task(p); |
4947 | } while_each_thread(g, p); | 5183 | } while_each_thread(g, p); |
4948 | 5184 | ||
4949 | touch_all_softlockup_watchdogs(); | 5185 | touch_all_softlockup_watchdogs(); |
@@ -4992,11 +5228,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4992 | spin_unlock_irqrestore(&rq->lock, flags); | 5228 | spin_unlock_irqrestore(&rq->lock, flags); |
4993 | 5229 | ||
4994 | /* Set the preempt count _outside_ the spinlocks! */ | 5230 | /* Set the preempt count _outside_ the spinlocks! */ |
4995 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
4996 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
4997 | #else | ||
4998 | task_thread_info(idle)->preempt_count = 0; | 5231 | task_thread_info(idle)->preempt_count = 0; |
4999 | #endif | 5232 | |
5000 | /* | 5233 | /* |
5001 | * The idle tasks have their own, simple scheduling class: | 5234 | * The idle tasks have their own, simple scheduling class: |
5002 | */ | 5235 | */ |
@@ -5077,7 +5310,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5077 | goto out; | 5310 | goto out; |
5078 | } | 5311 | } |
5079 | 5312 | ||
5080 | p->cpus_allowed = new_mask; | 5313 | if (p->sched_class->set_cpus_allowed) |
5314 | p->sched_class->set_cpus_allowed(p, &new_mask); | ||
5315 | else { | ||
5316 | p->cpus_allowed = new_mask; | ||
5317 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | ||
5318 | } | ||
5319 | |||
5081 | /* Can the task run on the task's current CPU? If so, we're done */ | 5320 | /* Can the task run on the task's current CPU? If so, we're done */ |
5082 | if (cpu_isset(task_cpu(p), new_mask)) | 5321 | if (cpu_isset(task_cpu(p), new_mask)) |
5083 | goto out; | 5322 | goto out; |
@@ -5569,9 +5808,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5569 | struct rq *rq; | 5808 | struct rq *rq; |
5570 | 5809 | ||
5571 | switch (action) { | 5810 | switch (action) { |
5572 | case CPU_LOCK_ACQUIRE: | ||
5573 | mutex_lock(&sched_hotcpu_mutex); | ||
5574 | break; | ||
5575 | 5811 | ||
5576 | case CPU_UP_PREPARE: | 5812 | case CPU_UP_PREPARE: |
5577 | case CPU_UP_PREPARE_FROZEN: | 5813 | case CPU_UP_PREPARE_FROZEN: |
@@ -5590,6 +5826,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5590 | case CPU_ONLINE_FROZEN: | 5826 | case CPU_ONLINE_FROZEN: |
5591 | /* Strictly unnecessary, as first user will wake it. */ | 5827 | /* Strictly unnecessary, as first user will wake it. */ |
5592 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5828 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5829 | |||
5830 | /* Update our root-domain */ | ||
5831 | rq = cpu_rq(cpu); | ||
5832 | spin_lock_irqsave(&rq->lock, flags); | ||
5833 | if (rq->rd) { | ||
5834 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5835 | cpu_set(cpu, rq->rd->online); | ||
5836 | } | ||
5837 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5593 | break; | 5838 | break; |
5594 | 5839 | ||
5595 | #ifdef CONFIG_HOTPLUG_CPU | 5840 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -5640,10 +5885,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5640 | } | 5885 | } |
5641 | spin_unlock_irq(&rq->lock); | 5886 | spin_unlock_irq(&rq->lock); |
5642 | break; | 5887 | break; |
5643 | #endif | 5888 | |
5644 | case CPU_LOCK_RELEASE: | 5889 | case CPU_DOWN_PREPARE: |
5645 | mutex_unlock(&sched_hotcpu_mutex); | 5890 | /* Update our root-domain */ |
5891 | rq = cpu_rq(cpu); | ||
5892 | spin_lock_irqsave(&rq->lock, flags); | ||
5893 | if (rq->rd) { | ||
5894 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5895 | cpu_clear(cpu, rq->rd->online); | ||
5896 | } | ||
5897 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5646 | break; | 5898 | break; |
5899 | #endif | ||
5647 | } | 5900 | } |
5648 | return NOTIFY_OK; | 5901 | return NOTIFY_OK; |
5649 | } | 5902 | } |
@@ -5831,11 +6084,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5831 | return 1; | 6084 | return 1; |
5832 | } | 6085 | } |
5833 | 6086 | ||
6087 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
6088 | { | ||
6089 | unsigned long flags; | ||
6090 | const struct sched_class *class; | ||
6091 | |||
6092 | spin_lock_irqsave(&rq->lock, flags); | ||
6093 | |||
6094 | if (rq->rd) { | ||
6095 | struct root_domain *old_rd = rq->rd; | ||
6096 | |||
6097 | for (class = sched_class_highest; class; class = class->next) { | ||
6098 | if (class->leave_domain) | ||
6099 | class->leave_domain(rq); | ||
6100 | } | ||
6101 | |||
6102 | cpu_clear(rq->cpu, old_rd->span); | ||
6103 | cpu_clear(rq->cpu, old_rd->online); | ||
6104 | |||
6105 | if (atomic_dec_and_test(&old_rd->refcount)) | ||
6106 | kfree(old_rd); | ||
6107 | } | ||
6108 | |||
6109 | atomic_inc(&rd->refcount); | ||
6110 | rq->rd = rd; | ||
6111 | |||
6112 | cpu_set(rq->cpu, rd->span); | ||
6113 | if (cpu_isset(rq->cpu, cpu_online_map)) | ||
6114 | cpu_set(rq->cpu, rd->online); | ||
6115 | |||
6116 | for (class = sched_class_highest; class; class = class->next) { | ||
6117 | if (class->join_domain) | ||
6118 | class->join_domain(rq); | ||
6119 | } | ||
6120 | |||
6121 | spin_unlock_irqrestore(&rq->lock, flags); | ||
6122 | } | ||
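rq_attach_root() moves a runqueue from its old root domain to a new one and lets the last runqueue to leave a dynamically allocated root domain free it through the refcount (the static def_root_domain is pinned by the initial count of one set below). A minimal userspace sketch of that get/put lifetime rule, with C11 atomics and malloc/free standing in for atomic_t and kfree(); the names are made up for illustration.

    #include <stdatomic.h>
    #include <stdlib.h>

    struct rootdom {
            atomic_int refcount;            /* number of attached runqueues */
    };

    static struct rootdom *rootdom_alloc(void)
    {
            return calloc(1, sizeof(struct rootdom));   /* refcount starts at 0 */
    }

    /* attach: take a reference on the new domain */
    static void rootdom_get(struct rootdom *rd)
    {
            atomic_fetch_add(&rd->refcount, 1);
    }

    /* detach: drop a reference, free on the last put (like atomic_dec_and_test) */
    static void rootdom_put(struct rootdom *rd)
    {
            if (atomic_fetch_sub(&rd->refcount, 1) == 1)
                    free(rd);
    }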
6123 | |||
6124 | static void init_rootdomain(struct root_domain *rd) | ||
6125 | { | ||
6126 | memset(rd, 0, sizeof(*rd)); | ||
6127 | |||
6128 | cpus_clear(rd->span); | ||
6129 | cpus_clear(rd->online); | ||
6130 | } | ||
6131 | |||
6132 | static void init_defrootdomain(void) | ||
6133 | { | ||
6134 | init_rootdomain(&def_root_domain); | ||
6135 | atomic_set(&def_root_domain.refcount, 1); | ||
6136 | } | ||
6137 | |||
6138 | static struct root_domain *alloc_rootdomain(void) | ||
6139 | { | ||
6140 | struct root_domain *rd; | ||
6141 | |||
6142 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
6143 | if (!rd) | ||
6144 | return NULL; | ||
6145 | |||
6146 | init_rootdomain(rd); | ||
6147 | |||
6148 | return rd; | ||
6149 | } | ||
6150 | |||
5834 | /* | 6151 | /* |
5835 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6152 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5836 | * hold the hotplug lock. | 6153 | * hold the hotplug lock. |
5837 | */ | 6154 | */ |
5838 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 6155 | static void |
6156 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
5839 | { | 6157 | { |
5840 | struct rq *rq = cpu_rq(cpu); | 6158 | struct rq *rq = cpu_rq(cpu); |
5841 | struct sched_domain *tmp; | 6159 | struct sched_domain *tmp; |
@@ -5860,6 +6178,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
5860 | 6178 | ||
5861 | sched_domain_debug(sd, cpu); | 6179 | sched_domain_debug(sd, cpu); |
5862 | 6180 | ||
6181 | rq_attach_root(rq, rd); | ||
5863 | rcu_assign_pointer(rq->sd, sd); | 6182 | rcu_assign_pointer(rq->sd, sd); |
5864 | } | 6183 | } |
5865 | 6184 | ||
@@ -6228,6 +6547,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6228 | static int build_sched_domains(const cpumask_t *cpu_map) | 6547 | static int build_sched_domains(const cpumask_t *cpu_map) |
6229 | { | 6548 | { |
6230 | int i; | 6549 | int i; |
6550 | struct root_domain *rd; | ||
6231 | #ifdef CONFIG_NUMA | 6551 | #ifdef CONFIG_NUMA |
6232 | struct sched_group **sched_group_nodes = NULL; | 6552 | struct sched_group **sched_group_nodes = NULL; |
6233 | int sd_allnodes = 0; | 6553 | int sd_allnodes = 0; |
@@ -6244,6 +6564,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6244 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6564 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6245 | #endif | 6565 | #endif |
6246 | 6566 | ||
6567 | rd = alloc_rootdomain(); | ||
6568 | if (!rd) { | ||
6569 | printk(KERN_WARNING "Cannot alloc root domain\n"); | ||
6570 | return -ENOMEM; | ||
6571 | } | ||
6572 | |||
6247 | /* | 6573 | /* |
6248 | * Set up domains for cpus specified by the cpu_map. | 6574 | * Set up domains for cpus specified by the cpu_map. |
6249 | */ | 6575 | */ |
@@ -6460,7 +6786,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6460 | #else | 6786 | #else |
6461 | sd = &per_cpu(phys_domains, i); | 6787 | sd = &per_cpu(phys_domains, i); |
6462 | #endif | 6788 | #endif |
6463 | cpu_attach_domain(sd, i); | 6789 | cpu_attach_domain(sd, rd, i); |
6464 | } | 6790 | } |
6465 | 6791 | ||
6466 | return 0; | 6792 | return 0; |
@@ -6518,7 +6844,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6518 | unregister_sched_domain_sysctl(); | 6844 | unregister_sched_domain_sysctl(); |
6519 | 6845 | ||
6520 | for_each_cpu_mask(i, *cpu_map) | 6846 | for_each_cpu_mask(i, *cpu_map) |
6521 | cpu_attach_domain(NULL, i); | 6847 | cpu_attach_domain(NULL, &def_root_domain, i); |
6522 | synchronize_sched(); | 6848 | synchronize_sched(); |
6523 | arch_destroy_sched_domains(cpu_map); | 6849 | arch_destroy_sched_domains(cpu_map); |
6524 | } | 6850 | } |
@@ -6548,6 +6874,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6548 | { | 6874 | { |
6549 | int i, j; | 6875 | int i, j; |
6550 | 6876 | ||
6877 | lock_doms_cur(); | ||
6878 | |||
6551 | /* always unregister in case we don't destroy any domains */ | 6879 | /* always unregister in case we don't destroy any domains */ |
6552 | unregister_sched_domain_sysctl(); | 6880 | unregister_sched_domain_sysctl(); |
6553 | 6881 | ||
@@ -6588,6 +6916,8 @@ match2: | |||
6588 | ndoms_cur = ndoms_new; | 6916 | ndoms_cur = ndoms_new; |
6589 | 6917 | ||
6590 | register_sched_domain_sysctl(); | 6918 | register_sched_domain_sysctl(); |
6919 | |||
6920 | unlock_doms_cur(); | ||
6591 | } | 6921 | } |
6592 | 6922 | ||
6593 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6923 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -6595,10 +6925,10 @@ static int arch_reinit_sched_domains(void) | |||
6595 | { | 6925 | { |
6596 | int err; | 6926 | int err; |
6597 | 6927 | ||
6598 | mutex_lock(&sched_hotcpu_mutex); | 6928 | get_online_cpus(); |
6599 | detach_destroy_domains(&cpu_online_map); | 6929 | detach_destroy_domains(&cpu_online_map); |
6600 | err = arch_init_sched_domains(&cpu_online_map); | 6930 | err = arch_init_sched_domains(&cpu_online_map); |
6601 | mutex_unlock(&sched_hotcpu_mutex); | 6931 | put_online_cpus(); |
6602 | 6932 | ||
6603 | return err; | 6933 | return err; |
6604 | } | 6934 | } |
@@ -6709,12 +7039,12 @@ void __init sched_init_smp(void) | |||
6709 | { | 7039 | { |
6710 | cpumask_t non_isolated_cpus; | 7040 | cpumask_t non_isolated_cpus; |
6711 | 7041 | ||
6712 | mutex_lock(&sched_hotcpu_mutex); | 7042 | get_online_cpus(); |
6713 | arch_init_sched_domains(&cpu_online_map); | 7043 | arch_init_sched_domains(&cpu_online_map); |
6714 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7044 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6715 | if (cpus_empty(non_isolated_cpus)) | 7045 | if (cpus_empty(non_isolated_cpus)) |
6716 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7046 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6717 | mutex_unlock(&sched_hotcpu_mutex); | 7047 | put_online_cpus(); |
6718 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7048 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6719 | hotcpu_notifier(update_sched_domains, 0); | 7049 | hotcpu_notifier(update_sched_domains, 0); |
6720 | 7050 | ||
@@ -6722,6 +7052,21 @@ void __init sched_init_smp(void) | |||
6722 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7052 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6723 | BUG(); | 7053 | BUG(); |
6724 | sched_init_granularity(); | 7054 | sched_init_granularity(); |
7055 | |||
7056 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7057 | if (nr_cpu_ids == 1) | ||
7058 | return; | ||
7059 | |||
7060 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
7061 | "group_balance"); | ||
7062 | if (!IS_ERR(lb_monitor_task)) { | ||
7063 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
7064 | wake_up_process(lb_monitor_task); | ||
7065 | } else { | ||
7066 | printk(KERN_ERR "Could not create load balance monitor thread " | ||
7067 | "(error = %ld)\n", PTR_ERR(lb_monitor_task)); | ||
7068 | } | ||
7069 | #endif | ||
6725 | } | 7070 | } |
6726 | #else | 7071 | #else |
6727 | void __init sched_init_smp(void) | 7072 | void __init sched_init_smp(void) |
@@ -6746,13 +7091,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
6746 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7091 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
6747 | } | 7092 | } |
6748 | 7093 | ||
7094 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
7095 | { | ||
7096 | struct rt_prio_array *array; | ||
7097 | int i; | ||
7098 | |||
7099 | array = &rt_rq->active; | ||
7100 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
7101 | INIT_LIST_HEAD(array->queue + i); | ||
7102 | __clear_bit(i, array->bitmap); | ||
7103 | } | ||
7104 | /* delimiter for bitsearch: */ | ||
7105 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7106 | |||
7107 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
7108 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
7109 | #endif | ||
7110 | #ifdef CONFIG_SMP | ||
7111 | rt_rq->rt_nr_migratory = 0; | ||
7112 | rt_rq->overloaded = 0; | ||
7113 | #endif | ||
7114 | |||
7115 | rt_rq->rt_time = 0; | ||
7116 | rt_rq->rt_throttled = 0; | ||
7117 | |||
7118 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7119 | rt_rq->rq = rq; | ||
7120 | #endif | ||
7121 | } | ||
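init_rt_rq() sets one bit past the last real priority purely as a delimiter, so a find-first-bit scan over the active bitmap always terminates. The standalone sketch below reproduces that trick with an ordinary byte-array bitmap; the priority range and the queued priority are arbitrary sample values, not the kernel's constants.

    #include <stdio.h>
    #include <string.h>

    #define MAX_PRIO 100                    /* illustrative, like MAX_RT_PRIO */
    #define NBITS    (MAX_PRIO + 1)         /* one extra bit as the delimiter */

    int main(void)
    {
            unsigned char bitmap[(NBITS + 7) / 8];
            int i;

            memset(bitmap, 0, sizeof(bitmap));
            /* the delimiter bit is always set, so the search below always stops */
            bitmap[MAX_PRIO / 8] |= 1u << (MAX_PRIO % 8);

            /* pretend a task is queued at priority 42 */
            bitmap[42 / 8] |= 1u << (42 % 8);

            for (i = 0; i < NBITS; i++)
                    if (bitmap[i / 8] & (1u << (i % 8)))
                            break;

            /* i == MAX_PRIO would mean "no queued task at any priority" */
            printf("first set bit: %d\n", i);
            return 0;
    }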
7122 | |||
7123 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7124 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
7125 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
7126 | int cpu, int add) | ||
7127 | { | ||
7128 | tg->cfs_rq[cpu] = cfs_rq; | ||
7129 | init_cfs_rq(cfs_rq, rq); | ||
7130 | cfs_rq->tg = tg; | ||
7131 | if (add) | ||
7132 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7133 | |||
7134 | tg->se[cpu] = se; | ||
7135 | se->cfs_rq = &rq->cfs; | ||
7136 | se->my_q = cfs_rq; | ||
7137 | se->load.weight = tg->shares; | ||
7138 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
7139 | se->parent = NULL; | ||
7140 | } | ||
7141 | |||
7142 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
7143 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
7144 | int cpu, int add) | ||
7145 | { | ||
7146 | tg->rt_rq[cpu] = rt_rq; | ||
7147 | init_rt_rq(rt_rq, rq); | ||
7148 | rt_rq->tg = tg; | ||
7149 | rt_rq->rt_se = rt_se; | ||
7150 | if (add) | ||
7151 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7152 | |||
7153 | tg->rt_se[cpu] = rt_se; | ||
7154 | rt_se->rt_rq = &rq->rt; | ||
7155 | rt_se->my_q = rt_rq; | ||
7156 | rt_se->parent = NULL; | ||
7157 | INIT_LIST_HEAD(&rt_se->run_list); | ||
7158 | } | ||
7159 | #endif | ||
7160 | |||
6749 | void __init sched_init(void) | 7161 | void __init sched_init(void) |
6750 | { | 7162 | { |
6751 | int highest_cpu = 0; | 7163 | int highest_cpu = 0; |
6752 | int i, j; | 7164 | int i, j; |
6753 | 7165 | ||
7166 | #ifdef CONFIG_SMP | ||
7167 | init_defrootdomain(); | ||
7168 | #endif | ||
7169 | |||
7170 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7171 | list_add(&init_task_group.list, &task_groups); | ||
7172 | #endif | ||
7173 | |||
6754 | for_each_possible_cpu(i) { | 7174 | for_each_possible_cpu(i) { |
6755 | struct rt_prio_array *array; | ||
6756 | struct rq *rq; | 7175 | struct rq *rq; |
6757 | 7176 | ||
6758 | rq = cpu_rq(i); | 7177 | rq = cpu_rq(i); |
@@ -6761,52 +7180,39 @@ void __init sched_init(void) | |||
6761 | rq->nr_running = 0; | 7180 | rq->nr_running = 0; |
6762 | rq->clock = 1; | 7181 | rq->clock = 1; |
6763 | init_cfs_rq(&rq->cfs, rq); | 7182 | init_cfs_rq(&rq->cfs, rq); |
7183 | init_rt_rq(&rq->rt, rq); | ||
6764 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7184 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6765 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6766 | { | ||
6767 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6768 | struct sched_entity *se = | ||
6769 | &per_cpu(init_sched_entity, i); | ||
6770 | |||
6771 | init_cfs_rq_p[i] = cfs_rq; | ||
6772 | init_cfs_rq(cfs_rq, rq); | ||
6773 | cfs_rq->tg = &init_task_group; | ||
6774 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6775 | &rq->leaf_cfs_rq_list); | ||
6776 | |||
6777 | init_sched_entity_p[i] = se; | ||
6778 | se->cfs_rq = &rq->cfs; | ||
6779 | se->my_q = cfs_rq; | ||
6780 | se->load.weight = init_task_group_load; | ||
6781 | se->load.inv_weight = | ||
6782 | div64_64(1ULL<<32, init_task_group_load); | ||
6783 | se->parent = NULL; | ||
6784 | } | ||
6785 | init_task_group.shares = init_task_group_load; | 7185 | init_task_group.shares = init_task_group_load; |
6786 | spin_lock_init(&init_task_group.lock); | 7186 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7187 | init_tg_cfs_entry(rq, &init_task_group, | ||
7188 | &per_cpu(init_cfs_rq, i), | ||
7189 | &per_cpu(init_sched_entity, i), i, 1); | ||
7190 | |||
7191 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
7192 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
7193 | init_tg_rt_entry(rq, &init_task_group, | ||
7194 | &per_cpu(init_rt_rq, i), | ||
7195 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
6787 | #endif | 7196 | #endif |
7197 | rq->rt_period_expire = 0; | ||
7198 | rq->rt_throttled = 0; | ||
6788 | 7199 | ||
6789 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7200 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6790 | rq->cpu_load[j] = 0; | 7201 | rq->cpu_load[j] = 0; |
6791 | #ifdef CONFIG_SMP | 7202 | #ifdef CONFIG_SMP |
6792 | rq->sd = NULL; | 7203 | rq->sd = NULL; |
7204 | rq->rd = NULL; | ||
6793 | rq->active_balance = 0; | 7205 | rq->active_balance = 0; |
6794 | rq->next_balance = jiffies; | 7206 | rq->next_balance = jiffies; |
6795 | rq->push_cpu = 0; | 7207 | rq->push_cpu = 0; |
6796 | rq->cpu = i; | 7208 | rq->cpu = i; |
6797 | rq->migration_thread = NULL; | 7209 | rq->migration_thread = NULL; |
6798 | INIT_LIST_HEAD(&rq->migration_queue); | 7210 | INIT_LIST_HEAD(&rq->migration_queue); |
7211 | rq_attach_root(rq, &def_root_domain); | ||
6799 | #endif | 7212 | #endif |
7213 | init_rq_hrtick(rq); | ||
6800 | atomic_set(&rq->nr_iowait, 0); | 7214 | atomic_set(&rq->nr_iowait, 0); |
6801 | |||
6802 | array = &rq->rt.active; | ||
6803 | for (j = 0; j < MAX_RT_PRIO; j++) { | ||
6804 | INIT_LIST_HEAD(array->queue + j); | ||
6805 | __clear_bit(j, array->bitmap); | ||
6806 | } | ||
6807 | highest_cpu = i; | 7215 | highest_cpu = i; |
6808 | /* delimiter for bitsearch: */ | ||
6809 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
6810 | } | 7216 | } |
6811 | 7217 | ||
6812 | set_load_weight(&init_task); | 7218 | set_load_weight(&init_task); |
@@ -6975,12 +7381,187 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6975 | 7381 | ||
6976 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7382 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6977 | 7383 | ||
7384 | #ifdef CONFIG_SMP | ||
7385 | /* | ||
7386 | * distribute shares of all task groups among their schedulable entities, | ||
7387 | * to reflect load distribution across cpus. | ||
7388 | */ | ||
7389 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7390 | { | ||
7391 | struct cfs_rq *cfs_rq; | ||
7392 | struct rq *rq = cpu_rq(this_cpu); | ||
7393 | cpumask_t sdspan = sd->span; | ||
7394 | int balanced = 1; | ||
7395 | |||
7396 | /* Walk through all the task groups that we have */ | ||
7397 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7398 | int i; | ||
7399 | unsigned long total_load = 0, total_shares; | ||
7400 | struct task_group *tg = cfs_rq->tg; | ||
7401 | |||
7402 | /* Gather total task load of this group across cpus */ | ||
7403 | for_each_cpu_mask(i, sdspan) | ||
7404 | total_load += tg->cfs_rq[i]->load.weight; | ||
7405 | |||
7406 | /* Nothing to do if this group has no load */ | ||
7407 | if (!total_load) | ||
7408 | continue; | ||
7409 | |||
7410 | /* | ||
7411 | * tg->shares represents the number of cpu shares the task group | ||
7412 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7413 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
7414 | */ | ||
7415 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7416 | |||
7417 | /* | ||
7418 | * redistribute total_shares across cpus as per the task load | ||
7419 | * distribution. | ||
7420 | */ | ||
7421 | for_each_cpu_mask(i, sdspan) { | ||
7422 | unsigned long local_load, local_shares; | ||
7423 | |||
7424 | local_load = tg->cfs_rq[i]->load.weight; | ||
7425 | local_shares = (local_load * total_shares) / total_load; | ||
7426 | if (!local_shares) | ||
7427 | local_shares = MIN_GROUP_SHARES; | ||
7428 | if (local_shares == tg->se[i]->load.weight) | ||
7429 | continue; | ||
7430 | |||
7431 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7432 | set_se_shares(tg->se[i], local_shares); | ||
7433 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7434 | balanced = 0; | ||
7435 | } | ||
7436 | } | ||
7437 | |||
7438 | return balanced; | ||
7439 | } | ||
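The heart of rebalance_shares() is the proportional split local_shares = local_load * total_shares / total_load, clamped so that a cpu carrying none of the group's load still keeps a token share. The arithmetic in isolation, with made-up load figures and an illustrative floor value:

    #include <stdio.h>

    #define MIN_SHARES 2UL                  /* illustrative floor, like MIN_GROUP_SHARES */

    int main(void)
    {
            unsigned long shares = 1024;    /* tg->shares, i.e. the per-cpu entitlement */
            unsigned long load[4] = { 3000, 1000, 0, 4000 };   /* per-cpu group load */
            unsigned long total_load = 0, total_shares;
            unsigned long i;

            for (i = 0; i < 4; i++)
                    total_load += load[i];

            /* on N cpus the group may hold N * tg->shares in total */
            total_shares = shares * 4;

            for (i = 0; i < 4; i++) {
                    unsigned long local = load[i] * total_shares / total_load;
                    if (!local)
                            local = MIN_SHARES;
                    printf("cpu%lu: load %lu -> shares %lu\n", i, load[i], local);
            }
            return 0;
    }

With these numbers the busiest cpu ends up with 2048 of the 4096 group-wide shares, matching its half of the total load.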
7440 | |||
7441 | /* | ||
7442 | * How frequently should we rebalance_shares() across cpus? | ||
7443 | * | ||
7444 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
7445 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
7446 | * also implies increased scheduling overhead. | ||
7447 | * | ||
7448 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7449 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7450 | * | ||
7451 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7452 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7453 | * | ||
7454 | * These settings allow for the appropriate trade-off between accuracy of | ||
7455 | * fairness and the associated overhead. | ||
7456 | * | ||
7457 | */ | ||
7458 | |||
7459 | /* default: 8ms, units: milliseconds */ | ||
7460 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7461 | |||
7462 | /* default: 128ms, units: milliseconds */ | ||
7463 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
7464 | |||
7465 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7466 | static int load_balance_monitor(void *unused) | ||
7467 | { | ||
7468 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7469 | struct sched_param schedparm; | ||
7470 | int ret; | ||
7471 | |||
7472 | /* | ||
7473 | * We don't want this thread's execution to be limited by the shares | ||
7474 | * assigned to default group (init_task_group). Hence make it run | ||
7475 | * as a SCHED_RR RT task at the lowest priority. | ||
7476 | */ | ||
7477 | schedparm.sched_priority = 1; | ||
7478 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7479 | if (ret) | ||
7480 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7481 | " monitor thread (error = %d) \n", ret); | ||
7482 | |||
7483 | while (!kthread_should_stop()) { | ||
7484 | int i, cpu, balanced = 1; | ||
7485 | |||
7486 | /* Prevent cpus going down or coming up */ | ||
7487 | get_online_cpus(); | ||
7488 | /* lockout changes to doms_cur[] array */ | ||
7489 | lock_doms_cur(); | ||
7490 | /* | ||
7491 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7492 | * chain on various cpus and to walk task group list | ||
7493 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7494 | */ | ||
7495 | rcu_read_lock(); | ||
7496 | |||
7497 | for (i = 0; i < ndoms_cur; i++) { | ||
7498 | cpumask_t cpumap = doms_cur[i]; | ||
7499 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7500 | |||
7501 | cpu = first_cpu(cpumap); | ||
7502 | |||
7503 | /* Find the highest domain at which to balance shares */ | ||
7504 | for_each_domain(cpu, sd) { | ||
7505 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7506 | continue; | ||
7507 | sd_prev = sd; | ||
7508 | } | ||
7509 | |||
7510 | sd = sd_prev; | ||
7511 | /* sd == NULL? No load balance reqd in this domain */ | ||
7512 | if (!sd) | ||
7513 | continue; | ||
7514 | |||
7515 | balanced &= rebalance_shares(sd, cpu); | ||
7516 | } | ||
7517 | |||
7518 | rcu_read_unlock(); | ||
7519 | |||
7520 | unlock_doms_cur(); | ||
7521 | put_online_cpus(); | ||
7522 | |||
7523 | if (!balanced) | ||
7524 | timeout = sysctl_sched_min_bal_int_shares; | ||
7525 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7526 | timeout *= 2; | ||
7527 | |||
7528 | msleep_interruptible(timeout); | ||
7529 | } | ||
7530 | |||
7531 | return 0; | ||
7532 | } | ||
7533 | #endif /* CONFIG_SMP */ | ||
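The monitor's sleep interval follows the policy described in the comment block above: any pass that actually moved shares snaps the interval back to the minimum, while fully balanced passes double it up to the maximum. That backoff on its own, using the two sysctl defaults quoted above as plain constants and a made-up sequence of pass results:

    #include <stdio.h>

    int main(void)
    {
            const unsigned int min_ms = 8, max_ms = 128;    /* the defaults above */
            unsigned int timeout = min_ms;
            int balanced[] = { 1, 1, 1, 0, 1, 1, 1, 1, 1 }; /* sample pass results */
            int i;

            for (i = 0; i < 9; i++) {
                    if (!balanced[i])
                            timeout = min_ms;   /* work was done: poll again soon */
                    else if (timeout < max_ms)
                            timeout *= 2;       /* nothing to do: back off */
                    printf("pass %d: sleep %u ms\n", i, timeout);
            }
            return 0;
    }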
7534 | |||
7535 | static void free_sched_group(struct task_group *tg) | ||
7536 | { | ||
7537 | int i; | ||
7538 | |||
7539 | for_each_possible_cpu(i) { | ||
7540 | if (tg->cfs_rq) | ||
7541 | kfree(tg->cfs_rq[i]); | ||
7542 | if (tg->se) | ||
7543 | kfree(tg->se[i]); | ||
7544 | if (tg->rt_rq) | ||
7545 | kfree(tg->rt_rq[i]); | ||
7546 | if (tg->rt_se) | ||
7547 | kfree(tg->rt_se[i]); | ||
7548 | } | ||
7549 | |||
7550 | kfree(tg->cfs_rq); | ||
7551 | kfree(tg->se); | ||
7552 | kfree(tg->rt_rq); | ||
7553 | kfree(tg->rt_se); | ||
7554 | kfree(tg); | ||
7555 | } | ||
7556 | |||
6978 | /* allocate runqueue etc for a new task group */ | 7557 | /* allocate runqueue etc for a new task group */ |
6979 | struct task_group *sched_create_group(void) | 7558 | struct task_group *sched_create_group(void) |
6980 | { | 7559 | { |
6981 | struct task_group *tg; | 7560 | struct task_group *tg; |
6982 | struct cfs_rq *cfs_rq; | 7561 | struct cfs_rq *cfs_rq; |
6983 | struct sched_entity *se; | 7562 | struct sched_entity *se; |
7563 | struct rt_rq *rt_rq; | ||
7564 | struct sched_rt_entity *rt_se; | ||
6984 | struct rq *rq; | 7565 | struct rq *rq; |
6985 | int i; | 7566 | int i; |
6986 | 7567 | ||
@@ -6994,97 +7575,89 @@ struct task_group *sched_create_group(void) | |||
6994 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7575 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
6995 | if (!tg->se) | 7576 | if (!tg->se) |
6996 | goto err; | 7577 | goto err; |
7578 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
7579 | if (!tg->rt_rq) | ||
7580 | goto err; | ||
7581 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
7582 | if (!tg->rt_se) | ||
7583 | goto err; | ||
7584 | |||
7585 | tg->shares = NICE_0_LOAD; | ||
7586 | tg->rt_ratio = 0; /* XXX */ | ||
6997 | 7587 | ||
6998 | for_each_possible_cpu(i) { | 7588 | for_each_possible_cpu(i) { |
6999 | rq = cpu_rq(i); | 7589 | rq = cpu_rq(i); |
7000 | 7590 | ||
7001 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7591 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
7002 | cpu_to_node(i)); | 7592 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7003 | if (!cfs_rq) | 7593 | if (!cfs_rq) |
7004 | goto err; | 7594 | goto err; |
7005 | 7595 | ||
7006 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7596 | se = kmalloc_node(sizeof(struct sched_entity), |
7007 | cpu_to_node(i)); | 7597 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7008 | if (!se) | 7598 | if (!se) |
7009 | goto err; | 7599 | goto err; |
7010 | 7600 | ||
7011 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7601 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
7012 | memset(se, 0, sizeof(struct sched_entity)); | 7602 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7603 | if (!rt_rq) | ||
7604 | goto err; | ||
7013 | 7605 | ||
7014 | tg->cfs_rq[i] = cfs_rq; | 7606 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
7015 | init_cfs_rq(cfs_rq, rq); | 7607 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7016 | cfs_rq->tg = tg; | 7608 | if (!rt_se) |
7609 | goto err; | ||
7017 | 7610 | ||
7018 | tg->se[i] = se; | 7611 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
7019 | se->cfs_rq = &rq->cfs; | 7612 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
7020 | se->my_q = cfs_rq; | ||
7021 | se->load.weight = NICE_0_LOAD; | ||
7022 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
7023 | se->parent = NULL; | ||
7024 | } | 7613 | } |
7025 | 7614 | ||
7615 | lock_task_group_list(); | ||
7026 | for_each_possible_cpu(i) { | 7616 | for_each_possible_cpu(i) { |
7027 | rq = cpu_rq(i); | 7617 | rq = cpu_rq(i); |
7028 | cfs_rq = tg->cfs_rq[i]; | 7618 | cfs_rq = tg->cfs_rq[i]; |
7029 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7619 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7620 | rt_rq = tg->rt_rq[i]; | ||
7621 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7030 | } | 7622 | } |
7031 | 7623 | list_add_rcu(&tg->list, &task_groups); | |
7032 | tg->shares = NICE_0_LOAD; | 7624 | unlock_task_group_list(); |
7033 | spin_lock_init(&tg->lock); | ||
7034 | 7625 | ||
7035 | return tg; | 7626 | return tg; |
7036 | 7627 | ||
7037 | err: | 7628 | err: |
7038 | for_each_possible_cpu(i) { | 7629 | free_sched_group(tg); |
7039 | if (tg->cfs_rq) | ||
7040 | kfree(tg->cfs_rq[i]); | ||
7041 | if (tg->se) | ||
7042 | kfree(tg->se[i]); | ||
7043 | } | ||
7044 | kfree(tg->cfs_rq); | ||
7045 | kfree(tg->se); | ||
7046 | kfree(tg); | ||
7047 | |||
7048 | return ERR_PTR(-ENOMEM); | 7630 | return ERR_PTR(-ENOMEM); |
7049 | } | 7631 | } |
7050 | 7632 | ||
7051 | /* rcu callback to free various structures associated with a task group */ | 7633 | /* rcu callback to free various structures associated with a task group */ |
7052 | static void free_sched_group(struct rcu_head *rhp) | 7634 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7053 | { | 7635 | { |
7054 | struct task_group *tg = container_of(rhp, struct task_group, rcu); | ||
7055 | struct cfs_rq *cfs_rq; | ||
7056 | struct sched_entity *se; | ||
7057 | int i; | ||
7058 | |||
7059 | /* now it should be safe to free those cfs_rqs */ | 7636 | /* now it should be safe to free those cfs_rqs */ |
7060 | for_each_possible_cpu(i) { | 7637 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7061 | cfs_rq = tg->cfs_rq[i]; | ||
7062 | kfree(cfs_rq); | ||
7063 | |||
7064 | se = tg->se[i]; | ||
7065 | kfree(se); | ||
7066 | } | ||
7067 | |||
7068 | kfree(tg->cfs_rq); | ||
7069 | kfree(tg->se); | ||
7070 | kfree(tg); | ||
7071 | } | 7638 | } |
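free_sched_group_rcu() recovers the enclosing task_group from the embedded rcu_head via container_of() and then reuses the ordinary free path. The pointer arithmetic behind container_of() can be shown in plain C; the structure and callback names below are invented for the example and are not the kernel's.

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct cb_head { void (*fn)(struct cb_head *); };

    struct group {
            int id;
            struct cb_head rcu;             /* embedded, like task_group::rcu */
    };

    static void free_group_cb(struct cb_head *head)
    {
            /* step back from the embedded member to the containing object */
            struct group *g = container_of(head, struct group, rcu);
            printf("freeing group %d\n", g->id);
    }

    int main(void)
    {
            struct group g = { .id = 7 };
            g.rcu.fn = free_group_cb;
            g.rcu.fn(&g.rcu);               /* stands in for the deferred RCU callback */
            return 0;
    }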
7072 | 7639 | ||
7073 | /* Destroy runqueue etc associated with a task group */ | 7640 | /* Destroy runqueue etc associated with a task group */ |
7074 | void sched_destroy_group(struct task_group *tg) | 7641 | void sched_destroy_group(struct task_group *tg) |
7075 | { | 7642 | { |
7076 | struct cfs_rq *cfs_rq = NULL; | 7643 | struct cfs_rq *cfs_rq = NULL; |
7644 | struct rt_rq *rt_rq = NULL; | ||
7077 | int i; | 7645 | int i; |
7078 | 7646 | ||
7647 | lock_task_group_list(); | ||
7079 | for_each_possible_cpu(i) { | 7648 | for_each_possible_cpu(i) { |
7080 | cfs_rq = tg->cfs_rq[i]; | 7649 | cfs_rq = tg->cfs_rq[i]; |
7081 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7650 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7651 | rt_rq = tg->rt_rq[i]; | ||
7652 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
7082 | } | 7653 | } |
7654 | list_del_rcu(&tg->list); | ||
7655 | unlock_task_group_list(); | ||
7083 | 7656 | ||
7084 | BUG_ON(!cfs_rq); | 7657 | BUG_ON(!cfs_rq); |
7085 | 7658 | ||
7086 | /* wait for possible concurrent references to cfs_rqs complete */ | 7659 | /* wait for possible concurrent references to cfs_rqs complete */ |
7087 | call_rcu(&tg->rcu, free_sched_group); | 7660 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7088 | } | 7661 | } |
7089 | 7662 | ||
7090 | /* change task's runqueue when it moves between groups. | 7663 | /* change task's runqueue when it moves between groups. |
@@ -7100,11 +7673,6 @@ void sched_move_task(struct task_struct *tsk) | |||
7100 | 7673 | ||
7101 | rq = task_rq_lock(tsk, &flags); | 7674 | rq = task_rq_lock(tsk, &flags); |
7102 | 7675 | ||
7103 | if (tsk->sched_class != &fair_sched_class) { | ||
7104 | set_task_cfs_rq(tsk, task_cpu(tsk)); | ||
7105 | goto done; | ||
7106 | } | ||
7107 | |||
7108 | update_rq_clock(rq); | 7676 | update_rq_clock(rq); |
7109 | 7677 | ||
7110 | running = task_current(rq, tsk); | 7678 | running = task_current(rq, tsk); |
@@ -7116,7 +7684,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7116 | tsk->sched_class->put_prev_task(rq, tsk); | 7684 | tsk->sched_class->put_prev_task(rq, tsk); |
7117 | } | 7685 | } |
7118 | 7686 | ||
7119 | set_task_cfs_rq(tsk, task_cpu(tsk)); | 7687 | set_task_rq(tsk, task_cpu(tsk)); |
7120 | 7688 | ||
7121 | if (on_rq) { | 7689 | if (on_rq) { |
7122 | if (unlikely(running)) | 7690 | if (unlikely(running)) |
@@ -7124,53 +7692,82 @@ void sched_move_task(struct task_struct *tsk) | |||
7124 | enqueue_task(rq, tsk, 0); | 7692 | enqueue_task(rq, tsk, 0); |
7125 | } | 7693 | } |
7126 | 7694 | ||
7127 | done: | ||
7128 | task_rq_unlock(rq, &flags); | 7695 | task_rq_unlock(rq, &flags); |
7129 | } | 7696 | } |
7130 | 7697 | ||
7698 | /* rq->lock to be locked by caller */ | ||
7131 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7699 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7132 | { | 7700 | { |
7133 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7701 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7134 | struct rq *rq = cfs_rq->rq; | 7702 | struct rq *rq = cfs_rq->rq; |
7135 | int on_rq; | 7703 | int on_rq; |
7136 | 7704 | ||
7137 | spin_lock_irq(&rq->lock); | 7705 | if (!shares) |
7706 | shares = MIN_GROUP_SHARES; | ||
7138 | 7707 | ||
7139 | on_rq = se->on_rq; | 7708 | on_rq = se->on_rq; |
7140 | if (on_rq) | 7709 | if (on_rq) { |
7141 | dequeue_entity(cfs_rq, se, 0); | 7710 | dequeue_entity(cfs_rq, se, 0); |
7711 | dec_cpu_load(rq, se->load.weight); | ||
7712 | } | ||
7142 | 7713 | ||
7143 | se->load.weight = shares; | 7714 | se->load.weight = shares; |
7144 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7715 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7145 | 7716 | ||
7146 | if (on_rq) | 7717 | if (on_rq) { |
7147 | enqueue_entity(cfs_rq, se, 0); | 7718 | enqueue_entity(cfs_rq, se, 0); |
7148 | 7719 | inc_cpu_load(rq, se->load.weight); | |
7149 | spin_unlock_irq(&rq->lock); | 7720 | } |
7150 | } | 7721 | } |
7151 | 7722 | ||
7152 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7723 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7153 | { | 7724 | { |
7154 | int i; | 7725 | int i; |
7726 | struct cfs_rq *cfs_rq; | ||
7727 | struct rq *rq; | ||
7728 | |||
7729 | lock_task_group_list(); | ||
7730 | if (tg->shares == shares) | ||
7731 | goto done; | ||
7732 | |||
7733 | if (shares < MIN_GROUP_SHARES) | ||
7734 | shares = MIN_GROUP_SHARES; | ||
7155 | 7735 | ||
7156 | /* | 7736 | /* |
7157 | * A weight of 0 or 1 can cause arithmetics problems. | 7737 | * Prevent any load balance activity (rebalance_shares, |
7158 | * (The default weight is 1024 - so there's no practical | 7738 | * load_balance_fair) from referring to this group first, |
7159 | * limitation from this.) | 7739 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. |
7160 | */ | 7740 | */ |
7161 | if (shares < 2) | 7741 | for_each_possible_cpu(i) { |
7162 | shares = 2; | 7742 | cfs_rq = tg->cfs_rq[i]; |
7743 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
7744 | } | ||
7163 | 7745 | ||
7164 | spin_lock(&tg->lock); | 7746 | /* wait for any ongoing reference to this group to finish */ |
7165 | if (tg->shares == shares) | 7747 | synchronize_sched(); |
7166 | goto done; | ||
7167 | 7748 | ||
7749 | /* | ||
7750 | * Now we are free to modify the group's share on each cpu | ||
7751 | * w/o tripping rebalance_share or load_balance_fair. | ||
7752 | */ | ||
7168 | tg->shares = shares; | 7753 | tg->shares = shares; |
7169 | for_each_possible_cpu(i) | 7754 | for_each_possible_cpu(i) { |
7755 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7170 | set_se_shares(tg->se[i], shares); | 7756 | set_se_shares(tg->se[i], shares); |
7757 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7758 | } | ||
7171 | 7759 | ||
7760 | /* | ||
7761 | * Enable load balance activity on this group, by inserting it back on | ||
7762 | * each cpu's rq->leaf_cfs_rq_list. | ||
7763 | */ | ||
7764 | for_each_possible_cpu(i) { | ||
7765 | rq = cpu_rq(i); | ||
7766 | cfs_rq = tg->cfs_rq[i]; | ||
7767 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7768 | } | ||
7172 | done: | 7769 | done: |
7173 | spin_unlock(&tg->lock); | 7770 | unlock_task_group_list(); |
7174 | return 0; | 7771 | return 0; |
7175 | } | 7772 | } |
7176 | 7773 | ||
@@ -7179,6 +7776,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
7179 | return tg->shares; | 7776 | return tg->shares; |
7180 | } | 7777 | } |
7181 | 7778 | ||
7779 | /* | ||
7780 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
7781 | */ | ||
7782 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
7783 | { | ||
7784 | struct task_group *tgi; | ||
7785 | unsigned long total = 0; | ||
7786 | |||
7787 | rcu_read_lock(); | ||
7788 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
7789 | total += tgi->rt_ratio; | ||
7790 | rcu_read_unlock(); | ||
7791 | |||
7792 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
7793 | return -EINVAL; | ||
7794 | |||
7795 | tg->rt_ratio = rt_ratio; | ||
7796 | return 0; | ||
7797 | } | ||
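sched_group_set_rt_ratio() is an admission check: the sum of every group's rt_ratio, with this group's current contribution replaced by the requested one, must stay within the global sysctl_sched_rt_ratio. The check in isolation, with made-up group ratios and cap:

    #include <stdio.h>

    /* Return 0 if the new ratio fits under the global cap, -1 otherwise. */
    static int rt_ratio_fits(const unsigned long *ratios, int ngroups,
                             int group, unsigned long new_ratio, unsigned long cap)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i < ngroups; i++)
                    total += ratios[i];

            /* swap the group's old contribution for the requested one */
            if (total + new_ratio - ratios[group] > cap)
                    return -1;
            return 0;
    }

    int main(void)
    {
            unsigned long ratios[3] = { 10, 20, 5 };   /* current per-group ratios */
            unsigned long cap = 62;                    /* stands in for sysctl_sched_rt_ratio */

            printf("raise group 2 to 25: %s\n",
                   rt_ratio_fits(ratios, 3, 2, 25, cap) ? "rejected" : "ok");
            printf("raise group 2 to 40: %s\n",
                   rt_ratio_fits(ratios, 3, 2, 40, cap) ? "rejected" : "ok");
            return 0;
    }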
7798 | |||
7799 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
7800 | { | ||
7801 | return tg->rt_ratio; | ||
7802 | } | ||
7803 | |||
7182 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7804 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7183 | 7805 | ||
7184 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7806 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -7254,12 +7876,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7254 | return (u64) tg->shares; | 7876 | return (u64) tg->shares; |
7255 | } | 7877 | } |
7256 | 7878 | ||
7879 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, | ||
7880 | u64 rt_ratio_val) | ||
7881 | { | ||
7882 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); | ||
7883 | } | ||
7884 | |||
7885 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
7886 | { | ||
7887 | struct task_group *tg = cgroup_tg(cgrp); | ||
7888 | |||
7889 | return (u64) tg->rt_ratio; | ||
7890 | } | ||
7891 | |||
7257 | static struct cftype cpu_files[] = { | 7892 | static struct cftype cpu_files[] = { |
7258 | { | 7893 | { |
7259 | .name = "shares", | 7894 | .name = "shares", |
7260 | .read_uint = cpu_shares_read_uint, | 7895 | .read_uint = cpu_shares_read_uint, |
7261 | .write_uint = cpu_shares_write_uint, | 7896 | .write_uint = cpu_shares_write_uint, |
7262 | }, | 7897 | }, |
7898 | { | ||
7899 | .name = "rt_ratio", | ||
7900 | .read_uint = cpu_rt_ratio_read_uint, | ||
7901 | .write_uint = cpu_rt_ratio_write_uint, | ||
7902 | }, | ||
7263 | }; | 7903 | }; |
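With the extra cftype entry, each cpu cgroup exposes an rt_ratio file next to shares, and both behave as ordinary cgroup control files. A hedged sketch of setting them programmatically; the /cgroup mount point and the "mygroup" directory are assumptions made for the example, not something the patch mandates.

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Write a value into a cgroup control file; the path layout is assumed. */
    static int cg_write(const char *path, const char *val)
    {
            int fd = open(path, O_WRONLY);
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, val, strlen(val));
            close(fd);
            return n < 0 ? -1 : 0;
    }

    int main(void)
    {
            /* assumes the cpu controller is mounted at /cgroup and mygroup exists */
            cg_write("/cgroup/mygroup/cpu.shares", "2048");
            cg_write("/cgroup/mygroup/cpu.rt_ratio", "10");
            return 0;
    }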
7264 | 7904 | ||
7265 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7905 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |