Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1972
1 file changed, 1297 insertions, 675 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..b44b9a43b0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/completion.h> 31#include <linux/completion.h>
32#include <linux/kernel_stat.h> 32#include <linux/kernel_stat.h>
33#include <linux/debug_locks.h>
33#include <linux/security.h> 34#include <linux/security.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
35#include <linux/profile.h> 36#include <linux/profile.h>
@@ -50,6 +51,7 @@
50#include <linux/times.h> 51#include <linux/times.h>
51#include <linux/acct.h> 52#include <linux/acct.h>
52#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h>
53#include <asm/tlb.h> 55#include <asm/tlb.h>
54 56
55#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -168,29 +170,28 @@
168 */ 170 */
169 171
170#define SCALE_PRIO(x, prio) \ 172#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 173 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 174
173static unsigned int task_timeslice(task_t *p) 175static unsigned int static_prio_timeslice(int static_prio)
174{ 176{
175 if (p->static_prio < NICE_TO_PRIO(0)) 177 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 179 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 180 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
181}
182
183static inline unsigned int task_timeslice(struct task_struct *p)
184{
185 return static_prio_timeslice(p->static_prio);
179} 186}
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time)
182 187
183/* 188/*
184 * These are the runqueue data structures: 189 * These are the runqueue data structures:
185 */ 190 */
186 191
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t;
190
191struct prio_array { 192struct prio_array {
192 unsigned int nr_active; 193 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 194 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 195 struct list_head queue[MAX_PRIO];
195}; 196};
196 197
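
To see what the new static_prio_timeslice() helper actually hands out, here is a standalone sketch of the formula above. The constants are assumptions (the usual 2.6.17-era values with HZ == 1000, so one jiffy is one millisecond); the authoritative definitions live in the kernel headers:

#include <stdio.h>

/* Assumed values, expressed in milliseconds rather than jiffies. */
#define MIN_TIMESLICE		5
#define DEF_TIMESLICE		100
#define MAX_PRIO		140
#define MAX_USER_PRIO		40
#define NICE_TO_PRIO(nice)	(120 + (nice))

#define MAX(a, b)		((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
	MAX((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	int nices[] = { -20, -10, 0, 10, 19 };
	unsigned int i;

	/* Prints 800, 600, 100, 50 and 5 ms respectively. */
	for (i = 0; i < sizeof(nices) / sizeof(nices[0]); i++)
		printf("nice %3d -> %u ms\n", nices[i],
		       static_prio_timeslice(NICE_TO_PRIO(nices[i])));
	return 0;
}

The split at NICE_TO_PRIO(0) is what gives negatively reniced tasks access to the four-times-larger slice range.
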
@@ -201,7 +202,7 @@ struct prio_array {
201 * (such as the load balancing or the thread migration code), lock 202 * (such as the load balancing or the thread migration code), lock
202 * acquire operations must be ordered by ascending &runqueue. 203 * acquire operations must be ordered by ascending &runqueue.
203 */ 204 */
204struct runqueue { 205struct rq {
205 spinlock_t lock; 206 spinlock_t lock;
206 207
207 /* 208 /*
@@ -209,6 +210,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 210 * remote CPUs use both these fields when doing load calculation.
210 */ 211 */
211 unsigned long nr_running; 212 unsigned long nr_running;
213 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 214#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 215 unsigned long cpu_load[3];
214#endif 216#endif
@@ -224,9 +226,9 @@ struct runqueue {
224 226
225 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
226 unsigned long long timestamp_last_tick; 228 unsigned long long timestamp_last_tick;
227 task_t *curr, *idle; 229 struct task_struct *curr, *idle;
228 struct mm_struct *prev_mm; 230 struct mm_struct *prev_mm;
229 prio_array_t *active, *expired, arrays[2]; 231 struct prio_array *active, *expired, arrays[2];
230 int best_expired_prio; 232 int best_expired_prio;
231 atomic_t nr_iowait; 233 atomic_t nr_iowait;
232 234
@@ -237,9 +239,8 @@ struct runqueue {
237 int active_balance; 239 int active_balance;
238 int push_cpu; 240 int push_cpu;
239 241
240 task_t *migration_thread; 242 struct task_struct *migration_thread;
241 struct list_head migration_queue; 243 struct list_head migration_queue;
242 int cpu;
243#endif 244#endif
244 245
245#ifdef CONFIG_SCHEDSTATS 246#ifdef CONFIG_SCHEDSTATS
@@ -261,9 +262,10 @@ struct runqueue {
261 unsigned long ttwu_cnt; 262 unsigned long ttwu_cnt;
262 unsigned long ttwu_local; 263 unsigned long ttwu_local;
263#endif 264#endif
265 struct lock_class_key rq_lock_key;
264}; 266};
265 267
266static DEFINE_PER_CPU(struct runqueue, runqueues); 268static DEFINE_PER_CPU(struct rq, runqueues);
267 269
268/* 270/*
269 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 271 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -272,8 +274,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
272 * The domain tree of any CPU may only be accessed from within 274 * The domain tree of any CPU may only be accessed from within
273 * preempt-disabled sections. 275 * preempt-disabled sections.
274 */ 276 */
275#define for_each_domain(cpu, domain) \ 277#define for_each_domain(cpu, __sd) \
276for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) 278 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
277 279
278#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 280#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
279#define this_rq() (&__get_cpu_var(runqueues)) 281#define this_rq() (&__get_cpu_var(runqueues))
@@ -288,26 +290,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
288#endif 290#endif
289 291
290#ifndef __ARCH_WANT_UNLOCKED_CTXSW 292#ifndef __ARCH_WANT_UNLOCKED_CTXSW
291static inline int task_running(runqueue_t *rq, task_t *p) 293static inline int task_running(struct rq *rq, struct task_struct *p)
292{ 294{
293 return rq->curr == p; 295 return rq->curr == p;
294} 296}
295 297
296static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 298static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
297{ 299{
298} 300}
299 301
300static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 302static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
301{ 303{
302#ifdef CONFIG_DEBUG_SPINLOCK 304#ifdef CONFIG_DEBUG_SPINLOCK
303 /* this is a valid case when another task releases the spinlock */ 305 /* this is a valid case when another task releases the spinlock */
304 rq->lock.owner = current; 306 rq->lock.owner = current;
305#endif 307#endif
308 /*
309 * If we are tracking spinlock dependencies then we have to
310 * fix up the runqueue lock - which gets 'carried over' from
311 * prev into current:
312 */
313 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
314
306 spin_unlock_irq(&rq->lock); 315 spin_unlock_irq(&rq->lock);
307} 316}
308 317
309#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 318#else /* __ARCH_WANT_UNLOCKED_CTXSW */
310static inline int task_running(runqueue_t *rq, task_t *p) 319static inline int task_running(struct rq *rq, struct task_struct *p)
311{ 320{
312#ifdef CONFIG_SMP 321#ifdef CONFIG_SMP
313 return p->oncpu; 322 return p->oncpu;
@@ -316,7 +325,7 @@ static inline int task_running(runqueue_t *rq, task_t *p)
316#endif 325#endif
317} 326}
318 327
319static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 328static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
320{ 329{
321#ifdef CONFIG_SMP 330#ifdef CONFIG_SMP
322 /* 331 /*
@@ -333,7 +342,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
333#endif 342#endif
334} 343}
335 344
336static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 345static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
337{ 346{
338#ifdef CONFIG_SMP 347#ifdef CONFIG_SMP
339 /* 348 /*
@@ -351,14 +360,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 360#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 361
353/* 362/*
363 * __task_rq_lock - lock the runqueue a given task resides on.
364 * Must be called with interrupts disabled.
365 */
366static inline struct rq *__task_rq_lock(struct task_struct *p)
367 __acquires(rq->lock)
368{
369 struct rq *rq;
370
371repeat_lock_task:
372 rq = task_rq(p);
373 spin_lock(&rq->lock);
374 if (unlikely(rq != task_rq(p))) {
375 spin_unlock(&rq->lock);
376 goto repeat_lock_task;
377 }
378 return rq;
379}
380
381/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 382 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 383 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 384 * explicitly disabling preemption.
357 */ 385 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 386static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
359 __acquires(rq->lock) 387 __acquires(rq->lock)
360{ 388{
361 struct runqueue *rq; 389 struct rq *rq;
362 390
363repeat_lock_task: 391repeat_lock_task:
364 local_irq_save(*flags); 392 local_irq_save(*flags);
@@ -371,7 +399,13 @@ repeat_lock_task:
371 return rq; 399 return rq;
372} 400}
373 401
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 402static inline void __task_rq_unlock(struct rq *rq)
403 __releases(rq->lock)
404{
405 spin_unlock(&rq->lock);
406}
407
408static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
375 __releases(rq->lock) 409 __releases(rq->lock)
376{ 410{
377 spin_unlock_irqrestore(&rq->lock, *flags); 411 spin_unlock_irqrestore(&rq->lock, *flags);
@@ -391,7 +425,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
391 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 425 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
392 seq_printf(seq, "timestamp %lu\n", jiffies); 426 seq_printf(seq, "timestamp %lu\n", jiffies);
393 for_each_online_cpu(cpu) { 427 for_each_online_cpu(cpu) {
394 runqueue_t *rq = cpu_rq(cpu); 428 struct rq *rq = cpu_rq(cpu);
395#ifdef CONFIG_SMP 429#ifdef CONFIG_SMP
396 struct sched_domain *sd; 430 struct sched_domain *sd;
397 int dcnt = 0; 431 int dcnt = 0;
@@ -468,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
468 .release = single_release, 502 .release = single_release,
469}; 503};
470 504
505/*
506 * Expects runqueue lock to be held for atomicity of update
507 */
508static inline void
509rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
510{
511 if (rq) {
512 rq->rq_sched_info.run_delay += delta_jiffies;
513 rq->rq_sched_info.pcnt++;
514 }
515}
516
517/*
518 * Expects runqueue lock to be held for atomicity of update
519 */
520static inline void
521rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
522{
523 if (rq)
524 rq->rq_sched_info.cpu_time += delta_jiffies;
525}
471# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 526# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
472# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 527# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
473#else /* !CONFIG_SCHEDSTATS */ 528#else /* !CONFIG_SCHEDSTATS */
529static inline void
530rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
531{}
532static inline void
533rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
534{}
474# define schedstat_inc(rq, field) do { } while (0) 535# define schedstat_inc(rq, field) do { } while (0)
475# define schedstat_add(rq, field, amt) do { } while (0) 536# define schedstat_add(rq, field, amt) do { } while (0)
476#endif 537#endif
@@ -478,10 +539,10 @@ struct file_operations proc_schedstat_operations = {
478/* 539/*
479 * rq_lock - lock a given runqueue and disable interrupts. 540 * rq_lock - lock a given runqueue and disable interrupts.
480 */ 541 */
481static inline runqueue_t *this_rq_lock(void) 542static inline struct rq *this_rq_lock(void)
482 __acquires(rq->lock) 543 __acquires(rq->lock)
483{ 544{
484 runqueue_t *rq; 545 struct rq *rq;
485 546
486 local_irq_disable(); 547 local_irq_disable();
487 rq = this_rq(); 548 rq = this_rq();
@@ -490,7 +551,7 @@ static inline runqueue_t *this_rq_lock(void)
490 return rq; 551 return rq;
491} 552}
492 553
493#ifdef CONFIG_SCHEDSTATS 554#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
494/* 555/*
495 * Called when a process is dequeued from the active array and given 556 * Called when a process is dequeued from the active array and given
496 * the cpu. We should note that with the exception of interactive 557 * the cpu. We should note that with the exception of interactive
@@ -506,7 +567,7 @@ static inline runqueue_t *this_rq_lock(void)
506 * long it was from the *first* time it was queued to the time that it 567 * long it was from the *first* time it was queued to the time that it
507 * finally hit a cpu. 568 * finally hit a cpu.
508 */ 569 */
509static inline void sched_info_dequeued(task_t *t) 570static inline void sched_info_dequeued(struct task_struct *t)
510{ 571{
511 t->sched_info.last_queued = 0; 572 t->sched_info.last_queued = 0;
512} 573}
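
The effect of keeping only the *first* last_queued stamp is easiest to see in a toy model; the structure, function names and jiffy values below are invented for illustration and are not the kernel's own:

#include <stdio.h>

struct toy_sched_info {
	unsigned long last_queued, run_delay, pcnt;
};

static void toy_info_queued(struct toy_sched_info *si, unsigned long now)
{
	if (!si->last_queued)		/* only the *first* queueing counts */
		si->last_queued = now;
}

static void toy_info_arrive(struct toy_sched_info *si, unsigned long now)
{
	if (si->last_queued)
		si->run_delay += now - si->last_queued;
	si->last_queued = 0;		/* i.e. sched_info_dequeued() */
	si->pcnt++;
}

int main(void)
{
	struct toy_sched_info si = { 0, 0, 0 };

	toy_info_queued(&si, 1000);	/* first queued at jiffy 1000 */
	toy_info_queued(&si, 1005);	/* requeued: a no-op, stamp kept */
	toy_info_arrive(&si, 1012);	/* finally gets the CPU */
	printf("run_delay = %lu jiffies\n", si.run_delay);	/* 12, not 7 */
	return 0;
}

Queued at jiffy 1000, requeued at 1005 and finally run at 1012, the task is charged 12 jiffies of run delay rather than 7, because the second toy_info_queued() call is a no-op.
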
@@ -516,23 +577,18 @@ static inline void sched_info_dequeued(task_t *t)
516 * long it was waiting to run. We also note when it began so that we 577 * long it was waiting to run. We also note when it began so that we
517 * can keep stats on how long its timeslice is. 578 * can keep stats on how long its timeslice is.
518 */ 579 */
519static void sched_info_arrive(task_t *t) 580static void sched_info_arrive(struct task_struct *t)
520{ 581{
521 unsigned long now = jiffies, diff = 0; 582 unsigned long now = jiffies, delta_jiffies = 0;
522 struct runqueue *rq = task_rq(t);
523 583
524 if (t->sched_info.last_queued) 584 if (t->sched_info.last_queued)
525 diff = now - t->sched_info.last_queued; 585 delta_jiffies = now - t->sched_info.last_queued;
526 sched_info_dequeued(t); 586 sched_info_dequeued(t);
527 t->sched_info.run_delay += diff; 587 t->sched_info.run_delay += delta_jiffies;
528 t->sched_info.last_arrival = now; 588 t->sched_info.last_arrival = now;
529 t->sched_info.pcnt++; 589 t->sched_info.pcnt++;
530 590
531 if (!rq) 591 rq_sched_info_arrive(task_rq(t), delta_jiffies);
532 return;
533
534 rq->rq_sched_info.run_delay += diff;
535 rq->rq_sched_info.pcnt++;
536} 592}
537 593
538/* 594/*
@@ -550,25 +606,23 @@ static void sched_info_arrive(task_t *t)
550 * the timestamp if it is already not set. It's assumed that 606 * the timestamp if it is already not set. It's assumed that
551 * sched_info_dequeued() will clear that stamp when appropriate. 607 * sched_info_dequeued() will clear that stamp when appropriate.
552 */ 608 */
553static inline void sched_info_queued(task_t *t) 609static inline void sched_info_queued(struct task_struct *t)
554{ 610{
555 if (!t->sched_info.last_queued) 611 if (unlikely(sched_info_on()))
556 t->sched_info.last_queued = jiffies; 612 if (!t->sched_info.last_queued)
613 t->sched_info.last_queued = jiffies;
557} 614}
558 615
559/* 616/*
560 * Called when a process ceases being the active-running process, either 617 * Called when a process ceases being the active-running process, either
561 * voluntarily or involuntarily. Now we can calculate how long we ran. 618 * voluntarily or involuntarily. Now we can calculate how long we ran.
562 */ 619 */
563static inline void sched_info_depart(task_t *t) 620static inline void sched_info_depart(struct task_struct *t)
564{ 621{
565 struct runqueue *rq = task_rq(t); 622 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
566 unsigned long diff = jiffies - t->sched_info.last_arrival;
567
568 t->sched_info.cpu_time += diff;
569 623
570 if (rq) 624 t->sched_info.cpu_time += delta_jiffies;
571 rq->rq_sched_info.cpu_time += diff; 625 rq_sched_info_depart(task_rq(t), delta_jiffies);
572} 626}
573 627
574/* 628/*
@@ -576,9 +630,10 @@ static inline void sched_info_depart(task_t *t)
576 * their time slice. (This may also be called when switching to or from 630 * their time slice. (This may also be called when switching to or from
577 * the idle task.) We are only called when prev != next. 631 * the idle task.) We are only called when prev != next.
578 */ 632 */
579static inline void sched_info_switch(task_t *prev, task_t *next) 633static inline void
634__sched_info_switch(struct task_struct *prev, struct task_struct *next)
580{ 635{
581 struct runqueue *rq = task_rq(prev); 636 struct rq *rq = task_rq(prev);
582 637
583 /* 638 /*
584 * prev now departs the cpu. It's not interesting to record 639 * prev now departs the cpu. It's not interesting to record
@@ -591,15 +646,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next)
591 if (next != rq->idle) 646 if (next != rq->idle)
592 sched_info_arrive(next); 647 sched_info_arrive(next);
593} 648}
649static inline void
650sched_info_switch(struct task_struct *prev, struct task_struct *next)
651{
652 if (unlikely(sched_info_on()))
653 __sched_info_switch(prev, next);
654}
594#else 655#else
595#define sched_info_queued(t) do { } while (0) 656#define sched_info_queued(t) do { } while (0)
596#define sched_info_switch(t, next) do { } while (0) 657#define sched_info_switch(t, next) do { } while (0)
597#endif /* CONFIG_SCHEDSTATS */ 658#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
598 659
599/* 660/*
600 * Adding/removing a task to/from a priority array: 661 * Adding/removing a task to/from a priority array:
601 */ 662 */
602static void dequeue_task(struct task_struct *p, prio_array_t *array) 663static void dequeue_task(struct task_struct *p, struct prio_array *array)
603{ 664{
604 array->nr_active--; 665 array->nr_active--;
605 list_del(&p->run_list); 666 list_del(&p->run_list);
@@ -607,7 +668,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array)
607 __clear_bit(p->prio, array->bitmap); 668 __clear_bit(p->prio, array->bitmap);
608} 669}
609 670
610static void enqueue_task(struct task_struct *p, prio_array_t *array) 671static void enqueue_task(struct task_struct *p, struct prio_array *array)
611{ 672{
612 sched_info_queued(p); 673 sched_info_queued(p);
613 list_add_tail(&p->run_list, array->queue + p->prio); 674 list_add_tail(&p->run_list, array->queue + p->prio);
@@ -620,12 +681,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
620 * Put task to the end of the run list without the overhead of dequeue 681 * Put task to the end of the run list without the overhead of dequeue
621 * followed by enqueue. 682 * followed by enqueue.
622 */ 683 */
623static void requeue_task(struct task_struct *p, prio_array_t *array) 684static void requeue_task(struct task_struct *p, struct prio_array *array)
624{ 685{
625 list_move_tail(&p->run_list, array->queue + p->prio); 686 list_move_tail(&p->run_list, array->queue + p->prio);
626} 687}
627 688
628static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 689static inline void
690enqueue_task_head(struct task_struct *p, struct prio_array *array)
629{ 691{
630 list_add(&p->run_list, array->queue + p->prio); 692 list_add(&p->run_list, array->queue + p->prio);
631 __set_bit(p->prio, array->bitmap); 693 __set_bit(p->prio, array->bitmap);
@@ -634,7 +696,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 696}
635 697
636/* 698/*
637 * effective_prio - return the priority that is based on the static 699 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 700 * priority but is modified by bonuses/penalties.
639 * 701 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 702 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +709,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 709 *
648 * Both properties are important to certain workloads. 710 * Both properties are important to certain workloads.
649 */ 711 */
650static int effective_prio(task_t *p) 712
713static inline int __normal_prio(struct task_struct *p)
651{ 714{
652 int bonus, prio; 715 int bonus, prio;
653 716
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 717 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 718
659 prio = p->static_prio - bonus; 719 prio = p->static_prio - bonus;
@@ -665,57 +725,165 @@ static int effective_prio(task_t *p)
665} 725}
666 726
667/* 727/*
728 * To aid in avoiding the subversion of "niceness" due to uneven distribution
729 * of tasks with abnormal "nice" values across CPUs, the contribution that
730 * each task makes to its run queue's load is weighted according to its
731 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
732 * scaled version of the new time slice allocation that they receive on time
733 * slice expiry etc.
734 */
735
736/*
737 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
738 * If static_prio_timeslice() is ever changed to break this assumption then
739 * this code will need modification
740 */
741#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
742#define LOAD_WEIGHT(lp) \
743 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
744#define PRIO_TO_LOAD_WEIGHT(prio) \
745 LOAD_WEIGHT(static_prio_timeslice(prio))
746#define RTPRIO_TO_LOAD_WEIGHT(rp) \
747 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
748
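
A rough feel for the resulting numbers, assuming SCHED_LOAD_SCALE is 128 and the millisecond timeslices computed by static_prio_timeslice() above (both assumptions; check the headers of the tree at hand):

#include <stdio.h>

#define SCHED_LOAD_SCALE	128
#define TIME_SLICE_NICE_ZERO	100	/* DEF_TIMESLICE, assumed 100 ms */
#define LOAD_WEIGHT(lp)	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
	/* nice -20 gets an 800 ms slice, nice 0 100 ms, nice +19 5 ms */
	printf("nice -20: %d\n", LOAD_WEIGHT(800));	/* 1024 */
	printf("nice   0: %d\n", LOAD_WEIGHT(100));	/*  128 */
	printf("nice +19: %d\n", LOAD_WEIGHT(5));	/*    6 */
	/* any RT task: at least PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) == 1024 */
	return 0;
}

So a single nice -20 task contributes as much raw_weighted_load as eight nice 0 tasks, and any realtime task outweighs even that, which is exactly the skew the balancer is now able to see.
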
749static void set_load_weight(struct task_struct *p)
750{
751 if (has_rt_policy(p)) {
752#ifdef CONFIG_SMP
753 if (p == task_rq(p)->migration_thread)
754 /*
755 * The migration thread does the actual balancing.
756 * Giving its load any weight will skew balancing
757 * adversely.
758 */
759 p->load_weight = 0;
760 else
761#endif
762 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
763 } else
764 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
765}
766
767static inline void
768inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
769{
770 rq->raw_weighted_load += p->load_weight;
771}
772
773static inline void
774dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
775{
776 rq->raw_weighted_load -= p->load_weight;
777}
778
779static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
780{
781 rq->nr_running++;
782 inc_raw_weighted_load(rq, p);
783}
784
785static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
786{
787 rq->nr_running--;
788 dec_raw_weighted_load(rq, p);
789}
790
791/*
792 * Calculate the expected normal priority: i.e. priority
793 * without taking RT-inheritance into account. Might be
794 * boosted by interactivity modifiers. Changes upon fork,
795 * setprio syscalls, and whenever the interactivity
796 * estimator recalculates.
797 */
798static inline int normal_prio(struct task_struct *p)
799{
800 int prio;
801
802 if (has_rt_policy(p))
803 prio = MAX_RT_PRIO-1 - p->rt_priority;
804 else
805 prio = __normal_prio(p);
806 return prio;
807}
808
809/*
810 * Calculate the current priority, i.e. the priority
811 * taken into account by the scheduler. This value might
812 * be boosted by RT tasks, or might be boosted by
813 * interactivity modifiers. Will be RT if the task got
814 * RT-boosted. If not then it returns p->normal_prio.
815 */
816static int effective_prio(struct task_struct *p)
817{
818 p->normal_prio = normal_prio(p);
819 /*
820 * If we are an RT task or we were boosted to RT priority,
821 * keep the priority unchanged. Otherwise, update priority
822 * to the normal priority:
823 */
824 if (!rt_prio(p->prio))
825 return p->normal_prio;
826 return p->prio;
827}
828
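
For orientation, a sketch of the two priority spaces these helpers translate between; MAX_RT_PRIO is assumed to be 100 as in mainline, and the interactivity bonus applied by __normal_prio() is left out:

#include <stdio.h>

#define MAX_RT_PRIO	100

/* rt_priority runs 1..99 (higher is stronger); prio runs 0..139
 * (lower is stronger) and all SCHED_NORMAL tasks land in 100..139. */
static int sketch_normal_prio(int rt_policy, int rt_priority, int static_prio)
{
	if (rt_policy)
		return MAX_RT_PRIO - 1 - rt_priority;
	return static_prio;	/* +/- the interactivity bonus in reality */
}

int main(void)
{
	printf("SCHED_FIFO rt_priority 99 -> prio %d\n",
	       sketch_normal_prio(1, 99, 0));		/* 0   */
	printf("SCHED_FIFO rt_priority  1 -> prio %d\n",
	       sketch_normal_prio(1, 1, 0));		/* 98  */
	printf("SCHED_NORMAL nice 0       -> prio %d\n",
	       sketch_normal_prio(0, 0, 120));		/* 120 */
	return 0;
}

effective_prio() then leaves p->prio alone whenever it currently sits in the RT range (a genuine RT task, or one boosted there by priority inheritance) and otherwise lets it track normal_prio().
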
829/*
668 * __activate_task - move a task to the runqueue. 830 * __activate_task - move a task to the runqueue.
669 */ 831 */
670static void __activate_task(task_t *p, runqueue_t *rq) 832static void __activate_task(struct task_struct *p, struct rq *rq)
671{ 833{
672 prio_array_t *target = rq->active; 834 struct prio_array *target = rq->active;
673 835
674 if (batch_task(p)) 836 if (batch_task(p))
675 target = rq->expired; 837 target = rq->expired;
676 enqueue_task(p, target); 838 enqueue_task(p, target);
677 rq->nr_running++; 839 inc_nr_running(p, rq);
678} 840}
679 841
680/* 842/*
681 * __activate_idle_task - move idle task to the _front_ of runqueue. 843 * __activate_idle_task - move idle task to the _front_ of runqueue.
682 */ 844 */
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 845static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
684{ 846{
685 enqueue_task_head(p, rq->active); 847 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 848 inc_nr_running(p, rq);
687} 849}
688 850
689static int recalc_task_prio(task_t *p, unsigned long long now) 851/*
852 * Recalculate p->normal_prio and p->prio after having slept,
853 * updating the sleep-average too:
854 */
855static int recalc_task_prio(struct task_struct *p, unsigned long long now)
690{ 856{
691 /* Caller must always ensure 'now >= p->timestamp' */ 857 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 858 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 859
695 if (batch_task(p)) 860 if (batch_task(p))
696 sleep_time = 0; 861 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 862
704 if (likely(sleep_time > 0)) { 863 if (likely(sleep_time > 0)) {
705 /* 864 /*
706 * User tasks that sleep a long time are categorised as 865 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 866 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 867 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 868 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 869 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 870
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 871 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 872 /*
717 if (p->sleep_avg < ceiling) 873 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 874 * with one single large enough sleep.
875 */
876 p->sleep_avg = ceiling;
877 /*
878 * Using INTERACTIVE_SLEEP() as a ceiling places a
879 * nice(0) task 1ms sleep away from promotion, and
880 * gives it 700ms to round-robin with no chance of
881 * being demoted. This is more than generous, so
882 * mark this sleep as non-interactive to prevent the
883 * on-runqueue bonus logic from intervening should
884 * this task not receive cpu immediately.
885 */
886 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 887 } else {
720 /* 888 /*
721 * Tasks waking from uninterruptible sleep are 889 * Tasks waking from uninterruptible sleep are
@@ -723,12 +891,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 891 * are likely to be waiting on I/O
724 */ 892 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 893 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 894 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 895 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 896 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 897 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 898 p->sleep_avg = ceiling;
731 sleep_time = 0; 899 sleep_time = 0;
732 } 900 }
733 } 901 }
734 902
@@ -742,9 +910,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 910 */
743 p->sleep_avg += sleep_time; 911 p->sleep_avg += sleep_time;
744 912
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 913 }
914 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
915 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 916 }
749 917
750 return effective_prio(p); 918 return effective_prio(p);
@@ -756,7 +924,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
756 * Update all the scheduling statistics stuff. (sleep average 924 * Update all the scheduling statistics stuff. (sleep average
757 * calculation, priority modifiers, etc.) 925 * calculation, priority modifiers, etc.)
758 */ 926 */
759static void activate_task(task_t *p, runqueue_t *rq, int local) 927static void activate_task(struct task_struct *p, struct rq *rq, int local)
760{ 928{
761 unsigned long long now; 929 unsigned long long now;
762 930
@@ -764,7 +932,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
764#ifdef CONFIG_SMP 932#ifdef CONFIG_SMP
765 if (!local) { 933 if (!local) {
766 /* Compensate for drifting sched_clock */ 934 /* Compensate for drifting sched_clock */
767 runqueue_t *this_rq = this_rq(); 935 struct rq *this_rq = this_rq();
768 now = (now - this_rq->timestamp_last_tick) 936 now = (now - this_rq->timestamp_last_tick)
769 + rq->timestamp_last_tick; 937 + rq->timestamp_last_tick;
770 } 938 }
@@ -803,9 +971,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
803/* 971/*
804 * deactivate_task - remove a task from the runqueue. 972 * deactivate_task - remove a task from the runqueue.
805 */ 973 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 974static void deactivate_task(struct task_struct *p, struct rq *rq)
807{ 975{
808 rq->nr_running--; 976 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 977 dequeue_task(p, p->array);
810 p->array = NULL; 978 p->array = NULL;
811} 979}
@@ -818,7 +986,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 986 * the target CPU.
819 */ 987 */
820#ifdef CONFIG_SMP 988#ifdef CONFIG_SMP
821static void resched_task(task_t *p) 989
990#ifndef tsk_is_polling
991#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
992#endif
993
994static void resched_task(struct task_struct *p)
822{ 995{
823 int cpu; 996 int cpu;
824 997
@@ -833,13 +1006,13 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 1006 if (cpu == smp_processor_id())
834 return; 1007 return;
835 1008
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 1009 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 1010 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 1011 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 1012 smp_send_reschedule(cpu);
840} 1013}
841#else 1014#else
842static inline void resched_task(task_t *p) 1015static inline void resched_task(struct task_struct *p)
843{ 1016{
844 assert_spin_locked(&task_rq(p)->lock); 1017 assert_spin_locked(&task_rq(p)->lock);
845 set_tsk_need_resched(p); 1018 set_tsk_need_resched(p);
@@ -850,28 +1023,35 @@ static inline void resched_task(task_t *p)
850 * task_curr - is this task currently executing on a CPU? 1023 * task_curr - is this task currently executing on a CPU?
851 * @p: the task in question. 1024 * @p: the task in question.
852 */ 1025 */
853inline int task_curr(const task_t *p) 1026inline int task_curr(const struct task_struct *p)
854{ 1027{
855 return cpu_curr(task_cpu(p)) == p; 1028 return cpu_curr(task_cpu(p)) == p;
856} 1029}
857 1030
1031/* Used instead of source_load when we know the type == 0 */
1032unsigned long weighted_cpuload(const int cpu)
1033{
1034 return cpu_rq(cpu)->raw_weighted_load;
1035}
1036
858#ifdef CONFIG_SMP 1037#ifdef CONFIG_SMP
859typedef struct { 1038struct migration_req {
860 struct list_head list; 1039 struct list_head list;
861 1040
862 task_t *task; 1041 struct task_struct *task;
863 int dest_cpu; 1042 int dest_cpu;
864 1043
865 struct completion done; 1044 struct completion done;
866} migration_req_t; 1045};
867 1046
868/* 1047/*
869 * The task's runqueue lock must be held. 1048 * The task's runqueue lock must be held.
870 * Returns true if you have to wait for migration thread. 1049 * Returns true if you have to wait for migration thread.
871 */ 1050 */
872static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 1051static int
1052migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
873{ 1053{
874 runqueue_t *rq = task_rq(p); 1054 struct rq *rq = task_rq(p);
875 1055
876 /* 1056 /*
877 * If the task is not on a runqueue (and not running), then 1057 * If the task is not on a runqueue (and not running), then
@@ -886,6 +1066,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
886 req->task = p; 1066 req->task = p;
887 req->dest_cpu = dest_cpu; 1067 req->dest_cpu = dest_cpu;
888 list_add(&req->list, &rq->migration_queue); 1068 list_add(&req->list, &rq->migration_queue);
1069
889 return 1; 1070 return 1;
890} 1071}
891 1072
@@ -898,10 +1079,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
898 * smp_call_function() if an IPI is sent by the same process we are 1079 * smp_call_function() if an IPI is sent by the same process we are
899 * waiting to become inactive. 1080 * waiting to become inactive.
900 */ 1081 */
901void wait_task_inactive(task_t *p) 1082void wait_task_inactive(struct task_struct *p)
902{ 1083{
903 unsigned long flags; 1084 unsigned long flags;
904 runqueue_t *rq; 1085 struct rq *rq;
905 int preempted; 1086 int preempted;
906 1087
907repeat: 1088repeat:
@@ -932,7 +1113,7 @@ repeat:
932 * to another CPU then no harm is done and the purpose has been 1113 * to another CPU then no harm is done and the purpose has been
933 * achieved as well. 1114 * achieved as well.
934 */ 1115 */
935void kick_process(task_t *p) 1116void kick_process(struct task_struct *p)
936{ 1117{
937 int cpu; 1118 int cpu;
938 1119
@@ -944,32 +1125,45 @@ void kick_process(task_t *p)
944} 1125}
945 1126
946/* 1127/*
947 * Return a low guess at the load of a migration-source cpu. 1128 * Return a low guess at the load of a migration-source cpu weighted
1129 * according to the scheduling class and "nice" value.
948 * 1130 *
949 * We want to under-estimate the load of migration sources, to 1131 * We want to under-estimate the load of migration sources, to
950 * balance conservatively. 1132 * balance conservatively.
951 */ 1133 */
952static inline unsigned long source_load(int cpu, int type) 1134static inline unsigned long source_load(int cpu, int type)
953{ 1135{
954 runqueue_t *rq = cpu_rq(cpu); 1136 struct rq *rq = cpu_rq(cpu);
955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1137
956 if (type == 0) 1138 if (type == 0)
957 return load_now; 1139 return rq->raw_weighted_load;
958 1140
959 return min(rq->cpu_load[type-1], load_now); 1141 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
960} 1142}
961 1143
962/* 1144/*
963 * Return a high guess at the load of a migration-target cpu 1145 * Return a high guess at the load of a migration-target cpu weighted
1146 * according to the scheduling class and "nice" value.
964 */ 1147 */
965static inline unsigned long target_load(int cpu, int type) 1148static inline unsigned long target_load(int cpu, int type)
966{ 1149{
967 runqueue_t *rq = cpu_rq(cpu); 1150 struct rq *rq = cpu_rq(cpu);
968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1151
969 if (type == 0) 1152 if (type == 0)
970 return load_now; 1153 return rq->raw_weighted_load;
971 1154
972 return max(rq->cpu_load[type-1], load_now); 1155 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1156}
1157
1158/*
1159 * Return the average load per task on the cpu's run queue
1160 */
1161static inline unsigned long cpu_avg_load_per_task(int cpu)
1162{
1163 struct rq *rq = cpu_rq(cpu);
1164 unsigned long n = rq->nr_running;
1165
1166 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
973} 1167}
974 1168
975/* 1169/*
@@ -1042,7 +1236,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1042 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1236 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1043 1237
1044 for_each_cpu_mask(i, tmp) { 1238 for_each_cpu_mask(i, tmp) {
1045 load = source_load(i, 0); 1239 load = weighted_cpuload(i);
1046 1240
1047 if (load < min_load || (load == min_load && i == this_cpu)) { 1241 if (load < min_load || (load == min_load && i == this_cpu)) {
1048 min_load = load; 1242 min_load = load;
@@ -1069,9 +1263,15 @@ static int sched_balance_self(int cpu, int flag)
1069 struct task_struct *t = current; 1263 struct task_struct *t = current;
1070 struct sched_domain *tmp, *sd = NULL; 1264 struct sched_domain *tmp, *sd = NULL;
1071 1265
1072 for_each_domain(cpu, tmp) 1266 for_each_domain(cpu, tmp) {
1267 /*
1268 * If power savings logic is enabled for a domain, stop there.
1269 */
1270 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1271 break;
1073 if (tmp->flags & flag) 1272 if (tmp->flags & flag)
1074 sd = tmp; 1273 sd = tmp;
1274 }
1075 1275
1076 while (sd) { 1276 while (sd) {
1077 cpumask_t span; 1277 cpumask_t span;
@@ -1116,7 +1316,7 @@ nextlevel:
1116 * Returns the CPU we should wake onto. 1316 * Returns the CPU we should wake onto.
1117 */ 1317 */
1118#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1318#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1119static int wake_idle(int cpu, task_t *p) 1319static int wake_idle(int cpu, struct task_struct *p)
1120{ 1320{
1121 cpumask_t tmp; 1321 cpumask_t tmp;
1122 struct sched_domain *sd; 1322 struct sched_domain *sd;
@@ -1139,7 +1339,7 @@ static int wake_idle(int cpu, task_t *p)
1139 return cpu; 1339 return cpu;
1140} 1340}
1141#else 1341#else
1142static inline int wake_idle(int cpu, task_t *p) 1342static inline int wake_idle(int cpu, struct task_struct *p)
1143{ 1343{
1144 return cpu; 1344 return cpu;
1145} 1345}
@@ -1159,15 +1359,15 @@ static inline int wake_idle(int cpu, task_t *p)
1159 * 1359 *
1160 * returns failure only if the task is already active. 1360 * returns failure only if the task is already active.
1161 */ 1361 */
1162static int try_to_wake_up(task_t *p, unsigned int state, int sync) 1362static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1163{ 1363{
1164 int cpu, this_cpu, success = 0; 1364 int cpu, this_cpu, success = 0;
1165 unsigned long flags; 1365 unsigned long flags;
1166 long old_state; 1366 long old_state;
1167 runqueue_t *rq; 1367 struct rq *rq;
1168#ifdef CONFIG_SMP 1368#ifdef CONFIG_SMP
1169 unsigned long load, this_load;
1170 struct sched_domain *sd, *this_sd = NULL; 1369 struct sched_domain *sd, *this_sd = NULL;
1370 unsigned long load, this_load;
1171 int new_cpu; 1371 int new_cpu;
1172#endif 1372#endif
1173 1373
@@ -1221,17 +1421,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1221 1421
1222 if (this_sd->flags & SD_WAKE_AFFINE) { 1422 if (this_sd->flags & SD_WAKE_AFFINE) {
1223 unsigned long tl = this_load; 1423 unsigned long tl = this_load;
1424 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1425
1224 /* 1426 /*
1225 * If sync wakeup then subtract the (maximum possible) 1427 * If sync wakeup then subtract the (maximum possible)
1226 * effect of the currently running task from the load 1428 * effect of the currently running task from the load
1227 * of the current CPU: 1429 * of the current CPU:
1228 */ 1430 */
1229 if (sync) 1431 if (sync)
1230 tl -= SCHED_LOAD_SCALE; 1432 tl -= current->load_weight;
1231 1433
1232 if ((tl <= load && 1434 if ((tl <= load &&
1233 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1435 tl + target_load(cpu, idx) <= tl_per_task) ||
1234 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1436 100*(tl + p->load_weight) <= imbalance*load) {
1235 /* 1437 /*
1236 * This domain has SD_WAKE_AFFINE and 1438 * This domain has SD_WAKE_AFFINE and
1237 * p is cache cold in this domain, and 1439 * p is cache cold in this domain, and
@@ -1315,15 +1517,14 @@ out:
1315 return success; 1517 return success;
1316} 1518}
1317 1519
1318int fastcall wake_up_process(task_t *p) 1520int fastcall wake_up_process(struct task_struct *p)
1319{ 1521{
1320 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1522 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1321 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1523 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1322} 1524}
1323
1324EXPORT_SYMBOL(wake_up_process); 1525EXPORT_SYMBOL(wake_up_process);
1325 1526
1326int fastcall wake_up_state(task_t *p, unsigned int state) 1527int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1327{ 1528{
1328 return try_to_wake_up(p, state, 0); 1529 return try_to_wake_up(p, state, 0);
1329} 1530}
@@ -1332,7 +1533,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1332 * Perform scheduler related setup for a newly forked process p. 1533 * Perform scheduler related setup for a newly forked process p.
1333 * p is forked by current. 1534 * p is forked by current.
1334 */ 1535 */
1335void fastcall sched_fork(task_t *p, int clone_flags) 1536void fastcall sched_fork(struct task_struct *p, int clone_flags)
1336{ 1537{
1337 int cpu = get_cpu(); 1538 int cpu = get_cpu();
1338 1539
@@ -1348,10 +1549,17 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1348 * event cannot wake it up and insert it on the runqueue either. 1549 * event cannot wake it up and insert it on the runqueue either.
1349 */ 1550 */
1350 p->state = TASK_RUNNING; 1551 p->state = TASK_RUNNING;
1552
1553 /*
1554 * Make sure we do not leak PI boosting priority to the child:
1555 */
1556 p->prio = current->normal_prio;
1557
1351 INIT_LIST_HEAD(&p->run_list); 1558 INIT_LIST_HEAD(&p->run_list);
1352 p->array = NULL; 1559 p->array = NULL;
1353#ifdef CONFIG_SCHEDSTATS 1560#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1354 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1561 if (unlikely(sched_info_on()))
1562 memset(&p->sched_info, 0, sizeof(p->sched_info));
1355#endif 1563#endif
1356#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1564#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1357 p->oncpu = 0; 1565 p->oncpu = 0;
@@ -1394,11 +1602,11 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1394 * that must be done for every newly created context, then puts the task 1602 * that must be done for every newly created context, then puts the task
1395 * on the runqueue and wakes it. 1603 * on the runqueue and wakes it.
1396 */ 1604 */
1397void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) 1605void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1398{ 1606{
1607 struct rq *rq, *this_rq;
1399 unsigned long flags; 1608 unsigned long flags;
1400 int this_cpu, cpu; 1609 int this_cpu, cpu;
1401 runqueue_t *rq, *this_rq;
1402 1610
1403 rq = task_rq_lock(p, &flags); 1611 rq = task_rq_lock(p, &flags);
1404 BUG_ON(p->state != TASK_RUNNING); 1612 BUG_ON(p->state != TASK_RUNNING);
@@ -1427,10 +1635,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1427 __activate_task(p, rq); 1635 __activate_task(p, rq);
1428 else { 1636 else {
1429 p->prio = current->prio; 1637 p->prio = current->prio;
1638 p->normal_prio = current->normal_prio;
1430 list_add_tail(&p->run_list, &current->run_list); 1639 list_add_tail(&p->run_list, &current->run_list);
1431 p->array = current->array; 1640 p->array = current->array;
1432 p->array->nr_active++; 1641 p->array->nr_active++;
1433 rq->nr_running++; 1642 inc_nr_running(p, rq);
1434 } 1643 }
1435 set_need_resched(); 1644 set_need_resched();
1436 } else 1645 } else
@@ -1477,10 +1686,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1477 * artificially, because any timeslice recovered here 1686 * artificially, because any timeslice recovered here
1478 * was given away by the parent in the first place.) 1687 * was given away by the parent in the first place.)
1479 */ 1688 */
1480void fastcall sched_exit(task_t *p) 1689void fastcall sched_exit(struct task_struct *p)
1481{ 1690{
1482 unsigned long flags; 1691 unsigned long flags;
1483 runqueue_t *rq; 1692 struct rq *rq;
1484 1693
1485 /* 1694 /*
1486 * If the child was a (relative-) CPU hog then decrease 1695 * If the child was a (relative-) CPU hog then decrease
@@ -1511,7 +1720,7 @@ void fastcall sched_exit(task_t *p)
1511 * prepare_task_switch sets up locking and calls architecture specific 1720 * prepare_task_switch sets up locking and calls architecture specific
1512 * hooks. 1721 * hooks.
1513 */ 1722 */
1514static inline void prepare_task_switch(runqueue_t *rq, task_t *next) 1723static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1515{ 1724{
1516 prepare_lock_switch(rq, next); 1725 prepare_lock_switch(rq, next);
1517 prepare_arch_switch(next); 1726 prepare_arch_switch(next);
@@ -1532,7 +1741,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1532 * with the lock held can cause deadlocks; see schedule() for 1741 * with the lock held can cause deadlocks; see schedule() for
1533 * details.) 1742 * details.)
1534 */ 1743 */
1535static inline void finish_task_switch(runqueue_t *rq, task_t *prev) 1744static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1536 __releases(rq->lock) 1745 __releases(rq->lock)
1537{ 1746{
1538 struct mm_struct *mm = rq->prev_mm; 1747 struct mm_struct *mm = rq->prev_mm;
@@ -1570,10 +1779,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1570 * schedule_tail - first thing a freshly forked thread must call. 1779 * schedule_tail - first thing a freshly forked thread must call.
1571 * @prev: the thread we just switched away from. 1780 * @prev: the thread we just switched away from.
1572 */ 1781 */
1573asmlinkage void schedule_tail(task_t *prev) 1782asmlinkage void schedule_tail(struct task_struct *prev)
1574 __releases(rq->lock) 1783 __releases(rq->lock)
1575{ 1784{
1576 runqueue_t *rq = this_rq(); 1785 struct rq *rq = this_rq();
1786
1577 finish_task_switch(rq, prev); 1787 finish_task_switch(rq, prev);
1578#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1788#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1579 /* In this case, finish_task_switch does not reenable preemption */ 1789 /* In this case, finish_task_switch does not reenable preemption */
@@ -1587,8 +1797,9 @@ asmlinkage void schedule_tail(task_t *prev)
1587 * context_switch - switch to the new MM and the new 1797 * context_switch - switch to the new MM and the new
1588 * thread's register state. 1798 * thread's register state.
1589 */ 1799 */
1590static inline 1800static inline struct task_struct *
1591task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1801context_switch(struct rq *rq, struct task_struct *prev,
1802 struct task_struct *next)
1592{ 1803{
1593 struct mm_struct *mm = next->mm; 1804 struct mm_struct *mm = next->mm;
1594 struct mm_struct *oldmm = prev->active_mm; 1805 struct mm_struct *oldmm = prev->active_mm;
@@ -1605,6 +1816,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1605 WARN_ON(rq->prev_mm); 1816 WARN_ON(rq->prev_mm);
1606 rq->prev_mm = oldmm; 1817 rq->prev_mm = oldmm;
1607 } 1818 }
1819 /*
1820 * The runqueue lock will be released by the next
1821 * task (which is an invalid locking op, but in the case
1822 * of the scheduler it's an obvious special-case), so we
1823 * do an early lockdep release here:
1824 */
1825#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1826 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1827#endif
1608 1828
1609 /* Here we just switch the register state and the stack. */ 1829 /* Here we just switch the register state and the stack. */
1610 switch_to(prev, next, prev); 1830 switch_to(prev, next, prev);
@@ -1648,7 +1868,8 @@ unsigned long nr_uninterruptible(void)
1648 1868
1649unsigned long long nr_context_switches(void) 1869unsigned long long nr_context_switches(void)
1650{ 1870{
1651 unsigned long long i, sum = 0; 1871 int i;
1872 unsigned long long sum = 0;
1652 1873
1653 for_each_possible_cpu(i) 1874 for_each_possible_cpu(i)
1654 sum += cpu_rq(i)->nr_switches; 1875 sum += cpu_rq(i)->nr_switches;
@@ -1684,15 +1905,21 @@ unsigned long nr_active(void)
1684#ifdef CONFIG_SMP 1905#ifdef CONFIG_SMP
1685 1906
1686/* 1907/*
1908 * Is this task likely cache-hot:
1909 */
1910static inline int
1911task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
1912{
1913 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
1914}
1915
1916/*
1687 * double_rq_lock - safely lock two runqueues 1917 * double_rq_lock - safely lock two runqueues
1688 * 1918 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1692 * Note this does not disable interrupts like task_rq_lock, 1919 * Note this does not disable interrupts like task_rq_lock,
1693 * you need to do so manually before calling. 1920 * you need to do so manually before calling.
1694 */ 1921 */
1695static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1922static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1696 __acquires(rq1->lock) 1923 __acquires(rq1->lock)
1697 __acquires(rq2->lock) 1924 __acquires(rq2->lock)
1698{ 1925{
@@ -1700,7 +1927,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1700 spin_lock(&rq1->lock); 1927 spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */ 1928 __acquire(rq2->lock); /* Fake it out ;) */
1702 } else { 1929 } else {
1703 if (rq1->cpu < rq2->cpu) { 1930 if (rq1 < rq2) {
1704 spin_lock(&rq1->lock); 1931 spin_lock(&rq1->lock);
1705 spin_lock(&rq2->lock); 1932 spin_lock(&rq2->lock);
1706 } else { 1933 } else {
@@ -1716,7 +1943,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1716 * Note this does not restore interrupts like task_rq_unlock, 1943 * Note this does not restore interrupts like task_rq_unlock,
1717 * you need to do so manually after calling. 1944 * you need to do so manually after calling.
1718 */ 1945 */
1719static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1946static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1720 __releases(rq1->lock) 1947 __releases(rq1->lock)
1721 __releases(rq2->lock) 1948 __releases(rq2->lock)
1722{ 1949{
@@ -1730,13 +1957,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1730/* 1957/*
1731 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1958 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1732 */ 1959 */
1733static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1960static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1734 __releases(this_rq->lock) 1961 __releases(this_rq->lock)
1735 __acquires(busiest->lock) 1962 __acquires(busiest->lock)
1736 __acquires(this_rq->lock) 1963 __acquires(this_rq->lock)
1737{ 1964{
1738 if (unlikely(!spin_trylock(&busiest->lock))) { 1965 if (unlikely(!spin_trylock(&busiest->lock))) {
1739 if (busiest->cpu < this_rq->cpu) { 1966 if (busiest < this_rq) {
1740 spin_unlock(&this_rq->lock); 1967 spin_unlock(&this_rq->lock);
1741 spin_lock(&busiest->lock); 1968 spin_lock(&busiest->lock);
1742 spin_lock(&this_rq->lock); 1969 spin_lock(&this_rq->lock);
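
With rq->cpu gone, both double_rq_lock() and double_lock_balance() above fall back to the classic ABBA-avoidance idiom of locking in address order. A generic sketch of that idiom, using pthread spinlocks rather than the kernel's spinlock_t:

#include <pthread.h>
#include <stdio.h>

/*
 * Lock two objects in a globally consistent order -- their addresses --
 * so that two threads locking the same pair from opposite ends cannot
 * deadlock. Generic illustration only, not kernel code.
 */
static void lock_pair(pthread_spinlock_t *a, pthread_spinlock_t *b)
{
	if (a == b) {
		pthread_spin_lock(a);
	} else if (a < b) {
		pthread_spin_lock(a);
		pthread_spin_lock(b);
	} else {
		pthread_spin_lock(b);
		pthread_spin_lock(a);
	}
}

static void unlock_pair(pthread_spinlock_t *a, pthread_spinlock_t *b)
{
	pthread_spin_unlock(a);
	if (a != b)
		pthread_spin_unlock(b);
}

int main(void)
{
	pthread_spinlock_t x, y;

	pthread_spin_init(&x, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&y, PTHREAD_PROCESS_PRIVATE);
	lock_pair(&x, &y);	/* the same order is taken ... */
	unlock_pair(&x, &y);
	lock_pair(&y, &x);	/* ... no matter how the pair is passed */
	unlock_pair(&y, &x);
	puts("no deadlock");
	return 0;
}

Two CPUs locking the same pair from opposite ends always contend on the lower-addressed lock first, so neither can hold one lock while waiting forever for the other.
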
@@ -1751,11 +1978,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1751 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1978 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1752 * the cpu_allowed mask is restored. 1979 * the cpu_allowed mask is restored.
1753 */ 1980 */
1754static void sched_migrate_task(task_t *p, int dest_cpu) 1981static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1755{ 1982{
1756 migration_req_t req; 1983 struct migration_req req;
1757 runqueue_t *rq;
1758 unsigned long flags; 1984 unsigned long flags;
1985 struct rq *rq;
1759 1986
1760 rq = task_rq_lock(p, &flags); 1987 rq = task_rq_lock(p, &flags);
1761 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1988 if (!cpu_isset(dest_cpu, p->cpus_allowed)
@@ -1766,11 +1993,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
1766 if (migrate_task(p, dest_cpu, &req)) { 1993 if (migrate_task(p, dest_cpu, &req)) {
1767 /* Need to wait for migration thread (might exit: take ref). */ 1994 /* Need to wait for migration thread (might exit: take ref). */
1768 struct task_struct *mt = rq->migration_thread; 1995 struct task_struct *mt = rq->migration_thread;
1996
1769 get_task_struct(mt); 1997 get_task_struct(mt);
1770 task_rq_unlock(rq, &flags); 1998 task_rq_unlock(rq, &flags);
1771 wake_up_process(mt); 1999 wake_up_process(mt);
1772 put_task_struct(mt); 2000 put_task_struct(mt);
1773 wait_for_completion(&req.done); 2001 wait_for_completion(&req.done);
2002
1774 return; 2003 return;
1775 } 2004 }
1776out: 2005out:
@@ -1794,14 +2023,14 @@ void sched_exec(void)
1794 * pull_task - move a task from a remote runqueue to the local runqueue. 2023 * pull_task - move a task from a remote runqueue to the local runqueue.
1795 * Both runqueues must be locked. 2024 * Both runqueues must be locked.
1796 */ 2025 */
1797static 2026static void pull_task(struct rq *src_rq, struct prio_array *src_array,
1798void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 2027 struct task_struct *p, struct rq *this_rq,
1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 2028 struct prio_array *this_array, int this_cpu)
1800{ 2029{
1801 dequeue_task(p, src_array); 2030 dequeue_task(p, src_array);
1802 src_rq->nr_running--; 2031 dec_nr_running(p, src_rq);
1803 set_task_cpu(p, this_cpu); 2032 set_task_cpu(p, this_cpu);
1804 this_rq->nr_running++; 2033 inc_nr_running(p, this_rq);
1805 enqueue_task(p, this_array); 2034 enqueue_task(p, this_array);
1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2035 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1807 + this_rq->timestamp_last_tick; 2036 + this_rq->timestamp_last_tick;
@@ -1817,7 +2046,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1817 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2046 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1818 */ 2047 */
1819static 2048static
1820int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 2049int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1821 struct sched_domain *sd, enum idle_type idle, 2050 struct sched_domain *sd, enum idle_type idle,
1822 int *all_pinned) 2051 int *all_pinned)
1823{ 2052{
@@ -1848,26 +2077,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1848 return 1; 2077 return 1;
1849} 2078}
1850 2079
2080#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2081
1851/* 2082/*
1852 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2083 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1853 * as part of a balancing operation within "domain". Returns the number of 2084 * load from busiest to this_rq, as part of a balancing operation within
1854 * tasks moved. 2085 * "domain". Returns the number of tasks moved.
1855 * 2086 *
1856 * Called with both runqueues locked. 2087 * Called with both runqueues locked.
1857 */ 2088 */
1858static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2089static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1859 unsigned long max_nr_move, struct sched_domain *sd, 2090 unsigned long max_nr_move, unsigned long max_load_move,
1860 enum idle_type idle, int *all_pinned) 2091 struct sched_domain *sd, enum idle_type idle,
2092 int *all_pinned)
1861{ 2093{
1862 prio_array_t *array, *dst_array; 2094 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2095 best_prio_seen, skip_for_load;
2096 struct prio_array *array, *dst_array;
1863 struct list_head *head, *curr; 2097 struct list_head *head, *curr;
1864 int idx, pulled = 0, pinned = 0; 2098 struct task_struct *tmp;
1865 task_t *tmp; 2099 long rem_load_move;
1866 2100
1867 if (max_nr_move == 0) 2101 if (max_nr_move == 0 || max_load_move == 0)
1868 goto out; 2102 goto out;
1869 2103
2104 rem_load_move = max_load_move;
1870 pinned = 1; 2105 pinned = 1;
2106 this_best_prio = rq_best_prio(this_rq);
2107 best_prio = rq_best_prio(busiest);
2108 /*
2109 * Enable handling of the case where there is more than one task
2110 * with the best priority. If the currently running task is one
2111 * of those with prio==best_prio we know it won't be moved
2112 * and therefore it's safe to override the skip (based on load) of
2113 * any task we find with that prio.
2114 */
2115 best_prio_seen = best_prio == busiest->curr->prio;
1871 2116
1872 /* 2117 /*
1873 * We first consider expired tasks. Those will likely not be 2118 * We first consider expired tasks. Those will likely not be
@@ -1903,11 +2148,22 @@ skip_bitmap:
1903 head = array->queue + idx; 2148 head = array->queue + idx;
1904 curr = head->prev; 2149 curr = head->prev;
1905skip_queue: 2150skip_queue:
1906 tmp = list_entry(curr, task_t, run_list); 2151 tmp = list_entry(curr, struct task_struct, run_list);
1907 2152
1908 curr = curr->prev; 2153 curr = curr->prev;
1909 2154
1910 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2155 /*
 2156 * To help distribute high priority tasks across CPUs we don't
2157 * skip a task if it will be the highest priority task (i.e. smallest
2158 * prio value) on its new queue regardless of its load weight
2159 */
2160 skip_for_load = tmp->load_weight > rem_load_move;
2161 if (skip_for_load && idx < this_best_prio)
2162 skip_for_load = !best_prio_seen && idx == best_prio;
2163 if (skip_for_load ||
2164 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2165
2166 best_prio_seen |= idx == best_prio;
1911 if (curr != head) 2167 if (curr != head)
1912 goto skip_queue; 2168 goto skip_queue;
1913 idx++; 2169 idx++;
@@ -1921,9 +2177,15 @@ skip_queue:
1921 2177
1922 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2178 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1923 pulled++; 2179 pulled++;
2180 rem_load_move -= tmp->load_weight;
1924 2181
1925 /* We only want to steal up to the prescribed number of tasks. */ 2182 /*
1926 if (pulled < max_nr_move) { 2183 * We only want to steal up to the prescribed number of tasks
2184 * and the prescribed amount of weighted load.
2185 */
2186 if (pulled < max_nr_move && rem_load_move > 0) {
2187 if (idx < this_best_prio)
2188 this_best_prio = idx;
1927 if (curr != head) 2189 if (curr != head)
1928 goto skip_queue; 2190 goto skip_queue;
1929 idx++; 2191 idx++;
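The new skip logic above weighs a candidate's load_weight against the remaining load budget, but deliberately does not skip a heavy task that would become the highest-priority task on the destination; best_prio_seen guards the one exception so that a single unmovable best-prio task cannot justify the override forever. A standalone predicate restating that decision (userspace sketch; names follow the hunk, the sample values are made up):

        #include <stdio.h>

        /*
         * Decide whether to skip a candidate task during move_tasks().
         * idx            - priority of the candidate (smaller == higher priority)
         * load_weight    - candidate's weighted load
         * rem_load_move  - weighted load we are still allowed to move
         * this_best_prio - best priority already runnable on the destination
         * best_prio      - best priority on the busiest runqueue
         * best_prio_seen - set once a best_prio task has been seen (or is running)
         */
        static int skip_candidate(int idx, unsigned long load_weight,
                                  long rem_load_move, int this_best_prio,
                                  int best_prio, int *best_prio_seen)
        {
                int skip_for_load = (long)load_weight > rem_load_move;

                if (skip_for_load && idx < this_best_prio)
                        skip_for_load = !*best_prio_seen && idx == best_prio;

                if (skip_for_load) {
                        *best_prio_seen |= (idx == best_prio);
                        return 1;       /* too heavy, leave it where it is */
                }
                return 0;               /* pull it */
        }

        int main(void)
        {
                int seen = 0;

                /* First heavy best-prio candidate: still skipped (best_prio not yet seen). */
                printf("%d\n", skip_candidate(100, 2048, 1024, 120, 100, &seen)); /* 1 */
                /* Second one at the same prio: the override kicks in and it is pulled. */
                printf("%d\n", skip_candidate(100, 2048, 1024, 120, 100, &seen)); /* 0 */
                return 0;
        }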
@@ -1944,8 +2206,8 @@ out:
1944 2206
1945/* 2207/*
1946 * find_busiest_group finds and returns the busiest CPU group within the 2208 * find_busiest_group finds and returns the busiest CPU group within the
1947 * domain. It calculates and returns the number of tasks which should be 2209 * domain. It calculates and returns the amount of weighted load which
1948 * moved to restore balance via the imbalance parameter. 2210 * should be moved to restore balance via the imbalance parameter.
1949 */ 2211 */
1950static struct sched_group * 2212static struct sched_group *
1951find_busiest_group(struct sched_domain *sd, int this_cpu, 2213find_busiest_group(struct sched_domain *sd, int this_cpu,
@@ -1954,9 +2216,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1954 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2216 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1955 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2217 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1956 unsigned long max_pull; 2218 unsigned long max_pull;
2219 unsigned long busiest_load_per_task, busiest_nr_running;
2220 unsigned long this_load_per_task, this_nr_running;
1957 int load_idx; 2221 int load_idx;
2222#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2223 int power_savings_balance = 1;
2224 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2225 unsigned long min_nr_running = ULONG_MAX;
2226 struct sched_group *group_min = NULL, *group_leader = NULL;
2227#endif
1958 2228
1959 max_load = this_load = total_load = total_pwr = 0; 2229 max_load = this_load = total_load = total_pwr = 0;
2230 busiest_load_per_task = busiest_nr_running = 0;
2231 this_load_per_task = this_nr_running = 0;
1960 if (idle == NOT_IDLE) 2232 if (idle == NOT_IDLE)
1961 load_idx = sd->busy_idx; 2233 load_idx = sd->busy_idx;
1962 else if (idle == NEWLY_IDLE) 2234 else if (idle == NEWLY_IDLE)
@@ -1965,16 +2237,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1965 load_idx = sd->idle_idx; 2237 load_idx = sd->idle_idx;
1966 2238
1967 do { 2239 do {
1968 unsigned long load; 2240 unsigned long load, group_capacity;
1969 int local_group; 2241 int local_group;
1970 int i; 2242 int i;
2243 unsigned long sum_nr_running, sum_weighted_load;
1971 2244
1972 local_group = cpu_isset(this_cpu, group->cpumask); 2245 local_group = cpu_isset(this_cpu, group->cpumask);
1973 2246
1974 /* Tally up the load of all CPUs in the group */ 2247 /* Tally up the load of all CPUs in the group */
1975 avg_load = 0; 2248 sum_weighted_load = sum_nr_running = avg_load = 0;
1976 2249
1977 for_each_cpu_mask(i, group->cpumask) { 2250 for_each_cpu_mask(i, group->cpumask) {
2251 struct rq *rq = cpu_rq(i);
2252
1978 if (*sd_idle && !idle_cpu(i)) 2253 if (*sd_idle && !idle_cpu(i))
1979 *sd_idle = 0; 2254 *sd_idle = 0;
1980 2255
@@ -1985,6 +2260,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1985 load = source_load(i, load_idx); 2260 load = source_load(i, load_idx);
1986 2261
1987 avg_load += load; 2262 avg_load += load;
2263 sum_nr_running += rq->nr_running;
2264 sum_weighted_load += rq->raw_weighted_load;
1988 } 2265 }
1989 2266
1990 total_load += avg_load; 2267 total_load += avg_load;
@@ -1993,17 +2270,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1993 /* Adjust by relative CPU power of the group */ 2270 /* Adjust by relative CPU power of the group */
1994 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2271 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1995 2272
2273 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2274
1996 if (local_group) { 2275 if (local_group) {
1997 this_load = avg_load; 2276 this_load = avg_load;
1998 this = group; 2277 this = group;
1999 } else if (avg_load > max_load) { 2278 this_nr_running = sum_nr_running;
2279 this_load_per_task = sum_weighted_load;
2280 } else if (avg_load > max_load &&
2281 sum_nr_running > group_capacity) {
2000 max_load = avg_load; 2282 max_load = avg_load;
2001 busiest = group; 2283 busiest = group;
2284 busiest_nr_running = sum_nr_running;
2285 busiest_load_per_task = sum_weighted_load;
2002 } 2286 }
2287
2288#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2289 /*
2290 * Busy processors will not participate in power savings
2291 * balance.
2292 */
2293 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2294 goto group_next;
2295
2296 /*
2297 * If the local group is idle or completely loaded
2298 * no need to do power savings balance at this domain
2299 */
2300 if (local_group && (this_nr_running >= group_capacity ||
2301 !this_nr_running))
2302 power_savings_balance = 0;
2303
2304 /*
2305 * If a group is already running at full capacity or idle,
2306 * don't include that group in power savings calculations
2307 */
2308 if (!power_savings_balance || sum_nr_running >= group_capacity
2309 || !sum_nr_running)
2310 goto group_next;
2311
2312 /*
2313 * Calculate the group which has the least non-idle load.
2314 * This is the group from where we need to pick up the load
2315 * for saving power
2316 */
2317 if ((sum_nr_running < min_nr_running) ||
2318 (sum_nr_running == min_nr_running &&
2319 first_cpu(group->cpumask) <
2320 first_cpu(group_min->cpumask))) {
2321 group_min = group;
2322 min_nr_running = sum_nr_running;
2323 min_load_per_task = sum_weighted_load /
2324 sum_nr_running;
2325 }
2326
2327 /*
 2328 * Calculate the group which is nearly at its
 2329 * capacity but still has some space to pick up some load
 2330 * from other groups and save more power
2331 */
2332 if (sum_nr_running <= group_capacity - 1) {
2333 if (sum_nr_running > leader_nr_running ||
2334 (sum_nr_running == leader_nr_running &&
2335 first_cpu(group->cpumask) >
2336 first_cpu(group_leader->cpumask))) {
2337 group_leader = group;
2338 leader_nr_running = sum_nr_running;
2339 }
2340 }
2341group_next:
2342#endif
2003 group = group->next; 2343 group = group->next;
2004 } while (group != sd->groups); 2344 } while (group != sd->groups);
2005 2345
2006 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2346 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2007 goto out_balanced; 2347 goto out_balanced;
2008 2348
2009 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2349 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
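Under CONFIG_SCHED_MC/CONFIG_SCHED_SMT the group loop above also tracks group_min, the non-idle group running the fewest tasks (the one to drain for power savings), and group_leader, the group closest to but still below its capacity (the one able to absorb that load). A compact userspace restatement of just those two selections (sketch only: the power_savings_balance and local-group checks are dropped, a group is reduced to counters, and first_cpu() becomes a plain field):

        #include <limits.h>
        #include <stdio.h>

        struct group {
                int first_cpu;              /* stands in for first_cpu(group->cpumask) */
                unsigned long nr_running;   /* stands in for sum_nr_running */
                unsigned long capacity;     /* stands in for group_capacity */
        };

        int main(void)
        {
                struct group groups[] = {
                        { .first_cpu = 0, .nr_running = 1, .capacity = 2 },
                        { .first_cpu = 2, .nr_running = 3, .capacity = 4 },
                        { .first_cpu = 4, .nr_running = 2, .capacity = 4 },
                };
                unsigned long min_nr_running = ULONG_MAX, leader_nr_running = 0;
                const struct group *group_min = NULL, *group_leader = NULL;
                unsigned int i;

                for (i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
                        const struct group *g = &groups[i];

                        /* fully loaded or idle groups are not power-savings candidates */
                        if (!g->nr_running || g->nr_running >= g->capacity)
                                continue;

                        /* group_min: least non-idle load, ties broken by lowest first cpu */
                        if (g->nr_running < min_nr_running ||
                            (g->nr_running == min_nr_running &&
                             g->first_cpu < group_min->first_cpu)) {
                                group_min = g;
                                min_nr_running = g->nr_running;
                        }

                        /* group_leader: nearest to capacity but with room for one more */
                        if (g->nr_running <= g->capacity - 1 &&
                            (g->nr_running > leader_nr_running ||
                             (g->nr_running == leader_nr_running &&
                              g->first_cpu > group_leader->first_cpu))) {
                                group_leader = g;
                                leader_nr_running = g->nr_running;
                        }
                }

                printf("group_min first_cpu=%d, group_leader first_cpu=%d\n",
                       group_min ? group_min->first_cpu : -1,
                       group_leader ? group_leader->first_cpu : -1);
                return 0;
        }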
@@ -2012,6 +2352,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2012 100*max_load <= sd->imbalance_pct*this_load) 2352 100*max_load <= sd->imbalance_pct*this_load)
2013 goto out_balanced; 2353 goto out_balanced;
2014 2354
2355 busiest_load_per_task /= busiest_nr_running;
2015 /* 2356 /*
2016 * We're trying to get all the cpus to the average_load, so we don't 2357 * We're trying to get all the cpus to the average_load, so we don't
2017 * want to push ourselves above the average load, nor do we wish to 2358 * want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2364,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2023 * by pulling tasks to us. Be careful of negative numbers as they'll 2364 * by pulling tasks to us. Be careful of negative numbers as they'll
2024 * appear as very large values with unsigned longs. 2365 * appear as very large values with unsigned longs.
2025 */ 2366 */
2367 if (max_load <= busiest_load_per_task)
2368 goto out_balanced;
2369
2370 /*
2371 * In the presence of smp nice balancing, certain scenarios can have
 2372 * max load less than avg load (as we skip the groups at or below
 2373 * their cpu_power while calculating max_load)
2374 */
2375 if (max_load < avg_load) {
2376 *imbalance = 0;
2377 goto small_imbalance;
2378 }
2026 2379
2027 /* Don't want to pull so many tasks that a group would go idle */ 2380 /* Don't want to pull so many tasks that a group would go idle */
2028 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2381 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2029 2382
2030 /* How much load to actually move to equalise the imbalance */ 2383 /* How much load to actually move to equalise the imbalance */
2031 *imbalance = min(max_pull * busiest->cpu_power, 2384 *imbalance = min(max_pull * busiest->cpu_power,
2032 (avg_load - this_load) * this->cpu_power) 2385 (avg_load - this_load) * this->cpu_power)
2033 / SCHED_LOAD_SCALE; 2386 / SCHED_LOAD_SCALE;
2034 2387
2035 if (*imbalance < SCHED_LOAD_SCALE) { 2388 /*
2036 unsigned long pwr_now = 0, pwr_move = 0; 2389 * if *imbalance is less than the average load per runnable task
 2037 unsigned long tmp; 2390 * there is no guarantee that any tasks will be moved so we may
 2391 * need to bump its value to force at least one task to be
 2392 * moved
2393 */
2394 if (*imbalance < busiest_load_per_task) {
2395 unsigned long tmp, pwr_now, pwr_move;
2396 unsigned int imbn;
2397
2398small_imbalance:
2399 pwr_move = pwr_now = 0;
2400 imbn = 2;
2401 if (this_nr_running) {
2402 this_load_per_task /= this_nr_running;
2403 if (busiest_load_per_task > this_load_per_task)
2404 imbn = 1;
2405 } else
2406 this_load_per_task = SCHED_LOAD_SCALE;
2038 2407
2039 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2408 if (max_load - this_load >= busiest_load_per_task * imbn) {
2040 *imbalance = 1; 2409 *imbalance = busiest_load_per_task;
2041 return busiest; 2410 return busiest;
2042 } 2411 }
2043 2412
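The imbalance arithmetic is easier to follow with numbers: max_pull is capped both by the busiest group's distance from the domain average and by how much it can give up before dropping below one average task, and the result is scaled out of the normalised cpu_power domain. A hedged worked example (arbitrary loads; SCHED_LOAD_SCALE is taken as 128 purely for illustration, and the full small_imbalance path, which also weighs pwr_now against pwr_move, is collapsed to its simplest outcome):

        #include <stdio.h>

        #define SCHED_LOAD_SCALE 128UL          /* illustrative value only */

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                unsigned long max_load = 512, this_load = 128, avg_load = 320;
                unsigned long busiest_load_per_task = 128;
                unsigned long busiest_power = SCHED_LOAD_SCALE;
                unsigned long this_power = SCHED_LOAD_SCALE;
                unsigned long max_pull, imbalance;

                /* don't pull so much that the busiest group would go idle */
                max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);

                /* move the smaller of "what busiest can give" and "what we can take" */
                imbalance = min_ul(max_pull * busiest_power,
                                   (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

                /* if that is less than one average task, bump it so something moves */
                if (imbalance < busiest_load_per_task)
                        imbalance = busiest_load_per_task;

                printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
                return 0;
        }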
@@ -2047,39 +2416,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2047 * moving them. 2416 * moving them.
2048 */ 2417 */
2049 2418
2050 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2419 pwr_now += busiest->cpu_power *
2051 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2420 min(busiest_load_per_task, max_load);
2421 pwr_now += this->cpu_power *
2422 min(this_load_per_task, this_load);
2052 pwr_now /= SCHED_LOAD_SCALE; 2423 pwr_now /= SCHED_LOAD_SCALE;
2053 2424
2054 /* Amount of load we'd subtract */ 2425 /* Amount of load we'd subtract */
2055 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2426 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2056 if (max_load > tmp) 2427 if (max_load > tmp)
2057 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2428 pwr_move += busiest->cpu_power *
2058 max_load - tmp); 2429 min(busiest_load_per_task, max_load - tmp);
2059 2430
2060 /* Amount of load we'd add */ 2431 /* Amount of load we'd add */
2061 if (max_load*busiest->cpu_power < 2432 if (max_load*busiest->cpu_power <
2062 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2433 busiest_load_per_task*SCHED_LOAD_SCALE)
2063 tmp = max_load*busiest->cpu_power/this->cpu_power; 2434 tmp = max_load*busiest->cpu_power/this->cpu_power;
2064 else 2435 else
2065 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2436 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2066 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2437 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2067 pwr_move /= SCHED_LOAD_SCALE; 2438 pwr_move /= SCHED_LOAD_SCALE;
2068 2439
2069 /* Move if we gain throughput */ 2440 /* Move if we gain throughput */
2070 if (pwr_move <= pwr_now) 2441 if (pwr_move <= pwr_now)
2071 goto out_balanced; 2442 goto out_balanced;
2072 2443
2073 *imbalance = 1; 2444 *imbalance = busiest_load_per_task;
2074 return busiest;
2075 } 2445 }
2076 2446
2077 /* Get rid of the scaling factor, rounding down as we divide */
2078 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2079 return busiest; 2447 return busiest;
2080 2448
2081out_balanced: 2449out_balanced:
2450#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2451 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2452 goto ret;
2082 2453
2454 if (this == group_leader && group_leader != group_min) {
2455 *imbalance = min_load_per_task;
2456 return group_min;
2457 }
2458ret:
2459#endif
2083 *imbalance = 0; 2460 *imbalance = 0;
2084 return NULL; 2461 return NULL;
2085} 2462}
@@ -2087,19 +2464,23 @@ out_balanced:
2087/* 2464/*
2088 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2465 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2089 */ 2466 */
2090static runqueue_t *find_busiest_queue(struct sched_group *group, 2467static struct rq *
2091 enum idle_type idle) 2468find_busiest_queue(struct sched_group *group, enum idle_type idle,
2469 unsigned long imbalance)
2092{ 2470{
2093 unsigned long load, max_load = 0; 2471 struct rq *busiest = NULL, *rq;
2094 runqueue_t *busiest = NULL; 2472 unsigned long max_load = 0;
2095 int i; 2473 int i;
2096 2474
2097 for_each_cpu_mask(i, group->cpumask) { 2475 for_each_cpu_mask(i, group->cpumask) {
2098 load = source_load(i, 0); 2476 rq = cpu_rq(i);
2099 2477
2100 if (load > max_load) { 2478 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
2101 max_load = load; 2479 continue;
2102 busiest = cpu_rq(i); 2480
2481 if (rq->raw_weighted_load > max_load) {
2482 max_load = rq->raw_weighted_load;
2483 busiest = rq;
2103 } 2484 }
2104 } 2485 }
2105 2486
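find_busiest_queue() now selects by raw_weighted_load rather than source_load(), and skips any runqueue whose single task already outweighs the requested imbalance, since moving its only task could not reduce the imbalance. A userspace sketch of that selection over made-up per-CPU samples:

        #include <stdio.h>

        struct rq_sample {
                unsigned long nr_running;
                unsigned long raw_weighted_load;
        };

        int main(void)
        {
                struct rq_sample rqs[] = {
                        { .nr_running = 1, .raw_weighted_load = 4096 }, /* one huge task */
                        { .nr_running = 3, .raw_weighted_load = 3072 },
                        { .nr_running = 2, .raw_weighted_load = 2048 },
                };
                unsigned long imbalance = 1024, max_load = 0;
                int i, busiest = -1;

                for (i = 0; i < 3; i++) {
                        /* a lone task heavier than the imbalance cannot usefully move */
                        if (rqs[i].nr_running == 1 &&
                            rqs[i].raw_weighted_load > imbalance)
                                continue;

                        if (rqs[i].raw_weighted_load > max_load) {
                                max_load = rqs[i].raw_weighted_load;
                                busiest = i;
                        }
                }

                printf("busiest cpu index: %d (load %lu)\n", busiest, max_load);
                return 0;
        }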
@@ -2112,23 +2493,27 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2112 */ 2493 */
2113#define MAX_PINNED_INTERVAL 512 2494#define MAX_PINNED_INTERVAL 512
2114 2495
2496static inline unsigned long minus_1_or_zero(unsigned long n)
2497{
2498 return n > 0 ? n - 1 : 0;
2499}
2500
2115/* 2501/*
2116 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2502 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2117 * tasks if there is an imbalance. 2503 * tasks if there is an imbalance.
2118 * 2504 *
2119 * Called with this_rq unlocked. 2505 * Called with this_rq unlocked.
2120 */ 2506 */
2121static int load_balance(int this_cpu, runqueue_t *this_rq, 2507static int load_balance(int this_cpu, struct rq *this_rq,
2122 struct sched_domain *sd, enum idle_type idle) 2508 struct sched_domain *sd, enum idle_type idle)
2123{ 2509{
2510 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2124 struct sched_group *group; 2511 struct sched_group *group;
2125 runqueue_t *busiest;
2126 unsigned long imbalance; 2512 unsigned long imbalance;
2127 int nr_moved, all_pinned = 0; 2513 struct rq *busiest;
2128 int active_balance = 0;
2129 int sd_idle = 0;
2130 2514
2131 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2515 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2516 !sched_smt_power_savings)
2132 sd_idle = 1; 2517 sd_idle = 1;
2133 2518
2134 schedstat_inc(sd, lb_cnt[idle]); 2519 schedstat_inc(sd, lb_cnt[idle]);
@@ -2139,7 +2524,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2139 goto out_balanced; 2524 goto out_balanced;
2140 } 2525 }
2141 2526
2142 busiest = find_busiest_queue(group, idle); 2527 busiest = find_busiest_queue(group, idle, imbalance);
2143 if (!busiest) { 2528 if (!busiest) {
2144 schedstat_inc(sd, lb_nobusyq[idle]); 2529 schedstat_inc(sd, lb_nobusyq[idle]);
2145 goto out_balanced; 2530 goto out_balanced;
@@ -2159,7 +2544,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2159 */ 2544 */
2160 double_rq_lock(this_rq, busiest); 2545 double_rq_lock(this_rq, busiest);
2161 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2546 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2162 imbalance, sd, idle, &all_pinned); 2547 minus_1_or_zero(busiest->nr_running),
2548 imbalance, sd, idle, &all_pinned);
2163 double_rq_unlock(this_rq, busiest); 2549 double_rq_unlock(this_rq, busiest);
2164 2550
2165 /* All tasks on this runqueue were pinned by CPU affinity */ 2551 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -2216,7 +2602,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2216 sd->balance_interval *= 2; 2602 sd->balance_interval *= 2;
2217 } 2603 }
2218 2604
2219 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2605 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2606 !sched_smt_power_savings)
2220 return -1; 2607 return -1;
2221 return nr_moved; 2608 return nr_moved;
2222 2609
@@ -2231,7 +2618,8 @@ out_one_pinned:
2231 (sd->balance_interval < sd->max_interval)) 2618 (sd->balance_interval < sd->max_interval))
2232 sd->balance_interval *= 2; 2619 sd->balance_interval *= 2;
2233 2620
2234 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2621 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2622 !sched_smt_power_savings)
2235 return -1; 2623 return -1;
2236 return 0; 2624 return 0;
2237} 2625}
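With the extra argument, load_balance() hands move_tasks() two independent caps: at most busiest->nr_running - 1 tasks (minus_1_or_zero() guards the empty case, so the busiest runqueue is never drained completely) and at most imbalance units of weighted load. A tiny sketch of how the two caps interact (hypothetical task weights):

        #include <stdio.h>

        static unsigned long minus_1_or_zero(unsigned long n)
        {
                return n > 0 ? n - 1 : 0;
        }

        /* Pull tasks until either cap is exhausted; weights[] are per-task load weights. */
        static int move_up_to(unsigned long max_nr_move, long max_load_move,
                              const unsigned long *weights, int navail)
        {
                int moved = 0;

                while (moved < navail && (unsigned long)moved < max_nr_move &&
                       max_load_move > 0) {
                        max_load_move -= weights[moved];
                        moved++;
                }
                return moved;
        }

        int main(void)
        {
                unsigned long weights[] = { 512, 512, 1024, 512 };
                unsigned long nr_running_on_busiest = 4;

                printf("moved %d tasks\n",
                       move_up_to(minus_1_or_zero(nr_running_on_busiest),
                                  900, weights, 4));
                /* stops after 2 tasks even though the count cap allows 3:
                 * the 900-unit load budget runs out first */
                return 0;
        }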
@@ -2243,16 +2631,16 @@ out_one_pinned:
2243 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2631 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2244 * this_rq is locked. 2632 * this_rq is locked.
2245 */ 2633 */
2246static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2634static int
2247 struct sched_domain *sd) 2635load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2248{ 2636{
2249 struct sched_group *group; 2637 struct sched_group *group;
2250 runqueue_t *busiest = NULL; 2638 struct rq *busiest = NULL;
2251 unsigned long imbalance; 2639 unsigned long imbalance;
2252 int nr_moved = 0; 2640 int nr_moved = 0;
2253 int sd_idle = 0; 2641 int sd_idle = 0;
2254 2642
2255 if (sd->flags & SD_SHARE_CPUPOWER) 2643 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2256 sd_idle = 1; 2644 sd_idle = 1;
2257 2645
2258 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2646 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2262,7 +2650,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2262 goto out_balanced; 2650 goto out_balanced;
2263 } 2651 }
2264 2652
2265 busiest = find_busiest_queue(group, NEWLY_IDLE); 2653 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
2266 if (!busiest) { 2654 if (!busiest) {
2267 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2655 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2268 goto out_balanced; 2656 goto out_balanced;
@@ -2277,6 +2665,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2277 /* Attempt to move tasks */ 2665 /* Attempt to move tasks */
2278 double_lock_balance(this_rq, busiest); 2666 double_lock_balance(this_rq, busiest);
2279 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2667 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2668 minus_1_or_zero(busiest->nr_running),
2280 imbalance, sd, NEWLY_IDLE, NULL); 2669 imbalance, sd, NEWLY_IDLE, NULL);
2281 spin_unlock(&busiest->lock); 2670 spin_unlock(&busiest->lock);
2282 } 2671 }
@@ -2292,9 +2681,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2292 2681
2293out_balanced: 2682out_balanced:
2294 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2295 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2684 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2685 !sched_smt_power_savings)
2296 return -1; 2686 return -1;
2297 sd->nr_balance_failed = 0; 2687 sd->nr_balance_failed = 0;
2688
2298 return 0; 2689 return 0;
2299} 2690}
2300 2691
@@ -2302,16 +2693,15 @@ out_balanced:
2302 * idle_balance is called by schedule() if this_cpu is about to become 2693 * idle_balance is called by schedule() if this_cpu is about to become
2303 * idle. Attempts to pull tasks from other CPUs. 2694 * idle. Attempts to pull tasks from other CPUs.
2304 */ 2695 */
2305static void idle_balance(int this_cpu, runqueue_t *this_rq) 2696static void idle_balance(int this_cpu, struct rq *this_rq)
2306{ 2697{
2307 struct sched_domain *sd; 2698 struct sched_domain *sd;
2308 2699
2309 for_each_domain(this_cpu, sd) { 2700 for_each_domain(this_cpu, sd) {
2310 if (sd->flags & SD_BALANCE_NEWIDLE) { 2701 if (sd->flags & SD_BALANCE_NEWIDLE) {
2311 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2702 /* If we've pulled tasks over stop searching: */
2312 /* We've pulled tasks over so stop searching */ 2703 if (load_balance_newidle(this_cpu, this_rq, sd))
2313 break; 2704 break;
2314 }
2315 } 2705 }
2316 } 2706 }
2317} 2707}
@@ -2324,14 +2714,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq)
2324 * 2714 *
2325 * Called with busiest_rq locked. 2715 * Called with busiest_rq locked.
2326 */ 2716 */
2327static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2717static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2328{ 2718{
2329 struct sched_domain *sd;
2330 runqueue_t *target_rq;
2331 int target_cpu = busiest_rq->push_cpu; 2719 int target_cpu = busiest_rq->push_cpu;
2720 struct sched_domain *sd;
2721 struct rq *target_rq;
2332 2722
2723 /* Is there any task to move? */
2333 if (busiest_rq->nr_running <= 1) 2724 if (busiest_rq->nr_running <= 1)
2334 /* no task to move */
2335 return; 2725 return;
2336 2726
2337 target_rq = cpu_rq(target_cpu); 2727 target_rq = cpu_rq(target_cpu);
@@ -2347,21 +2737,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2347 double_lock_balance(busiest_rq, target_rq); 2737 double_lock_balance(busiest_rq, target_rq);
2348 2738
2349 /* Search for an sd spanning us and the target CPU. */ 2739 /* Search for an sd spanning us and the target CPU. */
2350 for_each_domain(target_cpu, sd) 2740 for_each_domain(target_cpu, sd) {
2351 if ((sd->flags & SD_LOAD_BALANCE) && 2741 if ((sd->flags & SD_LOAD_BALANCE) &&
2352 cpu_isset(busiest_cpu, sd->span)) 2742 cpu_isset(busiest_cpu, sd->span))
2353 break; 2743 break;
2744 }
2354 2745
2355 if (unlikely(sd == NULL)) 2746 if (likely(sd)) {
2356 goto out; 2747 schedstat_inc(sd, alb_cnt);
2357
2358 schedstat_inc(sd, alb_cnt);
2359 2748
2360 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2749 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2361 schedstat_inc(sd, alb_pushed); 2750 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
2362 else 2751 NULL))
2363 schedstat_inc(sd, alb_failed); 2752 schedstat_inc(sd, alb_pushed);
2364out: 2753 else
2754 schedstat_inc(sd, alb_failed);
2755 }
2365 spin_unlock(&target_rq->lock); 2756 spin_unlock(&target_rq->lock);
2366} 2757}
2367 2758
@@ -2374,23 +2765,27 @@ out:
2374 * Balancing parameters are set up in arch_init_sched_domains. 2765 * Balancing parameters are set up in arch_init_sched_domains.
2375 */ 2766 */
2376 2767
2377/* Don't have all balancing operations going off at once */ 2768/* Don't have all balancing operations going off at once: */
2378#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2769static inline unsigned long cpu_offset(int cpu)
2770{
2771 return jiffies + cpu * HZ / NR_CPUS;
2772}
2379 2773
2380static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2774static void
2381 enum idle_type idle) 2775rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2382{ 2776{
2383 unsigned long old_load, this_load; 2777 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2384 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2385 struct sched_domain *sd; 2778 struct sched_domain *sd;
2386 int i; 2779 int i, scale;
2780
2781 this_load = this_rq->raw_weighted_load;
2782
2783 /* Update our load: */
2784 for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
2785 unsigned long old_load, new_load;
2387 2786
2388 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2389 /* Update our load */
2390 for (i = 0; i < 3; i++) {
2391 unsigned long new_load = this_load;
2392 int scale = 1 << i;
2393 old_load = this_rq->cpu_load[i]; 2787 old_load = this_rq->cpu_load[i];
2788 new_load = this_load;
2394 /* 2789 /*
2395 * Round up the averaging division if load is increasing. This 2790 * Round up the averaging division if load is increasing. This
2396 * prevents us from getting stuck on 9 if the load is 10, for 2791 * prevents us from getting stuck on 9 if the load is 10, for
@@ -2402,8 +2797,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2402 } 2797 }
2403 2798
2404 for_each_domain(this_cpu, sd) { 2799 for_each_domain(this_cpu, sd) {
2405 unsigned long interval;
2406
2407 if (!(sd->flags & SD_LOAD_BALANCE)) 2800 if (!(sd->flags & SD_LOAD_BALANCE))
2408 continue; 2801 continue;
2409 2802
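rebalance_tick() now feeds raw_weighted_load into the three cpu_load[] estimates, each decaying over a wider window as scale doubles. The update expression itself is elided from the hunk; the sketch below assumes the familiar (old * (scale - 1) + new) / scale average with the round-up mentioned in the comment, which is an assumption about the elided body rather than a quotation of it:

        #include <stdio.h>

        /* One tick of the decaying load average for index i, scale = 1 << i. */
        static unsigned long update_cpu_load(unsigned long old_load,
                                             unsigned long this_load,
                                             unsigned long scale)
        {
                unsigned long new_load = this_load;

                /* Round up if load is increasing, so the average cannot get stuck below it. */
                if (new_load > old_load)
                        new_load += scale - 1;

                return (old_load * (scale - 1) + new_load) / scale;
        }

        int main(void)
        {
                unsigned long load[3] = { 0, 0, 0 };
                unsigned long raw_weighted_load = 10;   /* pretend constant load */
                int tick, i;

                for (tick = 0; tick < 5; tick++)
                        for (i = 0; i < 3; i++)
                                load[i] = update_cpu_load(load[i], raw_weighted_load,
                                                          1UL << i);

                printf("cpu_load = { %lu, %lu, %lu }\n", load[0], load[1], load[2]);
                /* index 0 tracks the instantaneous value; higher indices converge slower */
                return 0;
        }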
@@ -2433,17 +2826,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2433/* 2826/*
2434 * on UP we do not need to balance between CPUs: 2827 * on UP we do not need to balance between CPUs:
2435 */ 2828 */
2436static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2829static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2437{ 2830{
2438} 2831}
2439static inline void idle_balance(int cpu, runqueue_t *rq) 2832static inline void idle_balance(int cpu, struct rq *rq)
2440{ 2833{
2441} 2834}
2442#endif 2835#endif
2443 2836
2444static inline int wake_priority_sleeper(runqueue_t *rq) 2837static inline int wake_priority_sleeper(struct rq *rq)
2445{ 2838{
2446 int ret = 0; 2839 int ret = 0;
2840
2447#ifdef CONFIG_SCHED_SMT 2841#ifdef CONFIG_SCHED_SMT
2448 spin_lock(&rq->lock); 2842 spin_lock(&rq->lock);
2449 /* 2843 /*
@@ -2467,25 +2861,26 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2467 * This is called on clock ticks and on context switches. 2861 * This is called on clock ticks and on context switches.
2468 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2862 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2469 */ 2863 */
2470static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2864static inline void
2471 unsigned long long now) 2865update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2472{ 2866{
2473 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2867 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
2474 p->sched_time += now - last;
2475} 2868}
2476 2869
2477/* 2870/*
2478 * Return current->sched_time plus any more ns on the sched_clock 2871 * Return current->sched_time plus any more ns on the sched_clock
2479 * that have not yet been banked. 2872 * that have not yet been banked.
2480 */ 2873 */
2481unsigned long long current_sched_time(const task_t *tsk) 2874unsigned long long current_sched_time(const struct task_struct *p)
2482{ 2875{
2483 unsigned long long ns; 2876 unsigned long long ns;
2484 unsigned long flags; 2877 unsigned long flags;
2878
2485 local_irq_save(flags); 2879 local_irq_save(flags);
2486 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2880 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
2487 ns = tsk->sched_time + (sched_clock() - ns); 2881 ns = p->sched_time + sched_clock() - ns;
2488 local_irq_restore(flags); 2882 local_irq_restore(flags);
2883
2489 return ns; 2884 return ns;
2490} 2885}
2491 2886
@@ -2499,11 +2894,16 @@ unsigned long long current_sched_time(const task_t *tsk)
2499 * increasing number of running tasks. We also ignore the interactivity 2894 * increasing number of running tasks. We also ignore the interactivity
2500 * if a better static_prio task has expired: 2895 * if a better static_prio task has expired:
2501 */ 2896 */
2502#define EXPIRED_STARVING(rq) \ 2897static inline int expired_starving(struct rq *rq)
2503 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2898{
2504 (jiffies - (rq)->expired_timestamp >= \ 2899 if (rq->curr->static_prio > rq->best_expired_prio)
2505 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2900 return 1;
2506 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2901 if (!STARVATION_LIMIT || !rq->expired_timestamp)
2902 return 0;
2903 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
2904 return 1;
2905 return 0;
2906}
2507 2907
2508/* 2908/*
2509 * Account user cpu time to a process. 2909 * Account user cpu time to a process.
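The EXPIRED_STARVING() macro becomes a readable predicate: interactivity is ignored once a better static priority sits in the expired array, or once the expired array has waited longer than STARVATION_LIMIT scaled by the number of runnable tasks. A standalone version with the same shape (userspace sketch; jiffies and the limit become plain parameters):

        #include <stdio.h>

        static int expired_starving(unsigned long starvation_limit,
                                    unsigned long jiffies_now,
                                    unsigned long expired_timestamp,
                                    unsigned long nr_running,
                                    int curr_static_prio,
                                    int best_expired_prio)
        {
                /* a better static priority is already sitting in the expired array */
                if (curr_static_prio > best_expired_prio)
                        return 1;
                /* starvation checking disabled, or nothing has expired yet */
                if (!starvation_limit || !expired_timestamp)
                        return 0;
                /* expired tasks waited longer than the per-task starvation budget */
                if (jiffies_now - expired_timestamp > starvation_limit * nr_running)
                        return 1;
                return 0;
        }

        int main(void)
        {
                /* hypothetical values: 4 runnable tasks, limit of 100 ticks per task */
                printf("%d\n", expired_starving(100, 1500, 1000, 4, 120, 125)); /* 1: starved past 400 ticks */
                printf("%d\n", expired_starving(100, 1300, 1000, 4, 120, 120)); /* 0: still within budget */
                return 0;
        }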
@@ -2536,7 +2936,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2536 cputime_t cputime) 2936 cputime_t cputime)
2537{ 2937{
2538 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2938 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2539 runqueue_t *rq = this_rq(); 2939 struct rq *rq = this_rq();
2540 cputime64_t tmp; 2940 cputime64_t tmp;
2541 2941
2542 p->stime = cputime_add(p->stime, cputime); 2942 p->stime = cputime_add(p->stime, cputime);
@@ -2566,7 +2966,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
2566{ 2966{
2567 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2967 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2568 cputime64_t tmp = cputime_to_cputime64(steal); 2968 cputime64_t tmp = cputime_to_cputime64(steal);
2569 runqueue_t *rq = this_rq(); 2969 struct rq *rq = this_rq();
2570 2970
2571 if (p == rq->idle) { 2971 if (p == rq->idle) {
2572 p->stime = cputime_add(p->stime, steal); 2972 p->stime = cputime_add(p->stime, steal);
@@ -2587,10 +2987,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
2587 */ 2987 */
2588void scheduler_tick(void) 2988void scheduler_tick(void)
2589{ 2989{
2590 int cpu = smp_processor_id();
2591 runqueue_t *rq = this_rq();
2592 task_t *p = current;
2593 unsigned long long now = sched_clock(); 2990 unsigned long long now = sched_clock();
2991 struct task_struct *p = current;
2992 int cpu = smp_processor_id();
2993 struct rq *rq = cpu_rq(cpu);
2594 2994
2595 update_cpu_clock(p, rq, now); 2995 update_cpu_clock(p, rq, now);
2596 2996
@@ -2640,7 +3040,7 @@ void scheduler_tick(void)
2640 3040
2641 if (!rq->expired_timestamp) 3041 if (!rq->expired_timestamp)
2642 rq->expired_timestamp = jiffies; 3042 rq->expired_timestamp = jiffies;
2643 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 3043 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
2644 enqueue_task(p, rq->expired); 3044 enqueue_task(p, rq->expired);
2645 if (p->static_prio < rq->best_expired_prio) 3045 if (p->static_prio < rq->best_expired_prio)
2646 rq->best_expired_prio = p->static_prio; 3046 rq->best_expired_prio = p->static_prio;
@@ -2679,55 +3079,42 @@ out:
2679} 3079}
2680 3080
2681#ifdef CONFIG_SCHED_SMT 3081#ifdef CONFIG_SCHED_SMT
2682static inline void wakeup_busy_runqueue(runqueue_t *rq) 3082static inline void wakeup_busy_runqueue(struct rq *rq)
2683{ 3083{
2684 /* If an SMT runqueue is sleeping due to priority reasons wake it up */ 3084 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2685 if (rq->curr == rq->idle && rq->nr_running) 3085 if (rq->curr == rq->idle && rq->nr_running)
2686 resched_task(rq->idle); 3086 resched_task(rq->idle);
2687} 3087}
2688 3088
2689static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3089/*
3090 * Called with interrupt disabled and this_rq's runqueue locked.
3091 */
3092static void wake_sleeping_dependent(int this_cpu)
2690{ 3093{
2691 struct sched_domain *tmp, *sd = NULL; 3094 struct sched_domain *tmp, *sd = NULL;
2692 cpumask_t sibling_map;
2693 int i; 3095 int i;
2694 3096
2695 for_each_domain(this_cpu, tmp) 3097 for_each_domain(this_cpu, tmp) {
2696 if (tmp->flags & SD_SHARE_CPUPOWER) 3098 if (tmp->flags & SD_SHARE_CPUPOWER) {
2697 sd = tmp; 3099 sd = tmp;
3100 break;
3101 }
3102 }
2698 3103
2699 if (!sd) 3104 if (!sd)
2700 return; 3105 return;
2701 3106
2702 /* 3107 for_each_cpu_mask(i, sd->span) {
2703 * Unlock the current runqueue because we have to lock in 3108 struct rq *smt_rq = cpu_rq(i);
2704 * CPU order to avoid deadlocks. Caller knows that we might
2705 * unlock. We keep IRQs disabled.
2706 */
2707 spin_unlock(&this_rq->lock);
2708
2709 sibling_map = sd->span;
2710 3109
2711 for_each_cpu_mask(i, sibling_map) 3110 if (i == this_cpu)
2712 spin_lock(&cpu_rq(i)->lock); 3111 continue;
2713 /* 3112 if (unlikely(!spin_trylock(&smt_rq->lock)))
2714 * We clear this CPU from the mask. This both simplifies the 3113 continue;
2715 * inner loop and keps this_rq locked when we exit:
2716 */
2717 cpu_clear(this_cpu, sibling_map);
2718
2719 for_each_cpu_mask(i, sibling_map) {
2720 runqueue_t *smt_rq = cpu_rq(i);
2721 3114
2722 wakeup_busy_runqueue(smt_rq); 3115 wakeup_busy_runqueue(smt_rq);
3116 spin_unlock(&smt_rq->lock);
2723 } 3117 }
2724
2725 for_each_cpu_mask(i, sibling_map)
2726 spin_unlock(&cpu_rq(i)->lock);
2727 /*
2728 * We exit with this_cpu's rq still held and IRQs
2729 * still disabled:
2730 */
2731} 3118}
2732 3119
2733/* 3120/*
@@ -2735,57 +3122,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2735 * utilize, if another task runs on a sibling. This models the 3122 * utilize, if another task runs on a sibling. This models the
2736 * slowdown effect of other tasks running on siblings: 3123 * slowdown effect of other tasks running on siblings:
2737 */ 3124 */
2738static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) 3125static inline unsigned long
3126smt_slice(struct task_struct *p, struct sched_domain *sd)
2739{ 3127{
2740 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3128 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2741} 3129}
2742 3130
2743static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3131/*
3132 * To minimise lock contention and not have to drop this_rq's runlock we only
3133 * trylock the sibling runqueues and bypass those runqueues if we fail to
3134 * acquire their lock. As we only trylock the normal locking order does not
3135 * need to be obeyed.
3136 */
3137static int
3138dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
2744{ 3139{
2745 struct sched_domain *tmp, *sd = NULL; 3140 struct sched_domain *tmp, *sd = NULL;
2746 cpumask_t sibling_map;
2747 prio_array_t *array;
2748 int ret = 0, i; 3141 int ret = 0, i;
2749 task_t *p;
2750 3142
2751 for_each_domain(this_cpu, tmp) 3143 /* kernel/rt threads do not participate in dependent sleeping */
2752 if (tmp->flags & SD_SHARE_CPUPOWER) 3144 if (!p->mm || rt_task(p))
3145 return 0;
3146
3147 for_each_domain(this_cpu, tmp) {
3148 if (tmp->flags & SD_SHARE_CPUPOWER) {
2753 sd = tmp; 3149 sd = tmp;
3150 break;
3151 }
3152 }
2754 3153
2755 if (!sd) 3154 if (!sd)
2756 return 0; 3155 return 0;
2757 3156
2758 /* 3157 for_each_cpu_mask(i, sd->span) {
2759 * The same locking rules and details apply as for 3158 struct task_struct *smt_curr;
2760 * wake_sleeping_dependent(): 3159 struct rq *smt_rq;
2761 */
2762 spin_unlock(&this_rq->lock);
2763 sibling_map = sd->span;
2764 for_each_cpu_mask(i, sibling_map)
2765 spin_lock(&cpu_rq(i)->lock);
2766 cpu_clear(this_cpu, sibling_map);
2767 3160
2768 /* 3161 if (i == this_cpu)
2769 * Establish next task to be run - it might have gone away because 3162 continue;
2770 * we released the runqueue lock above:
2771 */
2772 if (!this_rq->nr_running)
2773 goto out_unlock;
2774 array = this_rq->active;
2775 if (!array->nr_active)
2776 array = this_rq->expired;
2777 BUG_ON(!array->nr_active);
2778 3163
2779 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3164 smt_rq = cpu_rq(i);
2780 task_t, run_list); 3165 if (unlikely(!spin_trylock(&smt_rq->lock)))
3166 continue;
2781 3167
2782 for_each_cpu_mask(i, sibling_map) { 3168 smt_curr = smt_rq->curr;
2783 runqueue_t *smt_rq = cpu_rq(i);
2784 task_t *smt_curr = smt_rq->curr;
2785 3169
2786 /* Kernel threads do not participate in dependent sleeping */ 3170 if (!smt_curr->mm)
2787 if (!p->mm || !smt_curr->mm || rt_task(p)) 3171 goto unlock;
2788 goto check_smt_task;
2789 3172
2790 /* 3173 /*
2791 * If a user task with lower static priority than the 3174 * If a user task with lower static priority than the
@@ -2803,49 +3186,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2803 if ((jiffies % DEF_TIMESLICE) > 3186 if ((jiffies % DEF_TIMESLICE) >
2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3187 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2805 ret = 1; 3188 ret = 1;
2806 } else 3189 } else {
2807 if (smt_curr->static_prio < p->static_prio && 3190 if (smt_curr->static_prio < p->static_prio &&
2808 !TASK_PREEMPTS_CURR(p, smt_rq) && 3191 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2809 smt_slice(smt_curr, sd) > task_timeslice(p)) 3192 smt_slice(smt_curr, sd) > task_timeslice(p))
2810 ret = 1; 3193 ret = 1;
2811
2812check_smt_task:
2813 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2814 rt_task(smt_curr))
2815 continue;
2816 if (!p->mm) {
2817 wakeup_busy_runqueue(smt_rq);
2818 continue;
2819 }
2820
2821 /*
2822 * Reschedule a lower priority task on the SMT sibling for
2823 * it to be put to sleep, or wake it up if it has been put to
2824 * sleep for priority reasons to see if it should run now.
2825 */
2826 if (rt_task(p)) {
2827 if ((jiffies % DEF_TIMESLICE) >
2828 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2829 resched_task(smt_curr);
2830 } else {
2831 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2832 smt_slice(p, sd) > task_timeslice(smt_curr))
2833 resched_task(smt_curr);
2834 else
2835 wakeup_busy_runqueue(smt_rq);
2836 } 3194 }
3195unlock:
3196 spin_unlock(&smt_rq->lock);
2837 } 3197 }
2838out_unlock:
2839 for_each_cpu_mask(i, sibling_map)
2840 spin_unlock(&cpu_rq(i)->lock);
2841 return ret; 3198 return ret;
2842} 3199}
2843#else 3200#else
2844static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3201static inline void wake_sleeping_dependent(int this_cpu)
2845{ 3202{
2846} 3203}
2847 3204static inline int
2848static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3205dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
2849{ 3206{
2850 return 0; 3207 return 0;
2851} 3208}
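smt_slice() models the SMT slowdown by shrinking a task's timeslice by the domain's per_cpu_gain, and dependent_sleeper() now compares only the would-be next task against each sibling's current task: an RT sibling keeps the core outside the first per_cpu_gain% of every DEF_TIMESLICE window, and a sibling with better static priority wins if its shrunken slice still exceeds our full one. A small numeric sketch of those two comparisons (per_cpu_gain and the timeslices are invented values):

        #include <stdio.h>

        #define DEF_TIMESLICE 100       /* illustrative, in ticks */

        /* Fraction of its slice a task effectively gets when sharing the core. */
        static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
        {
                return time_slice * (100 - per_cpu_gain) / 100;
        }

        int main(void)
        {
                int per_cpu_gain = 25;              /* hypothetical domain tunable */
                unsigned long smt_curr_slice = 100; /* sibling's current task */
                unsigned long p_timeslice = 60;     /* task we are about to run */
                unsigned long jiffies = 1234;

                /* RT sibling: we only run during the first per_cpu_gain% of each window */
                printf("yield to RT sibling now: %s\n",
                       (jiffies % DEF_TIMESLICE) >
                       (unsigned long)(per_cpu_gain * DEF_TIMESLICE / 100) ? "yes" : "no");

                /* A better-static-prio sibling wins only if its shrunken slice still
                 * exceeds our full timeslice. */
                printf("sleep behind sibling: %s\n",
                       smt_slice(smt_curr_slice, per_cpu_gain) > p_timeslice ? "yes" : "no");
                return 0;
        }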
@@ -2858,12 +3215,13 @@ void fastcall add_preempt_count(int val)
2858 /* 3215 /*
2859 * Underflow? 3216 * Underflow?
2860 */ 3217 */
2861 BUG_ON((preempt_count() < 0)); 3218 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3219 return;
2862 preempt_count() += val; 3220 preempt_count() += val;
2863 /* 3221 /*
2864 * Spinlock count overflowing soon? 3222 * Spinlock count overflowing soon?
2865 */ 3223 */
2866 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3224 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2867} 3225}
2868EXPORT_SYMBOL(add_preempt_count); 3226EXPORT_SYMBOL(add_preempt_count);
2869 3227
@@ -2872,11 +3230,15 @@ void fastcall sub_preempt_count(int val)
2872 /* 3230 /*
2873 * Underflow? 3231 * Underflow?
2874 */ 3232 */
2875 BUG_ON(val > preempt_count()); 3233 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3234 return;
2876 /* 3235 /*
2877 * Is the spinlock portion underflowing? 3236 * Is the spinlock portion underflowing?
2878 */ 3237 */
2879 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 3238 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3239 !(preempt_count() & PREEMPT_MASK)))
3240 return;
3241
2880 preempt_count() -= val; 3242 preempt_count() -= val;
2881} 3243}
2882EXPORT_SYMBOL(sub_preempt_count); 3244EXPORT_SYMBOL(sub_preempt_count);
@@ -2894,14 +3256,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
2894 */ 3256 */
2895asmlinkage void __sched schedule(void) 3257asmlinkage void __sched schedule(void)
2896{ 3258{
2897 long *switch_count; 3259 struct task_struct *prev, *next;
2898 task_t *prev, *next; 3260 struct prio_array *array;
2899 runqueue_t *rq;
2900 prio_array_t *array;
2901 struct list_head *queue; 3261 struct list_head *queue;
2902 unsigned long long now; 3262 unsigned long long now;
2903 unsigned long run_time; 3263 unsigned long run_time;
2904 int cpu, idx, new_prio; 3264 int cpu, idx, new_prio;
3265 long *switch_count;
3266 struct rq *rq;
2905 3267
2906 /* 3268 /*
2907 * Test if we are atomic. Since do_exit() needs to call into 3269 * Test if we are atomic. Since do_exit() needs to call into
@@ -2967,32 +3329,13 @@ need_resched_nonpreemptible:
2967 3329
2968 cpu = smp_processor_id(); 3330 cpu = smp_processor_id();
2969 if (unlikely(!rq->nr_running)) { 3331 if (unlikely(!rq->nr_running)) {
2970go_idle:
2971 idle_balance(cpu, rq); 3332 idle_balance(cpu, rq);
2972 if (!rq->nr_running) { 3333 if (!rq->nr_running) {
2973 next = rq->idle; 3334 next = rq->idle;
2974 rq->expired_timestamp = 0; 3335 rq->expired_timestamp = 0;
2975 wake_sleeping_dependent(cpu, rq); 3336 wake_sleeping_dependent(cpu);
2976 /*
2977 * wake_sleeping_dependent() might have released
2978 * the runqueue, so break out if we got new
2979 * tasks meanwhile:
2980 */
2981 if (!rq->nr_running)
2982 goto switch_tasks;
2983 }
2984 } else {
2985 if (dependent_sleeper(cpu, rq)) {
2986 next = rq->idle;
2987 goto switch_tasks; 3337 goto switch_tasks;
2988 } 3338 }
2989 /*
2990 * dependent_sleeper() releases and reacquires the runqueue
2991 * lock, hence go into the idle loop if the rq went
2992 * empty meanwhile:
2993 */
2994 if (unlikely(!rq->nr_running))
2995 goto go_idle;
2996 } 3339 }
2997 3340
2998 array = rq->active; 3341 array = rq->active;
@@ -3010,7 +3353,7 @@ go_idle:
3010 3353
3011 idx = sched_find_first_bit(array->bitmap); 3354 idx = sched_find_first_bit(array->bitmap);
3012 queue = array->queue + idx; 3355 queue = array->queue + idx;
3013 next = list_entry(queue->next, task_t, run_list); 3356 next = list_entry(queue->next, struct task_struct, run_list);
3014 3357
3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3358 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3016 unsigned long long delta = now - next->timestamp; 3359 unsigned long long delta = now - next->timestamp;
@@ -3030,6 +3373,8 @@ go_idle:
3030 } 3373 }
3031 } 3374 }
3032 next->sleep_type = SLEEP_NORMAL; 3375 next->sleep_type = SLEEP_NORMAL;
3376 if (dependent_sleeper(cpu, rq, next))
3377 next = rq->idle;
3033switch_tasks: 3378switch_tasks:
3034 if (next == rq->idle) 3379 if (next == rq->idle)
3035 schedstat_inc(rq, sched_goidle); 3380 schedstat_inc(rq, sched_goidle);
@@ -3071,12 +3416,11 @@ switch_tasks:
3071 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3416 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3072 goto need_resched; 3417 goto need_resched;
3073} 3418}
3074
3075EXPORT_SYMBOL(schedule); 3419EXPORT_SYMBOL(schedule);
3076 3420
3077#ifdef CONFIG_PREEMPT 3421#ifdef CONFIG_PREEMPT
3078/* 3422/*
3079 * this is is the entry point to schedule() from in-kernel preemption 3423 * this is the entry point to schedule() from in-kernel preemption
3080 * off of preempt_enable. Kernel preemptions off return from interrupt 3424 * off of preempt_enable. Kernel preemptions off return from interrupt
3081 * occur there and call schedule directly. 3425 * occur there and call schedule directly.
3082 */ 3426 */
@@ -3116,11 +3460,10 @@ need_resched:
3116 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3460 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3117 goto need_resched; 3461 goto need_resched;
3118} 3462}
3119
3120EXPORT_SYMBOL(preempt_schedule); 3463EXPORT_SYMBOL(preempt_schedule);
3121 3464
3122/* 3465/*
3123 * this is is the entry point to schedule() from kernel preemption 3466 * this is the entry point to schedule() from kernel preemption
3124 * off of irq context. 3467 * off of irq context.
3125 * Note, that this is called and return with irqs disabled. This will 3468 * Note, that this is called and return with irqs disabled. This will
3126 * protect us against recursive calling from irq. 3469 * protect us against recursive calling from irq.
@@ -3132,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3132 struct task_struct *task = current; 3475 struct task_struct *task = current;
3133 int saved_lock_depth; 3476 int saved_lock_depth;
3134#endif 3477#endif
3135 /* Catch callers which need to be fixed*/ 3478 /* Catch callers which need to be fixed */
3136 BUG_ON(ti->preempt_count || !irqs_disabled()); 3479 BUG_ON(ti->preempt_count || !irqs_disabled());
3137 3480
3138need_resched: 3481need_resched:
@@ -3165,10 +3508,8 @@ need_resched:
3165int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3508int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3166 void *key) 3509 void *key)
3167{ 3510{
3168 task_t *p = curr->private; 3511 return try_to_wake_up(curr->private, mode, sync);
3169 return try_to_wake_up(p, mode, sync);
3170} 3512}
3171
3172EXPORT_SYMBOL(default_wake_function); 3513EXPORT_SYMBOL(default_wake_function);
3173 3514
3174/* 3515/*
@@ -3186,13 +3527,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3186 struct list_head *tmp, *next; 3527 struct list_head *tmp, *next;
3187 3528
3188 list_for_each_safe(tmp, next, &q->task_list) { 3529 list_for_each_safe(tmp, next, &q->task_list) {
3189 wait_queue_t *curr; 3530 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3190 unsigned flags; 3531 unsigned flags = curr->flags;
3191 curr = list_entry(tmp, wait_queue_t, task_list); 3532
3192 flags = curr->flags;
3193 if (curr->func(curr, mode, sync, key) && 3533 if (curr->func(curr, mode, sync, key) &&
3194 (flags & WQ_FLAG_EXCLUSIVE) && 3534 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3195 !--nr_exclusive)
3196 break; 3535 break;
3197 } 3536 }
3198} 3537}
@@ -3213,7 +3552,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3213 __wake_up_common(q, mode, nr_exclusive, 0, key); 3552 __wake_up_common(q, mode, nr_exclusive, 0, key);
3214 spin_unlock_irqrestore(&q->lock, flags); 3553 spin_unlock_irqrestore(&q->lock, flags);
3215} 3554}
3216
3217EXPORT_SYMBOL(__wake_up); 3555EXPORT_SYMBOL(__wake_up);
3218 3556
3219/* 3557/*
@@ -3282,6 +3620,7 @@ EXPORT_SYMBOL(complete_all);
3282void fastcall __sched wait_for_completion(struct completion *x) 3620void fastcall __sched wait_for_completion(struct completion *x)
3283{ 3621{
3284 might_sleep(); 3622 might_sleep();
3623
3285 spin_lock_irq(&x->wait.lock); 3624 spin_lock_irq(&x->wait.lock);
3286 if (!x->done) { 3625 if (!x->done) {
3287 DECLARE_WAITQUEUE(wait, current); 3626 DECLARE_WAITQUEUE(wait, current);
@@ -3426,7 +3765,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3426 schedule(); 3765 schedule();
3427 SLEEP_ON_TAIL 3766 SLEEP_ON_TAIL
3428} 3767}
3429
3430EXPORT_SYMBOL(interruptible_sleep_on); 3768EXPORT_SYMBOL(interruptible_sleep_on);
3431 3769
3432long fastcall __sched 3770long fastcall __sched
@@ -3442,7 +3780,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3442 3780
3443 return timeout; 3781 return timeout;
3444} 3782}
3445
3446EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3783EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3447 3784
3448void fastcall __sched sleep_on(wait_queue_head_t *q) 3785void fastcall __sched sleep_on(wait_queue_head_t *q)
@@ -3455,7 +3792,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q)
3455 schedule(); 3792 schedule();
3456 SLEEP_ON_TAIL 3793 SLEEP_ON_TAIL
3457} 3794}
3458
3459EXPORT_SYMBOL(sleep_on); 3795EXPORT_SYMBOL(sleep_on);
3460 3796
3461long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3797long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
@@ -3473,12 +3809,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3473 3809
3474EXPORT_SYMBOL(sleep_on_timeout); 3810EXPORT_SYMBOL(sleep_on_timeout);
3475 3811
3476void set_user_nice(task_t *p, long nice) 3812#ifdef CONFIG_RT_MUTEXES
3813
3814/*
3815 * rt_mutex_setprio - set the current priority of a task
3816 * @p: task
3817 * @prio: prio value (kernel-internal form)
3818 *
3819 * This function changes the 'effective' priority of a task. It does
3820 * not touch ->normal_prio like __setscheduler().
3821 *
3822 * Used by the rt_mutex code to implement priority inheritance logic.
3823 */
3824void rt_mutex_setprio(struct task_struct *p, int prio)
3477{ 3825{
3826 struct prio_array *array;
3478 unsigned long flags; 3827 unsigned long flags;
3479 prio_array_t *array; 3828 struct rq *rq;
3480 runqueue_t *rq; 3829 int oldprio;
3481 int old_prio, new_prio, delta; 3830
3831 BUG_ON(prio < 0 || prio > MAX_PRIO);
3832
3833 rq = task_rq_lock(p, &flags);
3834
3835 oldprio = p->prio;
3836 array = p->array;
3837 if (array)
3838 dequeue_task(p, array);
3839 p->prio = prio;
3840
3841 if (array) {
3842 /*
3843 * If changing to an RT priority then queue it
3844 * in the active array!
3845 */
3846 if (rt_task(p))
3847 array = rq->active;
3848 enqueue_task(p, array);
3849 /*
3850 * Reschedule if we are currently running on this runqueue and
3851 * our priority decreased, or if we are not currently running on
3852 * this runqueue and our priority is higher than the current's
3853 */
3854 if (task_running(rq, p)) {
3855 if (p->prio > oldprio)
3856 resched_task(rq->curr);
3857 } else if (TASK_PREEMPTS_CURR(p, rq))
3858 resched_task(rq->curr);
3859 }
3860 task_rq_unlock(rq, &flags);
3861}
3862
3863#endif
3864
3865void set_user_nice(struct task_struct *p, long nice)
3866{
3867 struct prio_array *array;
3868 int old_prio, delta;
3869 unsigned long flags;
3870 struct rq *rq;
3482 3871
3483 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3872 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3484 return; 3873 return;
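rt_mutex_setprio() applies a priority-inheritance boost as a dequeue/requeue: only the effective prio changes, a task boosted into the RT range is requeued on the active array, and a reschedule is triggered either because the running task just got worse or because the boosted task now preempts the current one. A heavily simplified sketch of just the resched decision (TASK_PREEMPTS_CURR is approximated by a plain priority comparison, which is an assumption):

        #include <stdio.h>

        /* Decide whether changing p's prio must trigger a reschedule.
         * Lower numeric prio == higher priority, as in the hunk above.
         */
        static int need_resched_after_setprio(int p_is_running, int oldprio,
                                              int newprio, int curr_prio)
        {
                if (p_is_running)
                        return newprio > oldprio;   /* we just got worse: give way */
                return newprio < curr_prio;         /* stands in for TASK_PREEMPTS_CURR */
        }

        int main(void)
        {
                /* a blocked mutex owner boosted from 120 to 90 while a prio-110
                 * task is running: preempt the current task */
                printf("%d\n", need_resched_after_setprio(0, 120, 90, 110)); /* 1 */
                /* the running task itself being deboosted from 90 back to 120 */
                printf("%d\n", need_resched_after_setprio(1, 90, 120, 120)); /* 1 */
                /* the running task being boosted needs no reschedule of itself */
                printf("%d\n", need_resched_after_setprio(1, 120, 90, 110)); /* 0 */
                return 0;
        }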
@@ -3493,22 +3882,25 @@ void set_user_nice(task_t *p, long nice)
 3493 * it won't have any effect on scheduling until the task is 3882
3494 * not SCHED_NORMAL/SCHED_BATCH: 3883 * not SCHED_NORMAL/SCHED_BATCH:
3495 */ 3884 */
3496 if (rt_task(p)) { 3885 if (has_rt_policy(p)) {
3497 p->static_prio = NICE_TO_PRIO(nice); 3886 p->static_prio = NICE_TO_PRIO(nice);
3498 goto out_unlock; 3887 goto out_unlock;
3499 } 3888 }
3500 array = p->array; 3889 array = p->array;
3501 if (array) 3890 if (array) {
3502 dequeue_task(p, array); 3891 dequeue_task(p, array);
3892 dec_raw_weighted_load(rq, p);
3893 }
3503 3894
3504 old_prio = p->prio;
3505 new_prio = NICE_TO_PRIO(nice);
3506 delta = new_prio - old_prio;
3507 p->static_prio = NICE_TO_PRIO(nice); 3895 p->static_prio = NICE_TO_PRIO(nice);
3508 p->prio += delta; 3896 set_load_weight(p);
3897 old_prio = p->prio;
3898 p->prio = effective_prio(p);
3899 delta = p->prio - old_prio;
3509 3900
3510 if (array) { 3901 if (array) {
3511 enqueue_task(p, array); 3902 enqueue_task(p, array);
3903 inc_raw_weighted_load(rq, p);
3512 /* 3904 /*
3513 * If the task increased its priority or is running and 3905 * If the task increased its priority or is running and
3514 * lowered its priority, then reschedule its CPU: 3906 * lowered its priority, then reschedule its CPU:
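set_user_nice() keeps the runqueue's raw_weighted_load consistent across the change: the old weight is subtracted while the task is dequeued, static_prio and the load weight are recomputed for the new nice value, effective_prio() re-derives the dynamic priority, and the new weight is added back on requeue. A userspace sketch of the bookkeeping only (the nice-to-weight mapping below is an invented placeholder, not the kernel's):

        #include <stdio.h>

        /* Invented placeholder for set_load_weight(); not the kernel's mapping. */
        static unsigned long nice_to_weight(long nice)
        {
                return (unsigned long)(100 - 4 * nice); /* nice -20..19 -> 180..24 */
        }

        struct fake_rq { unsigned long raw_weighted_load; };
        struct fake_task { long nice; unsigned long load_weight; int queued; };

        static void set_user_nice_sketch(struct fake_rq *rq, struct fake_task *p,
                                         long nice)
        {
                if (p->queued)                          /* dequeue + drop old weight */
                        rq->raw_weighted_load -= p->load_weight;

                p->nice = nice;
                p->load_weight = nice_to_weight(nice);  /* set_load_weight() */

                if (p->queued)                          /* requeue + add new weight */
                        rq->raw_weighted_load += p->load_weight;
        }

        int main(void)
        {
                struct fake_task p = { .nice = 0, .load_weight = nice_to_weight(0),
                                       .queued = 1 };
                struct fake_rq rq = { .raw_weighted_load = nice_to_weight(0) + 300 };

                set_user_nice_sketch(&rq, &p, -10);
                printf("task weight %lu, rq weighted load %lu\n",
                       p.load_weight, rq.raw_weighted_load);
                return 0;
        }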
@@ -3519,7 +3911,6 @@ void set_user_nice(task_t *p, long nice)
3519out_unlock: 3911out_unlock:
3520 task_rq_unlock(rq, &flags); 3912 task_rq_unlock(rq, &flags);
3521} 3913}
3522
3523EXPORT_SYMBOL(set_user_nice); 3914EXPORT_SYMBOL(set_user_nice);
3524 3915
3525/* 3916/*
@@ -3527,10 +3918,11 @@ EXPORT_SYMBOL(set_user_nice);
3527 * @p: task 3918 * @p: task
3528 * @nice: nice value 3919 * @nice: nice value
3529 */ 3920 */
3530int can_nice(const task_t *p, const int nice) 3921int can_nice(const struct task_struct *p, const int nice)
3531{ 3922{
3532 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3923 /* convert nice value [19,-20] to rlimit style value [1,40] */
3533 int nice_rlim = 20 - nice; 3924 int nice_rlim = 20 - nice;
3925
3534 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3926 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3535 capable(CAP_SYS_NICE)); 3927 capable(CAP_SYS_NICE));
3536} 3928}
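can_nice() converts the requested nice value to the RLIMIT_NICE convention, where larger limit values permit more favourable nice levels: nice 19..-20 maps onto 1..40. A short demonstration of the mapping and the check (illustrative limit; the CAP_SYS_NICE escape hatch is omitted):

        #include <stdio.h>

        /* nice 19..-20 maps onto rlimit-style values 1..40; allowed if within the limit. */
        static int can_nice_sketch(int nice, unsigned long rlimit_nice_cur)
        {
                int nice_rlim = 20 - nice;

                return nice_rlim <= (int)rlimit_nice_cur;
        }

        int main(void)
        {
                unsigned long rlim_cur = 30;    /* hypothetical RLIMIT_NICE soft limit */

                printf("nice -5: %d\n", can_nice_sketch(-5, rlim_cur));   /* 25 <= 30 -> 1 */
                printf("nice -15: %d\n", can_nice_sketch(-15, rlim_cur)); /* 35 <= 30 -> 0 */
                return 0;
        }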
@@ -3546,8 +3938,7 @@ int can_nice(const task_t *p, const int nice)
3546 */ 3938 */
3547asmlinkage long sys_nice(int increment) 3939asmlinkage long sys_nice(int increment)
3548{ 3940{
3549 int retval; 3941 long nice, retval;
3550 long nice;
3551 3942
3552 /* 3943 /*
3553 * Setpriority might change our priority at the same moment. 3944 * Setpriority might change our priority at the same moment.
@@ -3586,7 +3977,7 @@ asmlinkage long sys_nice(int increment)
3586 * RT tasks are offset by -200. Normal tasks are centered 3977 * RT tasks are offset by -200. Normal tasks are centered
3587 * around 0, value goes from -16 to +15. 3978 * around 0, value goes from -16 to +15.
3588 */ 3979 */
3589int task_prio(const task_t *p) 3980int task_prio(const struct task_struct *p)
3590{ 3981{
3591 return p->prio - MAX_RT_PRIO; 3982 return p->prio - MAX_RT_PRIO;
3592} 3983}
@@ -3595,7 +3986,7 @@ int task_prio(const task_t *p)
3595 * task_nice - return the nice value of a given task. 3986 * task_nice - return the nice value of a given task.
3596 * @p: the task in question. 3987 * @p: the task in question.
3597 */ 3988 */
3598int task_nice(const task_t *p) 3989int task_nice(const struct task_struct *p)
3599{ 3990{
3600 return TASK_NICE(p); 3991 return TASK_NICE(p);
3601} 3992}
@@ -3614,7 +4005,7 @@ int idle_cpu(int cpu)
3614 * idle_task - return the idle task for a given cpu. 4005 * idle_task - return the idle task for a given cpu.
3615 * @cpu: the processor in question. 4006 * @cpu: the processor in question.
3616 */ 4007 */
3617task_t *idle_task(int cpu) 4008struct task_struct *idle_task(int cpu)
3618{ 4009{
3619 return cpu_rq(cpu)->idle; 4010 return cpu_rq(cpu)->idle;
3620} 4011}
@@ -3623,7 +4014,7 @@ task_t *idle_task(int cpu)
3623 * find_process_by_pid - find a process with a matching PID value. 4014 * find_process_by_pid - find a process with a matching PID value.
3624 * @pid: the pid in question. 4015 * @pid: the pid in question.
3625 */ 4016 */
3626static inline task_t *find_process_by_pid(pid_t pid) 4017static inline struct task_struct *find_process_by_pid(pid_t pid)
3627{ 4018{
3628 return pid ? find_task_by_pid(pid) : current; 4019 return pid ? find_task_by_pid(pid) : current;
3629} 4020}
@@ -3632,18 +4023,18 @@ static inline task_t *find_process_by_pid(pid_t pid)
3632static void __setscheduler(struct task_struct *p, int policy, int prio) 4023static void __setscheduler(struct task_struct *p, int policy, int prio)
3633{ 4024{
3634 BUG_ON(p->array); 4025 BUG_ON(p->array);
4026
3635 p->policy = policy; 4027 p->policy = policy;
3636 p->rt_priority = prio; 4028 p->rt_priority = prio;
3637 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 4029 p->normal_prio = normal_prio(p);
3638 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 4030 /* we are holding p->pi_lock already */
3639 } else { 4031 p->prio = rt_mutex_getprio(p);
3640 p->prio = p->static_prio; 4032 /*
3641 /* 4033 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3642 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 4034 */
3643 */ 4035 if (policy == SCHED_BATCH)
3644 if (policy == SCHED_BATCH) 4036 p->sleep_avg = 0;
3645 p->sleep_avg = 0; 4037 set_load_weight(p);
3646 }
3647} 4038}
3648 4039
3649/** 4040/**
@@ -3656,12 +4047,13 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3656int sched_setscheduler(struct task_struct *p, int policy, 4047int sched_setscheduler(struct task_struct *p, int policy,
3657 struct sched_param *param) 4048 struct sched_param *param)
3658{ 4049{
3659 int retval; 4050 int retval, oldprio, oldpolicy = -1;
3660 int oldprio, oldpolicy = -1; 4051 struct prio_array *array;
3661 prio_array_t *array;
3662 unsigned long flags; 4052 unsigned long flags;
3663 runqueue_t *rq; 4053 struct rq *rq;
3664 4054
4055 /* may grab non-irq protected spin_locks */
4056 BUG_ON(in_interrupt());
3665recheck: 4057recheck:
3666 /* double check policy once rq lock held */ 4058 /* double check policy once rq lock held */
3667 if (policy < 0) 4059 if (policy < 0)
@@ -3710,14 +4102,20 @@ recheck:
3710 if (retval) 4102 if (retval)
3711 return retval; 4103 return retval;
3712 /* 4104 /*
4105 * make sure no PI-waiters arrive (or leave) while we are
4106 * changing the priority of the task:
4107 */
4108 spin_lock_irqsave(&p->pi_lock, flags);
4109 /*
 3713 * To be able to change p->policy safely, the appropriate 4110 * To be able to change p->policy safely, the appropriate
3714 * runqueue lock must be held. 4111 * runqueue lock must be held.
3715 */ 4112 */
3716 rq = task_rq_lock(p, &flags); 4113 rq = __task_rq_lock(p);
3717 /* recheck policy now with rq lock held */ 4114 /* recheck policy now with rq lock held */
3718 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4115 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3719 policy = oldpolicy = -1; 4116 policy = oldpolicy = -1;
3720 task_rq_unlock(rq, &flags); 4117 __task_rq_unlock(rq);
4118 spin_unlock_irqrestore(&p->pi_lock, flags);
3721 goto recheck; 4119 goto recheck;
3722 } 4120 }
3723 array = p->array; 4121 array = p->array;
@@ -3738,7 +4136,11 @@ recheck:
3738 } else if (TASK_PREEMPTS_CURR(p, rq)) 4136 } else if (TASK_PREEMPTS_CURR(p, rq))
3739 resched_task(rq->curr); 4137 resched_task(rq->curr);
3740 } 4138 }
3741 task_rq_unlock(rq, &flags); 4139 __task_rq_unlock(rq);
4140 spin_unlock_irqrestore(&p->pi_lock, flags);
4141
4142 rt_mutex_adjust_pi(p);
4143
3742 return 0; 4144 return 0;
3743} 4145}
3744EXPORT_SYMBOL_GPL(sched_setscheduler); 4146EXPORT_SYMBOL_GPL(sched_setscheduler);
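/*
 * Editor's sketch (not in the patch): the locking shape sched_setscheduler()
 * now uses -- take the task's pi_lock, then the runqueue lock, re-check the
 * policy, and restart from scratch if it changed while nothing was locked.
 * The pthread mutexes and globals below are user-space stand-ins, not the
 * kernel primitives.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;     /* ~ p->pi_lock */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;     /* ~ rq->lock   */
static int task_policy;

static void toy_setscheduler(int new_policy)
{
        int sampled;

recheck:
        sampled = task_policy;  /* unlocked peek, like the "policy < 0" path */
        /* permission and parameter checks against 'sampled' run here, unlocked */

        pthread_mutex_lock(&pi_lock);   /* keep PI waiters from arriving/leaving */
        pthread_mutex_lock(&rq_lock);   /* protects the runqueue state */

        if (sampled != task_policy) {
                /* somebody changed it while we were unlocked: start over */
                pthread_mutex_unlock(&rq_lock);
                pthread_mutex_unlock(&pi_lock);
                goto recheck;
        }

        task_policy = new_policy;       /* the actual __setscheduler() work */

        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);
        /* rt_mutex_adjust_pi(p) runs here, after both locks are dropped */
}

int main(void)
{
        toy_setscheduler(1);
        printf("policy now %d\n", task_policy);
        return 0;
}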
@@ -3746,9 +4148,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3746static int 4148static int
3747do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4149do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3748{ 4150{
3749 int retval;
3750 struct sched_param lparam; 4151 struct sched_param lparam;
3751 struct task_struct *p; 4152 struct task_struct *p;
4153 int retval;
3752 4154
3753 if (!param || pid < 0) 4155 if (!param || pid < 0)
3754 return -EINVAL; 4156 return -EINVAL;
@@ -3760,8 +4162,11 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3760 read_unlock_irq(&tasklist_lock); 4162 read_unlock_irq(&tasklist_lock);
3761 return -ESRCH; 4163 return -ESRCH;
3762 } 4164 }
3763 retval = sched_setscheduler(p, policy, &lparam); 4165 get_task_struct(p);
3764 read_unlock_irq(&tasklist_lock); 4166 read_unlock_irq(&tasklist_lock);
4167 retval = sched_setscheduler(p, policy, &lparam);
4168 put_task_struct(p);
4169
3765 return retval; 4170 return retval;
3766} 4171}
3767 4172
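/*
 * Editor's sketch (not in the patch): why do_sched_setscheduler() now takes a
 * reference before dropping tasklist_lock.  The task must stay alive across
 * the now-unlocked sched_setscheduler() call; a bare pointer would be a
 * use-after-free if the task exited meanwhile.  The object and names below
 * are invented.
 */
#include <stdio.h>

struct toy_obj {
        int refcount;
        int data;
};

static void toy_get(struct toy_obj *o) { o->refcount++; }

static void toy_put(struct toy_obj *o)
{
        if (--o->refcount == 0)
                printf("last reference dropped, object can be freed\n");
}

static void toy_use(struct toy_obj *o)
{
        /* lookup happened under a lock ... */
        toy_get(o);             /* pin it before the lock is released  */
        /* ... lock dropped here ... */
        o->data++;              /* safe: our reference keeps it alive  */
        toy_put(o);             /* may be the final put                */
}

int main(void)
{
        struct toy_obj o = { 1, 0 };

        toy_use(&o);
        toy_put(&o);
        return 0;
}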
@@ -3797,8 +4202,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3797 */ 4202 */
3798asmlinkage long sys_sched_getscheduler(pid_t pid) 4203asmlinkage long sys_sched_getscheduler(pid_t pid)
3799{ 4204{
4205 struct task_struct *p;
3800 int retval = -EINVAL; 4206 int retval = -EINVAL;
3801 task_t *p;
3802 4207
3803 if (pid < 0) 4208 if (pid < 0)
3804 goto out_nounlock; 4209 goto out_nounlock;
@@ -3825,8 +4230,8 @@ out_nounlock:
3825asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4230asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3826{ 4231{
3827 struct sched_param lp; 4232 struct sched_param lp;
4233 struct task_struct *p;
3828 int retval = -EINVAL; 4234 int retval = -EINVAL;
3829 task_t *p;
3830 4235
3831 if (!param || pid < 0) 4236 if (!param || pid < 0)
3832 goto out_nounlock; 4237 goto out_nounlock;
@@ -3859,9 +4264,9 @@ out_unlock:
3859 4264
3860long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4265long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3861{ 4266{
3862 task_t *p;
3863 int retval;
3864 cpumask_t cpus_allowed; 4267 cpumask_t cpus_allowed;
4268 struct task_struct *p;
4269 int retval;
3865 4270
3866 lock_cpu_hotplug(); 4271 lock_cpu_hotplug();
3867 read_lock(&tasklist_lock); 4272 read_lock(&tasklist_lock);
@@ -3886,6 +4291,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3886 !capable(CAP_SYS_NICE)) 4291 !capable(CAP_SYS_NICE))
3887 goto out_unlock; 4292 goto out_unlock;
3888 4293
4294 retval = security_task_setscheduler(p, 0, NULL);
4295 if (retval)
4296 goto out_unlock;
4297
3889 cpus_allowed = cpuset_cpus_allowed(p); 4298 cpus_allowed = cpuset_cpus_allowed(p);
3890 cpus_and(new_mask, new_mask, cpus_allowed); 4299 cpus_and(new_mask, new_mask, cpus_allowed);
3891 retval = set_cpus_allowed(p, new_mask); 4300 retval = set_cpus_allowed(p, new_mask);
@@ -3943,8 +4352,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3943 4352
3944long sched_getaffinity(pid_t pid, cpumask_t *mask) 4353long sched_getaffinity(pid_t pid, cpumask_t *mask)
3945{ 4354{
4355 struct task_struct *p;
3946 int retval; 4356 int retval;
3947 task_t *p;
3948 4357
3949 lock_cpu_hotplug(); 4358 lock_cpu_hotplug();
3950 read_lock(&tasklist_lock); 4359 read_lock(&tasklist_lock);
@@ -3954,7 +4363,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
3954 if (!p) 4363 if (!p)
3955 goto out_unlock; 4364 goto out_unlock;
3956 4365
3957 retval = 0; 4366 retval = security_task_getscheduler(p);
4367 if (retval)
4368 goto out_unlock;
4369
3958 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4370 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3959 4371
3960out_unlock: 4372out_unlock:
@@ -4000,9 +4412,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4000 */ 4412 */
4001asmlinkage long sys_sched_yield(void) 4413asmlinkage long sys_sched_yield(void)
4002{ 4414{
4003 runqueue_t *rq = this_rq_lock(); 4415 struct rq *rq = this_rq_lock();
4004 prio_array_t *array = current->array; 4416 struct prio_array *array = current->array, *target = rq->expired;
4005 prio_array_t *target = rq->expired;
4006 4417
4007 schedstat_inc(rq, yld_cnt); 4418 schedstat_inc(rq, yld_cnt);
4008 /* 4419 /*
@@ -4036,6 +4447,7 @@ asmlinkage long sys_sched_yield(void)
4036 * no need to preempt or enable interrupts: 4447 * no need to preempt or enable interrupts:
4037 */ 4448 */
4038 __release(rq->lock); 4449 __release(rq->lock);
4450 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4039 _raw_spin_unlock(&rq->lock); 4451 _raw_spin_unlock(&rq->lock);
4040 preempt_enable_no_resched(); 4452 preempt_enable_no_resched();
4041 4453
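/*
 * Editor's sketch (not in the patch): the data structure sys_sched_yield()
 * is manipulating.  Each runqueue keeps two priority arrays ("active" and
 * "expired"); yielding a normal task dequeues it from the active array and
 * requeues it on the expired one, so it runs again only after the arrays are
 * swapped.  This is a simplified one-task-per-priority model with a byte
 * array standing in for the priority bitmap.
 */
#include <stdio.h>
#include <string.h>

#define NPRIO 140

struct toy_array {
        unsigned int nr_active;
        unsigned char occupied[NPRIO];  /* stands in for the priority bitmap */
};

static void toy_enqueue(struct toy_array *a, int prio)
{
        a->occupied[prio] = 1;
        a->nr_active++;
}

static void toy_dequeue(struct toy_array *a, int prio)
{
        a->occupied[prio] = 0;
        a->nr_active--;
}

/* sched_find_first_bit() analogue: highest priority == lowest index */
static int toy_first_prio(const struct toy_array *a)
{
        int i;

        for (i = 0; i < NPRIO; i++)
                if (a->occupied[i])
                        return i;
        return NPRIO;
}

int main(void)
{
        struct toy_array arrays[2];
        struct toy_array *active = &arrays[0], *expired = &arrays[1], *tmp;
        int prio = 120;

        memset(arrays, 0, sizeof(arrays));
        toy_enqueue(active, prio);

        /* sys_sched_yield() for a non-RT task: active -> expired */
        toy_dequeue(active, prio);
        toy_enqueue(expired, prio);

        if (!active->nr_active) {       /* array switch, as in schedule() */
                tmp = active;
                active = expired;
                expired = tmp;
        }
        printf("next prio to run: %d\n", toy_first_prio(active));
        return 0;
}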
@@ -4044,17 +4456,25 @@ asmlinkage long sys_sched_yield(void)
4044 return 0; 4456 return 0;
4045} 4457}
4046 4458
4047static inline void __cond_resched(void) 4459static inline int __resched_legal(void)
4048{ 4460{
4461 if (unlikely(preempt_count()))
4462 return 0;
4463 if (unlikely(system_state != SYSTEM_RUNNING))
4464 return 0;
4465 return 1;
4466}
4467
4468static void __cond_resched(void)
4469{
4470#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4471 __might_sleep(__FILE__, __LINE__);
4472#endif
4049 /* 4473 /*
4050 * The BKS might be reacquired before we have dropped 4474 * The BKS might be reacquired before we have dropped
4051 * PREEMPT_ACTIVE, which could trigger a second 4475 * PREEMPT_ACTIVE, which could trigger a second
4052 * cond_resched() call. 4476 * cond_resched() call.
4053 */ 4477 */
4054 if (unlikely(preempt_count()))
4055 return;
4056 if (unlikely(system_state != SYSTEM_RUNNING))
4057 return;
4058 do { 4478 do {
4059 add_preempt_count(PREEMPT_ACTIVE); 4479 add_preempt_count(PREEMPT_ACTIVE);
4060 schedule(); 4480 schedule();
@@ -4064,13 +4484,12 @@ static inline void __cond_resched(void)
4064 4484
4065int __sched cond_resched(void) 4485int __sched cond_resched(void)
4066{ 4486{
4067 if (need_resched()) { 4487 if (need_resched() && __resched_legal()) {
4068 __cond_resched(); 4488 __cond_resched();
4069 return 1; 4489 return 1;
4070 } 4490 }
4071 return 0; 4491 return 0;
4072} 4492}
4073
4074EXPORT_SYMBOL(cond_resched); 4493EXPORT_SYMBOL(cond_resched);
4075 4494
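/*
 * Editor's sketch (not in the patch): the new __resched_legal() gate in
 * user-space terms.  cond_resched() only yields when nothing forbids it --
 * no elevated preempt count and the system fully up; both checks are
 * modelled here as plain globals.
 */
#include <sched.h>
#include <stdio.h>

static int preempt_depth;       /* analogue of preempt_count()                */
static int system_running = 1;  /* analogue of system_state == SYSTEM_RUNNING */

static int toy_resched_legal(void)
{
        return preempt_depth == 0 && system_running;
}

static int toy_cond_resched(int need_resched)
{
        if (need_resched && toy_resched_legal()) {
                sched_yield();  /* stands in for __cond_resched()/schedule() */
                return 1;
        }
        return 0;
}

int main(void)
{
        preempt_depth = 1;
        printf("inside critical section: %d\n", toy_cond_resched(1));   /* 0 */
        preempt_depth = 0;
        printf("outside:                 %d\n", toy_cond_resched(1));   /* 1 */
        return 0;
}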
4076/* 4495/*
@@ -4091,7 +4510,8 @@ int cond_resched_lock(spinlock_t *lock)
4091 ret = 1; 4510 ret = 1;
4092 spin_lock(lock); 4511 spin_lock(lock);
4093 } 4512 }
4094 if (need_resched()) { 4513 if (need_resched() && __resched_legal()) {
4514 spin_release(&lock->dep_map, 1, _THIS_IP_);
4095 _raw_spin_unlock(lock); 4515 _raw_spin_unlock(lock);
4096 preempt_enable_no_resched(); 4516 preempt_enable_no_resched();
4097 __cond_resched(); 4517 __cond_resched();
@@ -4100,25 +4520,24 @@ int cond_resched_lock(spinlock_t *lock)
4100 } 4520 }
4101 return ret; 4521 return ret;
4102} 4522}
4103
4104EXPORT_SYMBOL(cond_resched_lock); 4523EXPORT_SYMBOL(cond_resched_lock);
4105 4524
4106int __sched cond_resched_softirq(void) 4525int __sched cond_resched_softirq(void)
4107{ 4526{
4108 BUG_ON(!in_softirq()); 4527 BUG_ON(!in_softirq());
4109 4528
4110 if (need_resched()) { 4529 if (need_resched() && __resched_legal()) {
4111 __local_bh_enable(); 4530 raw_local_irq_disable();
4531 _local_bh_enable();
4532 raw_local_irq_enable();
4112 __cond_resched(); 4533 __cond_resched();
4113 local_bh_disable(); 4534 local_bh_disable();
4114 return 1; 4535 return 1;
4115 } 4536 }
4116 return 0; 4537 return 0;
4117} 4538}
4118
4119EXPORT_SYMBOL(cond_resched_softirq); 4539EXPORT_SYMBOL(cond_resched_softirq);
4120 4540
4121
4122/** 4541/**
4123 * yield - yield the current processor to other threads. 4542 * yield - yield the current processor to other threads.
4124 * 4543 *
@@ -4130,7 +4549,6 @@ void __sched yield(void)
4130 set_current_state(TASK_RUNNING); 4549 set_current_state(TASK_RUNNING);
4131 sys_sched_yield(); 4550 sys_sched_yield();
4132} 4551}
4133
4134EXPORT_SYMBOL(yield); 4552EXPORT_SYMBOL(yield);
4135 4553
4136/* 4554/*
@@ -4142,23 +4560,26 @@ EXPORT_SYMBOL(yield);
4142 */ 4560 */
4143void __sched io_schedule(void) 4561void __sched io_schedule(void)
4144{ 4562{
4145 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4563 struct rq *rq = &__raw_get_cpu_var(runqueues);
4146 4564
4565 delayacct_blkio_start();
4147 atomic_inc(&rq->nr_iowait); 4566 atomic_inc(&rq->nr_iowait);
4148 schedule(); 4567 schedule();
4149 atomic_dec(&rq->nr_iowait); 4568 atomic_dec(&rq->nr_iowait);
4569 delayacct_blkio_end();
4150} 4570}
4151
4152EXPORT_SYMBOL(io_schedule); 4571EXPORT_SYMBOL(io_schedule);
4153 4572
4154long __sched io_schedule_timeout(long timeout) 4573long __sched io_schedule_timeout(long timeout)
4155{ 4574{
4156 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4575 struct rq *rq = &__raw_get_cpu_var(runqueues);
4157 long ret; 4576 long ret;
4158 4577
4578 delayacct_blkio_start();
4159 atomic_inc(&rq->nr_iowait); 4579 atomic_inc(&rq->nr_iowait);
4160 ret = schedule_timeout(timeout); 4580 ret = schedule_timeout(timeout);
4161 atomic_dec(&rq->nr_iowait); 4581 atomic_dec(&rq->nr_iowait);
4582 delayacct_blkio_end();
4162 return ret; 4583 return ret;
4163} 4584}
4164 4585
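/*
 * Editor's sketch (not in the patch): what the delayacct_blkio_start()/_end()
 * pair brackets around io_schedule().  The idea is simply to timestamp entry
 * and exit of the blocked region and accumulate the difference as per-task
 * "block I/O delay"; the accumulator and names below are invented.
 */
#include <stdio.h>
#include <time.h>

static long long blkio_delay_ns;        /* per-task accumulator in the kernel */

static long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void toy_io_schedule(void)
{
        long long start = now_ns();     /* delayacct_blkio_start() */
        struct timespec pause = { 0, 10 * 1000 * 1000 };

        nanosleep(&pause, NULL);        /* stands in for schedule() during I/O wait */

        blkio_delay_ns += now_ns() - start;     /* delayacct_blkio_end() */
}

int main(void)
{
        toy_io_schedule();
        printf("blocked ~%lld ns waiting on I/O\n", blkio_delay_ns);
        return 0;
}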
@@ -4220,9 +4641,9 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4220asmlinkage 4641asmlinkage
4221long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4642long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4222{ 4643{
4644 struct task_struct *p;
4223 int retval = -EINVAL; 4645 int retval = -EINVAL;
4224 struct timespec t; 4646 struct timespec t;
4225 task_t *p;
4226 4647
4227 if (pid < 0) 4648 if (pid < 0)
4228 goto out_nounlock; 4649 goto out_nounlock;
@@ -4237,7 +4658,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4237 if (retval) 4658 if (retval)
4238 goto out_unlock; 4659 goto out_unlock;
4239 4660
4240 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4661 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4241 0 : task_timeslice(p), &t); 4662 0 : task_timeslice(p), &t);
4242 read_unlock(&tasklist_lock); 4663 read_unlock(&tasklist_lock);
4243 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4664 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4250,35 +4671,36 @@ out_unlock:
4250 4671
4251static inline struct task_struct *eldest_child(struct task_struct *p) 4672static inline struct task_struct *eldest_child(struct task_struct *p)
4252{ 4673{
4253 if (list_empty(&p->children)) return NULL; 4674 if (list_empty(&p->children))
4675 return NULL;
4254 return list_entry(p->children.next,struct task_struct,sibling); 4676 return list_entry(p->children.next,struct task_struct,sibling);
4255} 4677}
4256 4678
4257static inline struct task_struct *older_sibling(struct task_struct *p) 4679static inline struct task_struct *older_sibling(struct task_struct *p)
4258{ 4680{
4259 if (p->sibling.prev==&p->parent->children) return NULL; 4681 if (p->sibling.prev==&p->parent->children)
4682 return NULL;
4260 return list_entry(p->sibling.prev,struct task_struct,sibling); 4683 return list_entry(p->sibling.prev,struct task_struct,sibling);
4261} 4684}
4262 4685
4263static inline struct task_struct *younger_sibling(struct task_struct *p) 4686static inline struct task_struct *younger_sibling(struct task_struct *p)
4264{ 4687{
4265 if (p->sibling.next==&p->parent->children) return NULL; 4688 if (p->sibling.next==&p->parent->children)
4689 return NULL;
4266 return list_entry(p->sibling.next,struct task_struct,sibling); 4690 return list_entry(p->sibling.next,struct task_struct,sibling);
4267} 4691}
4268 4692
4269static void show_task(task_t *p) 4693static const char stat_nam[] = "RSDTtZX";
4694
4695static void show_task(struct task_struct *p)
4270{ 4696{
4271 task_t *relative; 4697 struct task_struct *relative;
4272 unsigned state;
4273 unsigned long free = 0; 4698 unsigned long free = 0;
4274 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4699 unsigned state;
4275 4700
4276 printk("%-13.13s ", p->comm);
4277 state = p->state ? __ffs(p->state) + 1 : 0; 4701 state = p->state ? __ffs(p->state) + 1 : 0;
4278 if (state < ARRAY_SIZE(stat_nam)) 4702 printk("%-13.13s %c", p->comm,
4279 printk(stat_nam[state]); 4703 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4280 else
4281 printk("?");
4282#if (BITS_PER_LONG == 32) 4704#if (BITS_PER_LONG == 32)
4283 if (state == TASK_RUNNING) 4705 if (state == TASK_RUNNING)
4284 printk(" running "); 4706 printk(" running ");
@@ -4322,7 +4744,7 @@ static void show_task(task_t *p)
4322 4744
4323void show_state(void) 4745void show_state(void)
4324{ 4746{
4325 task_t *g, *p; 4747 struct task_struct *g, *p;
4326 4748
4327#if (BITS_PER_LONG == 32) 4749#if (BITS_PER_LONG == 32)
4328 printk("\n" 4750 printk("\n"
@@ -4344,7 +4766,7 @@ void show_state(void)
4344 } while_each_thread(g, p); 4766 } while_each_thread(g, p);
4345 4767
4346 read_unlock(&tasklist_lock); 4768 read_unlock(&tasklist_lock);
4347 mutex_debug_show_all_locks(); 4769 debug_show_all_locks();
4348} 4770}
4349 4771
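/*
 * Editor's sketch (not in the patch): the state-letter lookup the reworked
 * show_task() above uses.  p->state is a bitmask (0 means running, bit 0
 * interruptible sleep, bit 1 uninterruptible, ...), so the index into
 * "RSDTtZX" is "position of the lowest set bit, plus one".  User-space ffs()
 * is 1-based and returns 0 for 0, which matches __ffs()+1 with the 0 guard.
 */
#include <stdio.h>
#include <strings.h>    /* ffs() */

static const char toy_stat_nam[] = "RSDTtZX";

static char toy_state_char(unsigned long state)
{
        unsigned int idx = state ? (unsigned int)ffs((int)state) : 0;

        return idx < sizeof(toy_stat_nam) - 1 ? toy_stat_nam[idx] : '?';
}

int main(void)
{
        printf("%c %c %c\n",
               toy_state_char(0),       /* R: running             */
               toy_state_char(1),       /* S: interruptible sleep */
               toy_state_char(2));      /* D: uninterruptible     */
        return 0;
}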
4350/** 4772/**
@@ -4355,15 +4777,15 @@ void show_state(void)
4355 * NOTE: this function does not set the idle thread's NEED_RESCHED 4777 * NOTE: this function does not set the idle thread's NEED_RESCHED
4356 * flag, to make booting more robust. 4778 * flag, to make booting more robust.
4357 */ 4779 */
4358void __devinit init_idle(task_t *idle, int cpu) 4780void __devinit init_idle(struct task_struct *idle, int cpu)
4359{ 4781{
4360 runqueue_t *rq = cpu_rq(cpu); 4782 struct rq *rq = cpu_rq(cpu);
4361 unsigned long flags; 4783 unsigned long flags;
4362 4784
4363 idle->timestamp = sched_clock(); 4785 idle->timestamp = sched_clock();
4364 idle->sleep_avg = 0; 4786 idle->sleep_avg = 0;
4365 idle->array = NULL; 4787 idle->array = NULL;
4366 idle->prio = MAX_PRIO; 4788 idle->prio = idle->normal_prio = MAX_PRIO;
4367 idle->state = TASK_RUNNING; 4789 idle->state = TASK_RUNNING;
4368 idle->cpus_allowed = cpumask_of_cpu(cpu); 4790 idle->cpus_allowed = cpumask_of_cpu(cpu);
4369 set_task_cpu(idle, cpu); 4791 set_task_cpu(idle, cpu);
@@ -4396,7 +4818,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4396/* 4818/*
4397 * This is how migration works: 4819 * This is how migration works:
4398 * 4820 *
4399 * 1) we queue a migration_req_t structure in the source CPU's 4821 * 1) we queue a struct migration_req structure in the source CPU's
4400 * runqueue and wake up that CPU's migration thread. 4822 * runqueue and wake up that CPU's migration thread.
4401 * 2) we down() the locked semaphore => thread blocks. 4823 * 2) we down() the locked semaphore => thread blocks.
4402 * 3) migration thread wakes up (implicitly it forces the migrated 4824 * 3) migration thread wakes up (implicitly it forces the migrated
@@ -4418,12 +4840,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4418 * task must not exit() & deallocate itself prematurely. The 4840 * task must not exit() & deallocate itself prematurely. The
4419 * call is not atomic; no spinlocks may be held. 4841 * call is not atomic; no spinlocks may be held.
4420 */ 4842 */
4421int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4843int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4422{ 4844{
4845 struct migration_req req;
4423 unsigned long flags; 4846 unsigned long flags;
4847 struct rq *rq;
4424 int ret = 0; 4848 int ret = 0;
4425 migration_req_t req;
4426 runqueue_t *rq;
4427 4849
4428 rq = task_rq_lock(p, &flags); 4850 rq = task_rq_lock(p, &flags);
4429 if (!cpus_intersects(new_mask, cpu_online_map)) { 4851 if (!cpus_intersects(new_mask, cpu_online_map)) {
@@ -4446,9 +4868,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4446 } 4868 }
4447out: 4869out:
4448 task_rq_unlock(rq, &flags); 4870 task_rq_unlock(rq, &flags);
4871
4449 return ret; 4872 return ret;
4450} 4873}
4451
4452EXPORT_SYMBOL_GPL(set_cpus_allowed); 4874EXPORT_SYMBOL_GPL(set_cpus_allowed);
4453 4875
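/*
 * Editor's sketch (not in the patch): the request/completion handshake
 * behind set_cpus_allowed().  The caller queues a request carrying a
 * completion, kicks the per-CPU migration thread, and sleeps until the
 * thread marks the request done.  A pthread condition variable stands in
 * for the kernel's completion; all toy_* names are invented.
 */
#include <pthread.h>
#include <stdio.h>

struct toy_req {                        /* ~ struct migration_req */
        int done;                       /* ~ struct completion done */
        pthread_mutex_t lock;
        pthread_cond_t cond;
};

static void *toy_migration_thread(void *arg)    /* runs "on" the source CPU */
{
        struct toy_req *req = arg;

        /* ... actually move the task between runqueues here ... */

        pthread_mutex_lock(&req->lock);
        req->done = 1;                          /* complete(&req->done) */
        pthread_cond_signal(&req->cond);
        pthread_mutex_unlock(&req->lock);
        return NULL;
}

int main(void)
{
        struct toy_req req = { 0, PTHREAD_MUTEX_INITIALIZER,
                               PTHREAD_COND_INITIALIZER };
        pthread_t thr;

        pthread_create(&thr, NULL, toy_migration_thread, &req);

        pthread_mutex_lock(&req.lock);          /* wait_for_completion(&req.done) */
        while (!req.done)
                pthread_cond_wait(&req.cond, &req.lock);
        pthread_mutex_unlock(&req.lock);

        pthread_join(thr, NULL);
        printf("migration request completed\n");
        return 0;
}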
4454/* 4876/*
@@ -4459,13 +4881,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4459 * 4881 *
4460 * So we race with normal scheduler movements, but that's OK, as long 4882 * So we race with normal scheduler movements, but that's OK, as long
4461 * as the task is no longer on this CPU. 4883 * as the task is no longer on this CPU.
4884 *
4885 * Returns non-zero if task was successfully migrated.
4462 */ 4886 */
4463static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4887static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4464{ 4888{
4465 runqueue_t *rq_dest, *rq_src; 4889 struct rq *rq_dest, *rq_src;
4890 int ret = 0;
4466 4891
4467 if (unlikely(cpu_is_offline(dest_cpu))) 4892 if (unlikely(cpu_is_offline(dest_cpu)))
4468 return; 4893 return ret;
4469 4894
4470 rq_src = cpu_rq(src_cpu); 4895 rq_src = cpu_rq(src_cpu);
4471 rq_dest = cpu_rq(dest_cpu); 4896 rq_dest = cpu_rq(dest_cpu);
@@ -4489,13 +4914,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4489 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4914 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4490 + rq_dest->timestamp_last_tick; 4915 + rq_dest->timestamp_last_tick;
4491 deactivate_task(p, rq_src); 4916 deactivate_task(p, rq_src);
4492 activate_task(p, rq_dest, 0); 4917 __activate_task(p, rq_dest);
4493 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4918 if (TASK_PREEMPTS_CURR(p, rq_dest))
4494 resched_task(rq_dest->curr); 4919 resched_task(rq_dest->curr);
4495 } 4920 }
4496 4921 ret = 1;
4497out: 4922out:
4498 double_rq_unlock(rq_src, rq_dest); 4923 double_rq_unlock(rq_src, rq_dest);
4924 return ret;
4499} 4925}
4500 4926
4501/* 4927/*
@@ -4505,16 +4931,16 @@ out:
4505 */ 4931 */
4506static int migration_thread(void *data) 4932static int migration_thread(void *data)
4507{ 4933{
4508 runqueue_t *rq;
4509 int cpu = (long)data; 4934 int cpu = (long)data;
4935 struct rq *rq;
4510 4936
4511 rq = cpu_rq(cpu); 4937 rq = cpu_rq(cpu);
4512 BUG_ON(rq->migration_thread != current); 4938 BUG_ON(rq->migration_thread != current);
4513 4939
4514 set_current_state(TASK_INTERRUPTIBLE); 4940 set_current_state(TASK_INTERRUPTIBLE);
4515 while (!kthread_should_stop()) { 4941 while (!kthread_should_stop()) {
4942 struct migration_req *req;
4516 struct list_head *head; 4943 struct list_head *head;
4517 migration_req_t *req;
4518 4944
4519 try_to_freeze(); 4945 try_to_freeze();
4520 4946
@@ -4538,7 +4964,7 @@ static int migration_thread(void *data)
4538 set_current_state(TASK_INTERRUPTIBLE); 4964 set_current_state(TASK_INTERRUPTIBLE);
4539 continue; 4965 continue;
4540 } 4966 }
4541 req = list_entry(head->next, migration_req_t, list); 4967 req = list_entry(head->next, struct migration_req, list);
4542 list_del_init(head->next); 4968 list_del_init(head->next);
4543 4969
4544 spin_unlock(&rq->lock); 4970 spin_unlock(&rq->lock);
@@ -4563,36 +4989,42 @@ wait_to_die:
4563 4989
4564#ifdef CONFIG_HOTPLUG_CPU 4990#ifdef CONFIG_HOTPLUG_CPU
 4565/* Figure out where task on dead CPU should go, use force if necessary. */ 4991
4566static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4992static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4567{ 4993{
4568 int dest_cpu; 4994 unsigned long flags;
4569 cpumask_t mask; 4995 cpumask_t mask;
4996 struct rq *rq;
4997 int dest_cpu;
4570 4998
4999restart:
4571 /* On same node? */ 5000 /* On same node? */
4572 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5001 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4573 cpus_and(mask, mask, tsk->cpus_allowed); 5002 cpus_and(mask, mask, p->cpus_allowed);
4574 dest_cpu = any_online_cpu(mask); 5003 dest_cpu = any_online_cpu(mask);
4575 5004
4576 /* On any allowed CPU? */ 5005 /* On any allowed CPU? */
4577 if (dest_cpu == NR_CPUS) 5006 if (dest_cpu == NR_CPUS)
4578 dest_cpu = any_online_cpu(tsk->cpus_allowed); 5007 dest_cpu = any_online_cpu(p->cpus_allowed);
4579 5008
4580 /* No more Mr. Nice Guy. */ 5009 /* No more Mr. Nice Guy. */
4581 if (dest_cpu == NR_CPUS) { 5010 if (dest_cpu == NR_CPUS) {
4582 cpus_setall(tsk->cpus_allowed); 5011 rq = task_rq_lock(p, &flags);
4583 dest_cpu = any_online_cpu(tsk->cpus_allowed); 5012 cpus_setall(p->cpus_allowed);
5013 dest_cpu = any_online_cpu(p->cpus_allowed);
5014 task_rq_unlock(rq, &flags);
4584 5015
4585 /* 5016 /*
4586 * Don't tell them about moving exiting tasks or 5017 * Don't tell them about moving exiting tasks or
4587 * kernel threads (both mm NULL), since they never 5018 * kernel threads (both mm NULL), since they never
4588 * leave kernel. 5019 * leave kernel.
4589 */ 5020 */
4590 if (tsk->mm && printk_ratelimit()) 5021 if (p->mm && printk_ratelimit())
4591 printk(KERN_INFO "process %d (%s) no " 5022 printk(KERN_INFO "process %d (%s) no "
4592 "longer affine to cpu%d\n", 5023 "longer affine to cpu%d\n",
4593 tsk->pid, tsk->comm, dead_cpu); 5024 p->pid, p->comm, dead_cpu);
4594 } 5025 }
4595 __migrate_task(tsk, dead_cpu, dest_cpu); 5026 if (!__migrate_task(p, dead_cpu, dest_cpu))
5027 goto restart;
4596} 5028}
4597 5029
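/*
 * Editor's sketch (not in the patch): the widening search performed by
 * move_task_off_dead_cpu(), plus the new "retry if the migration lost the
 * race" loop.  CPU masks are plain bitmasks here and the migration itself
 * is faked with a failure counter; all names and mask values are invented.
 */
#include <stdio.h>

static unsigned int online_mask = 0x5;          /* CPUs 0 and 2 online          */
static unsigned int same_node_mask = 0x3;       /* CPUs 0 and 1 share the node  */
static int migrate_failures = 1;                /* fail once to show the retry  */

static int toy_pick_cpu(unsigned int allowed)
{
        unsigned int m;
        int cpu;

        m = allowed & same_node_mask & online_mask;     /* 1) same node?       */
        if (!m)
                m = allowed & online_mask;              /* 2) any allowed CPU? */
        if (!m)
                m = online_mask;                        /* 3) no more Mr. Nice Guy */
        for (cpu = 0; cpu < 32; cpu++)
                if (m & (1u << cpu))
                        return cpu;
        return -1;
}

static int toy_try_migrate(int cpu)             /* ~ __migrate_task(): may fail */
{
        printf("trying CPU %d\n", cpu);
        return migrate_failures-- <= 0;
}

int main(void)
{
        unsigned int allowed = 0x2;     /* only CPU 1 allowed, and it is offline */
        int dest;

        do {                            /* the "goto restart" loop in the patch */
                dest = toy_pick_cpu(allowed);
        } while (!toy_try_migrate(dest));

        printf("task moved to CPU %d\n", dest);
        return 0;
}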
4598/* 5030/*
@@ -4602,9 +5034,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4602 * their home CPUs. So we just add the counter to another CPU's counter, 5034 * their home CPUs. So we just add the counter to another CPU's counter,
4603 * to keep the global sum constant after CPU-down: 5035 * to keep the global sum constant after CPU-down:
4604 */ 5036 */
4605static void migrate_nr_uninterruptible(runqueue_t *rq_src) 5037static void migrate_nr_uninterruptible(struct rq *rq_src)
4606{ 5038{
4607 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5039 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4608 unsigned long flags; 5040 unsigned long flags;
4609 5041
4610 local_irq_save(flags); 5042 local_irq_save(flags);
@@ -4618,48 +5050,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4618/* Run through task list and migrate tasks from the dead cpu. */ 5050/* Run through task list and migrate tasks from the dead cpu. */
4619static void migrate_live_tasks(int src_cpu) 5051static void migrate_live_tasks(int src_cpu)
4620{ 5052{
4621 struct task_struct *tsk, *t; 5053 struct task_struct *p, *t;
4622 5054
4623 write_lock_irq(&tasklist_lock); 5055 write_lock_irq(&tasklist_lock);
4624 5056
4625 do_each_thread(t, tsk) { 5057 do_each_thread(t, p) {
4626 if (tsk == current) 5058 if (p == current)
4627 continue; 5059 continue;
4628 5060
4629 if (task_cpu(tsk) == src_cpu) 5061 if (task_cpu(p) == src_cpu)
4630 move_task_off_dead_cpu(src_cpu, tsk); 5062 move_task_off_dead_cpu(src_cpu, p);
4631 } while_each_thread(t, tsk); 5063 } while_each_thread(t, p);
4632 5064
4633 write_unlock_irq(&tasklist_lock); 5065 write_unlock_irq(&tasklist_lock);
4634} 5066}
4635 5067
4636/* Schedules idle task to be the next runnable task on current CPU. 5068/* Schedules idle task to be the next runnable task on current CPU.
4637 * It does so by boosting its priority to highest possible and adding it to 5069 * It does so by boosting its priority to highest possible and adding it to
4638 * the _front_ of runqueue. Used by CPU offline code. 5070 * the _front_ of the runqueue. Used by CPU offline code.
4639 */ 5071 */
4640void sched_idle_next(void) 5072void sched_idle_next(void)
4641{ 5073{
4642 int cpu = smp_processor_id(); 5074 int this_cpu = smp_processor_id();
4643 runqueue_t *rq = this_rq(); 5075 struct rq *rq = cpu_rq(this_cpu);
4644 struct task_struct *p = rq->idle; 5076 struct task_struct *p = rq->idle;
4645 unsigned long flags; 5077 unsigned long flags;
4646 5078
4647 /* cpu has to be offline */ 5079 /* cpu has to be offline */
4648 BUG_ON(cpu_online(cpu)); 5080 BUG_ON(cpu_online(this_cpu));
4649 5081
4650 /* Strictly not necessary since rest of the CPUs are stopped by now 5082 /*
4651 * and interrupts disabled on current cpu. 5083 * Strictly not necessary since rest of the CPUs are stopped by now
5084 * and interrupts disabled on the current cpu.
4652 */ 5085 */
4653 spin_lock_irqsave(&rq->lock, flags); 5086 spin_lock_irqsave(&rq->lock, flags);
4654 5087
4655 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5088 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4656 /* Add idle task to _front_ of it's priority queue */ 5089
5090 /* Add idle task to the _front_ of its priority queue: */
4657 __activate_idle_task(p, rq); 5091 __activate_idle_task(p, rq);
4658 5092
4659 spin_unlock_irqrestore(&rq->lock, flags); 5093 spin_unlock_irqrestore(&rq->lock, flags);
4660} 5094}
4661 5095
4662/* Ensures that the idle task is using init_mm right before its cpu goes 5096/*
5097 * Ensures that the idle task is using init_mm right before its cpu goes
4663 * offline. 5098 * offline.
4664 */ 5099 */
4665void idle_task_exit(void) 5100void idle_task_exit(void)
@@ -4673,17 +5108,17 @@ void idle_task_exit(void)
4673 mmdrop(mm); 5108 mmdrop(mm);
4674} 5109}
4675 5110
4676static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 5111static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
4677{ 5112{
4678 struct runqueue *rq = cpu_rq(dead_cpu); 5113 struct rq *rq = cpu_rq(dead_cpu);
4679 5114
4680 /* Must be exiting, otherwise would be on tasklist. */ 5115 /* Must be exiting, otherwise would be on tasklist. */
4681 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 5116 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
4682 5117
4683 /* Cannot have done final schedule yet: would have vanished. */ 5118 /* Cannot have done final schedule yet: would have vanished. */
4684 BUG_ON(tsk->flags & PF_DEAD); 5119 BUG_ON(p->flags & PF_DEAD);
4685 5120
4686 get_task_struct(tsk); 5121 get_task_struct(p);
4687 5122
4688 /* 5123 /*
4689 * Drop lock around migration; if someone else moves it, 5124 * Drop lock around migration; if someone else moves it,
@@ -4691,25 +5126,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4691 * fine. 5126 * fine.
4692 */ 5127 */
4693 spin_unlock_irq(&rq->lock); 5128 spin_unlock_irq(&rq->lock);
4694 move_task_off_dead_cpu(dead_cpu, tsk); 5129 move_task_off_dead_cpu(dead_cpu, p);
4695 spin_lock_irq(&rq->lock); 5130 spin_lock_irq(&rq->lock);
4696 5131
4697 put_task_struct(tsk); 5132 put_task_struct(p);
4698} 5133}
4699 5134
4700/* release_task() removes task from tasklist, so we won't find dead tasks. */ 5135/* release_task() removes task from tasklist, so we won't find dead tasks. */
4701static void migrate_dead_tasks(unsigned int dead_cpu) 5136static void migrate_dead_tasks(unsigned int dead_cpu)
4702{ 5137{
4703 unsigned arr, i; 5138 struct rq *rq = cpu_rq(dead_cpu);
4704 struct runqueue *rq = cpu_rq(dead_cpu); 5139 unsigned int arr, i;
4705 5140
4706 for (arr = 0; arr < 2; arr++) { 5141 for (arr = 0; arr < 2; arr++) {
4707 for (i = 0; i < MAX_PRIO; i++) { 5142 for (i = 0; i < MAX_PRIO; i++) {
4708 struct list_head *list = &rq->arrays[arr].queue[i]; 5143 struct list_head *list = &rq->arrays[arr].queue[i];
5144
4709 while (!list_empty(list)) 5145 while (!list_empty(list))
4710 migrate_dead(dead_cpu, 5146 migrate_dead(dead_cpu, list_entry(list->next,
4711 list_entry(list->next, task_t, 5147 struct task_struct, run_list));
4712 run_list));
4713 } 5148 }
4714 } 5149 }
4715} 5150}
@@ -4719,13 +5154,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4719 * migration_call - callback that gets triggered when a CPU is added. 5154 * migration_call - callback that gets triggered when a CPU is added.
4720 * Here we can start up the necessary migration thread for the new CPU. 5155 * Here we can start up the necessary migration thread for the new CPU.
4721 */ 5156 */
4722static int migration_call(struct notifier_block *nfb, unsigned long action, 5157static int __cpuinit
4723 void *hcpu) 5158migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4724{ 5159{
4725 int cpu = (long)hcpu;
4726 struct task_struct *p; 5160 struct task_struct *p;
4727 struct runqueue *rq; 5161 int cpu = (long)hcpu;
4728 unsigned long flags; 5162 unsigned long flags;
5163 struct rq *rq;
4729 5164
4730 switch (action) { 5165 switch (action) {
4731 case CPU_UP_PREPARE: 5166 case CPU_UP_PREPARE:
@@ -4740,18 +5175,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4740 task_rq_unlock(rq, &flags); 5175 task_rq_unlock(rq, &flags);
4741 cpu_rq(cpu)->migration_thread = p; 5176 cpu_rq(cpu)->migration_thread = p;
4742 break; 5177 break;
5178
4743 case CPU_ONLINE: 5179 case CPU_ONLINE:
 4744 /* Strictly unnecessary, as first user will wake it. */ 5180 /* Strictly unnecessary, as first user will wake it. */
4745 wake_up_process(cpu_rq(cpu)->migration_thread); 5181 wake_up_process(cpu_rq(cpu)->migration_thread);
4746 break; 5182 break;
5183
4747#ifdef CONFIG_HOTPLUG_CPU 5184#ifdef CONFIG_HOTPLUG_CPU
4748 case CPU_UP_CANCELED: 5185 case CPU_UP_CANCELED:
5186 if (!cpu_rq(cpu)->migration_thread)
5187 break;
4749 /* Unbind it from offline cpu so it can run. Fall thru. */ 5188 /* Unbind it from offline cpu so it can run. Fall thru. */
4750 kthread_bind(cpu_rq(cpu)->migration_thread, 5189 kthread_bind(cpu_rq(cpu)->migration_thread,
4751 any_online_cpu(cpu_online_map)); 5190 any_online_cpu(cpu_online_map));
4752 kthread_stop(cpu_rq(cpu)->migration_thread); 5191 kthread_stop(cpu_rq(cpu)->migration_thread);
4753 cpu_rq(cpu)->migration_thread = NULL; 5192 cpu_rq(cpu)->migration_thread = NULL;
4754 break; 5193 break;
5194
4755 case CPU_DEAD: 5195 case CPU_DEAD:
4756 migrate_live_tasks(cpu); 5196 migrate_live_tasks(cpu);
4757 rq = cpu_rq(cpu); 5197 rq = cpu_rq(cpu);
@@ -4772,9 +5212,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4772 * the requestors. */ 5212 * the requestors. */
4773 spin_lock_irq(&rq->lock); 5213 spin_lock_irq(&rq->lock);
4774 while (!list_empty(&rq->migration_queue)) { 5214 while (!list_empty(&rq->migration_queue)) {
4775 migration_req_t *req; 5215 struct migration_req *req;
5216
4776 req = list_entry(rq->migration_queue.next, 5217 req = list_entry(rq->migration_queue.next,
4777 migration_req_t, list); 5218 struct migration_req, list);
4778 list_del_init(&req->list); 5219 list_del_init(&req->list);
4779 complete(&req->done); 5220 complete(&req->done);
4780 } 5221 }
@@ -4788,7 +5229,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4788/* Register at highest priority so that task migration (migrate_all_tasks) 5229/* Register at highest priority so that task migration (migrate_all_tasks)
4789 * happens before everything else. 5230 * happens before everything else.
4790 */ 5231 */
4791static struct notifier_block migration_notifier = { 5232static struct notifier_block __cpuinitdata migration_notifier = {
4792 .notifier_call = migration_call, 5233 .notifier_call = migration_call,
4793 .priority = 10 5234 .priority = 10
4794}; 5235};
@@ -4796,10 +5237,12 @@ static struct notifier_block migration_notifier = {
4796int __init migration_init(void) 5237int __init migration_init(void)
4797{ 5238{
4798 void *cpu = (void *)(long)smp_processor_id(); 5239 void *cpu = (void *)(long)smp_processor_id();
4799 /* Start one for boot CPU. */ 5240
5241 /* Start one for the boot CPU: */
4800 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5242 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4801 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5243 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4802 register_cpu_notifier(&migration_notifier); 5244 register_cpu_notifier(&migration_notifier);
5245
4803 return 0; 5246 return 0;
4804} 5247}
4805#endif 5248#endif
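/*
 * Editor's sketch (not in the patch): the bootstrap shape of migration_init().
 * The boot CPU never receives CPU_UP_PREPARE/CPU_ONLINE notifications, so the
 * callback is invoked by hand for it before the notifier is registered for
 * all future hotplug events.  The registry below is a toy; only the callback
 * structure mirrors the kernel's notifier_block.
 */
#include <stdio.h>

#define TOY_CPU_UP_PREPARE      1
#define TOY_CPU_ONLINE          2

struct toy_notifier {
        int (*notifier_call)(unsigned long action, void *hcpu);
        int priority;
};

static int toy_migration_call(unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        if (action == TOY_CPU_UP_PREPARE)
                printf("cpu%ld: create migration thread\n", cpu);
        else if (action == TOY_CPU_ONLINE)
                printf("cpu%ld: wake migration thread\n", cpu);
        return 0;
}

static struct toy_notifier toy_migration_notifier = {
        .notifier_call  = toy_migration_call,
        .priority       = 10,   /* run ahead of lower-priority callbacks */
};

static void toy_register(struct toy_notifier *nb)
{
        (void)nb;       /* a real registry would insert into a sorted list */
}

int main(void)
{
        void *cpu = (void *)0L;         /* the boot CPU */

        /* replay the events the boot CPU has already missed */
        toy_migration_call(TOY_CPU_UP_PREPARE, cpu);
        toy_migration_call(TOY_CPU_ONLINE, cpu);
        toy_register(&toy_migration_notifier);
        return 0;
}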
@@ -4895,7 +5338,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4895 } while (sd); 5338 } while (sd);
4896} 5339}
4897#else 5340#else
4898#define sched_domain_debug(sd, cpu) {} 5341# define sched_domain_debug(sd, cpu) do { } while (0)
4899#endif 5342#endif
4900 5343
4901static int sd_degenerate(struct sched_domain *sd) 5344static int sd_degenerate(struct sched_domain *sd)
@@ -4921,8 +5364,8 @@ static int sd_degenerate(struct sched_domain *sd)
4921 return 1; 5364 return 1;
4922} 5365}
4923 5366
4924static int sd_parent_degenerate(struct sched_domain *sd, 5367static int
4925 struct sched_domain *parent) 5368sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4926{ 5369{
4927 unsigned long cflags = sd->flags, pflags = parent->flags; 5370 unsigned long cflags = sd->flags, pflags = parent->flags;
4928 5371
@@ -4955,7 +5398,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4955 */ 5398 */
4956static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5399static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4957{ 5400{
4958 runqueue_t *rq = cpu_rq(cpu); 5401 struct rq *rq = cpu_rq(cpu);
4959 struct sched_domain *tmp; 5402 struct sched_domain *tmp;
4960 5403
4961 /* Remove the sched domains which do not contribute to scheduling. */ 5404 /* Remove the sched domains which do not contribute to scheduling. */
@@ -5217,8 +5660,8 @@ static void touch_cache(void *__cache, unsigned long __size)
5217/* 5660/*
5218 * Measure the cache-cost of one task migration. Returns in units of nsec. 5661 * Measure the cache-cost of one task migration. Returns in units of nsec.
5219 */ 5662 */
5220static unsigned long long measure_one(void *cache, unsigned long size, 5663static unsigned long long
5221 int source, int target) 5664measure_one(void *cache, unsigned long size, int source, int target)
5222{ 5665{
5223 cpumask_t mask, saved_mask; 5666 cpumask_t mask, saved_mask;
5224 unsigned long long t0, t1, t2, t3, cost; 5667 unsigned long long t0, t1, t2, t3, cost;
@@ -5370,7 +5813,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5370 cache = vmalloc(max_size); 5813 cache = vmalloc(max_size);
5371 if (!cache) { 5814 if (!cache) {
5372 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5815 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5373 return 1000000; // return 1 msec on very small boxen 5816 return 1000000; /* return 1 msec on very small boxen */
5374 } 5817 }
5375 5818
5376 while (size <= max_size) { 5819 while (size <= max_size) {
@@ -5568,9 +6011,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
5568 */ 6011 */
5569static cpumask_t sched_domain_node_span(int node) 6012static cpumask_t sched_domain_node_span(int node)
5570{ 6013{
5571 int i;
5572 cpumask_t span, nodemask;
5573 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 6014 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6015 cpumask_t span, nodemask;
6016 int i;
5574 6017
5575 cpus_clear(span); 6018 cpus_clear(span);
5576 bitmap_zero(used_nodes, MAX_NUMNODES); 6019 bitmap_zero(used_nodes, MAX_NUMNODES);
@@ -5581,6 +6024,7 @@ static cpumask_t sched_domain_node_span(int node)
5581 6024
5582 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6025 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5583 int next_node = find_next_best_node(node, used_nodes); 6026 int next_node = find_next_best_node(node, used_nodes);
6027
5584 nodemask = node_to_cpumask(next_node); 6028 nodemask = node_to_cpumask(next_node);
5585 cpus_or(span, span, nodemask); 6029 cpus_or(span, span, nodemask);
5586 } 6030 }
@@ -5589,22 +6033,27 @@ static cpumask_t sched_domain_node_span(int node)
5589} 6033}
5590#endif 6034#endif
5591 6035
6036int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6037
5592/* 6038/*
5593 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 6039 * SMT sched-domains:
5594 * can switch it on easily if needed.
5595 */ 6040 */
5596#ifdef CONFIG_SCHED_SMT 6041#ifdef CONFIG_SCHED_SMT
5597static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6042static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5598static struct sched_group sched_group_cpus[NR_CPUS]; 6043static struct sched_group sched_group_cpus[NR_CPUS];
6044
5599static int cpu_to_cpu_group(int cpu) 6045static int cpu_to_cpu_group(int cpu)
5600{ 6046{
5601 return cpu; 6047 return cpu;
5602} 6048}
5603#endif 6049#endif
5604 6050
6051/*
6052 * multi-core sched-domains:
6053 */
5605#ifdef CONFIG_SCHED_MC 6054#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains); 6055static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS]; 6056static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5608#endif 6057#endif
5609 6058
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6059#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5620,10 +6069,11 @@ static int cpu_to_core_group(int cpu)
5620#endif 6069#endif
5621 6070
5622static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6071static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5623static struct sched_group sched_group_phys[NR_CPUS]; 6072static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
6073
5624static int cpu_to_phys_group(int cpu) 6074static int cpu_to_phys_group(int cpu)
5625{ 6075{
5626#if defined(CONFIG_SCHED_MC) 6076#ifdef CONFIG_SCHED_MC
5627 cpumask_t mask = cpu_coregroup_map(cpu); 6077 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask); 6078 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT) 6079#elif defined(CONFIG_SCHED_SMT)
@@ -5677,13 +6127,74 @@ next_sg:
5677} 6127}
5678#endif 6128#endif
5679 6129
6130/* Free memory allocated for various sched_group structures */
6131static void free_sched_groups(const cpumask_t *cpu_map)
6132{
6133 int cpu;
6134#ifdef CONFIG_NUMA
6135 int i;
6136
6137 for_each_cpu_mask(cpu, *cpu_map) {
6138 struct sched_group *sched_group_allnodes
6139 = sched_group_allnodes_bycpu[cpu];
6140 struct sched_group **sched_group_nodes
6141 = sched_group_nodes_bycpu[cpu];
6142
6143 if (sched_group_allnodes) {
6144 kfree(sched_group_allnodes);
6145 sched_group_allnodes_bycpu[cpu] = NULL;
6146 }
6147
6148 if (!sched_group_nodes)
6149 continue;
6150
6151 for (i = 0; i < MAX_NUMNODES; i++) {
6152 cpumask_t nodemask = node_to_cpumask(i);
6153 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6154
6155 cpus_and(nodemask, nodemask, *cpu_map);
6156 if (cpus_empty(nodemask))
6157 continue;
6158
6159 if (sg == NULL)
6160 continue;
6161 sg = sg->next;
6162next_sg:
6163 oldsg = sg;
6164 sg = sg->next;
6165 kfree(oldsg);
6166 if (oldsg != sched_group_nodes[i])
6167 goto next_sg;
6168 }
6169 kfree(sched_group_nodes);
6170 sched_group_nodes_bycpu[cpu] = NULL;
6171 }
6172#endif
6173 for_each_cpu_mask(cpu, *cpu_map) {
6174 if (sched_group_phys_bycpu[cpu]) {
6175 kfree(sched_group_phys_bycpu[cpu]);
6176 sched_group_phys_bycpu[cpu] = NULL;
6177 }
6178#ifdef CONFIG_SCHED_MC
6179 if (sched_group_core_bycpu[cpu]) {
6180 kfree(sched_group_core_bycpu[cpu]);
6181 sched_group_core_bycpu[cpu] = NULL;
6182 }
6183#endif
6184 }
6185}
6186
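/*
 * Editor's sketch (not in the patch): the sched_group lists freed above are
 * circular and singly linked (the last group's ->next points back at the
 * first), so the teardown walks from head->next and stops once the head
 * itself has been handed back to the allocator.  Toy malloc'd nodes below.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_group {
        int id;
        struct toy_group *next;         /* circular: last->next == first */
};

static void toy_free_circular(struct toy_group *head)
{
        struct toy_group *sg = head->next, *oldsg;

        do {
                oldsg = sg;
                sg = sg->next;          /* read the link before freeing */
                printf("freeing group %d\n", oldsg->id);
                free(oldsg);
        } while (oldsg != head);        /* the head is freed last */
}

int main(void)
{
        struct toy_group *a = malloc(sizeof(*a));
        struct toy_group *b = malloc(sizeof(*b));
        struct toy_group *c = malloc(sizeof(*c));

        a->id = 0; b->id = 1; c->id = 2;
        a->next = b; b->next = c; c->next = a;  /* close the ring */

        toy_free_circular(a);
        return 0;
}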
5680/* 6187/*
5681 * Build sched domains for a given set of cpus and attach the sched domains 6188 * Build sched domains for a given set of cpus and attach the sched domains
5682 * to the individual cpus 6189 * to the individual cpus
5683 */ 6190 */
5684void build_sched_domains(const cpumask_t *cpu_map) 6191static int build_sched_domains(const cpumask_t *cpu_map)
5685{ 6192{
5686 int i; 6193 int i;
6194 struct sched_group *sched_group_phys = NULL;
6195#ifdef CONFIG_SCHED_MC
6196 struct sched_group *sched_group_core = NULL;
6197#endif
5687#ifdef CONFIG_NUMA 6198#ifdef CONFIG_NUMA
5688 struct sched_group **sched_group_nodes = NULL; 6199 struct sched_group **sched_group_nodes = NULL;
5689 struct sched_group *sched_group_allnodes = NULL; 6200 struct sched_group *sched_group_allnodes = NULL;
@@ -5691,11 +6202,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5691 /* 6202 /*
5692 * Allocate the per-node list of sched groups 6203 * Allocate the per-node list of sched groups
5693 */ 6204 */
5694 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6205 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5695 GFP_ATOMIC); 6206 GFP_KERNEL);
5696 if (!sched_group_nodes) { 6207 if (!sched_group_nodes) {
5697 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6208 printk(KERN_WARNING "Can not alloc sched group node list\n");
5698 return; 6209 return -ENOMEM;
5699 } 6210 }
5700 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6211 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5701#endif 6212#endif
@@ -5721,7 +6232,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5721 if (!sched_group_allnodes) { 6232 if (!sched_group_allnodes) {
5722 printk(KERN_WARNING 6233 printk(KERN_WARNING
5723 "Can not alloc allnodes sched group\n"); 6234 "Can not alloc allnodes sched group\n");
5724 break; 6235 goto error;
5725 } 6236 }
5726 sched_group_allnodes_bycpu[i] 6237 sched_group_allnodes_bycpu[i]
5727 = sched_group_allnodes; 6238 = sched_group_allnodes;
@@ -5742,6 +6253,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5742 cpus_and(sd->span, sd->span, *cpu_map); 6253 cpus_and(sd->span, sd->span, *cpu_map);
5743#endif 6254#endif
5744 6255
6256 if (!sched_group_phys) {
6257 sched_group_phys
6258 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6259 GFP_KERNEL);
6260 if (!sched_group_phys) {
6261 printk (KERN_WARNING "Can not alloc phys sched"
6262 "group\n");
6263 goto error;
6264 }
6265 sched_group_phys_bycpu[i] = sched_group_phys;
6266 }
6267
5745 p = sd; 6268 p = sd;
5746 sd = &per_cpu(phys_domains, i); 6269 sd = &per_cpu(phys_domains, i);
5747 group = cpu_to_phys_group(i); 6270 group = cpu_to_phys_group(i);
@@ -5751,6 +6274,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5751 sd->groups = &sched_group_phys[group]; 6274 sd->groups = &sched_group_phys[group];
5752 6275
5753#ifdef CONFIG_SCHED_MC 6276#ifdef CONFIG_SCHED_MC
6277 if (!sched_group_core) {
6278 sched_group_core
6279 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6280 GFP_KERNEL);
6281 if (!sched_group_core) {
6282 printk (KERN_WARNING "Can not alloc core sched"
6283 "group\n");
6284 goto error;
6285 }
6286 sched_group_core_bycpu[i] = sched_group_core;
6287 }
6288
5754 p = sd; 6289 p = sd;
5755 sd = &per_cpu(core_domains, i); 6290 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i); 6291 group = cpu_to_core_group(i);
@@ -5834,24 +6369,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5834 domainspan = sched_domain_node_span(i); 6369 domainspan = sched_domain_node_span(i);
5835 cpus_and(domainspan, domainspan, *cpu_map); 6370 cpus_and(domainspan, domainspan, *cpu_map);
5836 6371
5837 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6372 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6373 if (!sg) {
6374 printk(KERN_WARNING "Can not alloc domain group for "
6375 "node %d\n", i);
6376 goto error;
6377 }
5838 sched_group_nodes[i] = sg; 6378 sched_group_nodes[i] = sg;
5839 for_each_cpu_mask(j, nodemask) { 6379 for_each_cpu_mask(j, nodemask) {
5840 struct sched_domain *sd; 6380 struct sched_domain *sd;
5841 sd = &per_cpu(node_domains, j); 6381 sd = &per_cpu(node_domains, j);
5842 sd->groups = sg; 6382 sd->groups = sg;
5843 if (sd->groups == NULL) {
5844 /* Turn off balancing if we have no groups */
5845 sd->flags = 0;
5846 }
5847 }
5848 if (!sg) {
5849 printk(KERN_WARNING
5850 "Can not alloc domain group for node %d\n", i);
5851 continue;
5852 } 6383 }
5853 sg->cpu_power = 0; 6384 sg->cpu_power = 0;
5854 sg->cpumask = nodemask; 6385 sg->cpumask = nodemask;
6386 sg->next = sg;
5855 cpus_or(covered, covered, nodemask); 6387 cpus_or(covered, covered, nodemask);
5856 prev = sg; 6388 prev = sg;
5857 6389
@@ -5870,54 +6402,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5870 if (cpus_empty(tmp)) 6402 if (cpus_empty(tmp))
5871 continue; 6403 continue;
5872 6404
5873 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6405 sg = kmalloc_node(sizeof(struct sched_group),
6406 GFP_KERNEL, i);
5874 if (!sg) { 6407 if (!sg) {
5875 printk(KERN_WARNING 6408 printk(KERN_WARNING
5876 "Can not alloc domain group for node %d\n", j); 6409 "Can not alloc domain group for node %d\n", j);
5877 break; 6410 goto error;
5878 } 6411 }
5879 sg->cpu_power = 0; 6412 sg->cpu_power = 0;
5880 sg->cpumask = tmp; 6413 sg->cpumask = tmp;
6414 sg->next = prev->next;
5881 cpus_or(covered, covered, tmp); 6415 cpus_or(covered, covered, tmp);
5882 prev->next = sg; 6416 prev->next = sg;
5883 prev = sg; 6417 prev = sg;
5884 } 6418 }
5885 prev->next = sched_group_nodes[i];
5886 } 6419 }
5887#endif 6420#endif
5888 6421
5889 /* Calculate CPU power for physical packages and nodes */ 6422 /* Calculate CPU power for physical packages and nodes */
6423#ifdef CONFIG_SCHED_SMT
5890 for_each_cpu_mask(i, *cpu_map) { 6424 for_each_cpu_mask(i, *cpu_map) {
5891 int power;
5892 struct sched_domain *sd; 6425 struct sched_domain *sd;
5893#ifdef CONFIG_SCHED_SMT
5894 sd = &per_cpu(cpu_domains, i); 6426 sd = &per_cpu(cpu_domains, i);
5895 power = SCHED_LOAD_SCALE; 6427 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5896 sd->groups->cpu_power = power; 6428 }
5897#endif 6429#endif
5898#ifdef CONFIG_SCHED_MC 6430#ifdef CONFIG_SCHED_MC
6431 for_each_cpu_mask(i, *cpu_map) {
6432 int power;
6433 struct sched_domain *sd;
5899 sd = &per_cpu(core_domains, i); 6434 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6435 if (sched_smt_power_savings)
6436 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6437 else
6438 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10; 6439 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power; 6440 sd->groups->cpu_power = power;
6441 }
6442#endif
5903 6443
6444 for_each_cpu_mask(i, *cpu_map) {
6445 struct sched_domain *sd;
6446#ifdef CONFIG_SCHED_MC
5904 sd = &per_cpu(phys_domains, i); 6447 sd = &per_cpu(phys_domains, i);
6448 if (i != first_cpu(sd->groups->cpumask))
6449 continue;
5905 6450
5906 /* 6451 sd->groups->cpu_power = 0;
5907 * This has to be < 2 * SCHED_LOAD_SCALE 6452 if (sched_mc_power_savings || sched_smt_power_savings) {
5908 * Lets keep it SCHED_LOAD_SCALE, so that 6453 int j;
5909 * while calculating NUMA group's cpu_power 6454
5910 * we can simply do 6455 for_each_cpu_mask(j, sd->groups->cpumask) {
5911 * numa_group->cpu_power += phys_group->cpu_power; 6456 struct sched_domain *sd1;
5912 * 6457 sd1 = &per_cpu(core_domains, j);
5913 * See "only add power once for each physical pkg" 6458 /*
5914 * comment below 6459 * for each core we will add once
5915 */ 6460 * to the group in physical domain
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6461 */
6462 if (j != first_cpu(sd1->groups->cpumask))
6463 continue;
6464
6465 if (sched_smt_power_savings)
6466 sd->groups->cpu_power += sd1->groups->cpu_power;
6467 else
6468 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6469 }
6470 } else
6471 /*
6472 * This has to be < 2 * SCHED_LOAD_SCALE
6473 * Lets keep it SCHED_LOAD_SCALE, so that
6474 * while calculating NUMA group's cpu_power
6475 * we can simply do
6476 * numa_group->cpu_power += phys_group->cpu_power;
6477 *
6478 * See "only add power once for each physical pkg"
6479 * comment below
6480 */
6481 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else 6482#else
6483 int power;
5918 sd = &per_cpu(phys_domains, i); 6484 sd = &per_cpu(phys_domains, i);
5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6485 if (sched_smt_power_savings)
5920 (cpus_weight(sd->groups->cpumask)-1) / 10; 6486 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6487 else
6488 power = SCHED_LOAD_SCALE;
5921 sd->groups->cpu_power = power; 6489 sd->groups->cpu_power = power;
5922#endif 6490#endif
5923 } 6491 }
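/*
 * Editor's sketch (not in the patch): the arithmetic the rewritten loop
 * above applies when sizing a group's cpu_power.  SCHED_LOAD_SCALE is taken
 * as 128 purely for illustration; "weight" is the number of CPUs in the
 * group.  The default policy discounts each extra sibling to 10%, while the
 * new power-savings knobs count siblings at full weight so that load tends
 * to be packed onto fewer packages.
 */
#include <stdio.h>

#define TOY_SCHED_LOAD_SCALE 128UL

/* cpu_power of one SMT/multi-core group of 'weight' logical CPUs */
static unsigned long toy_group_power(unsigned long weight, int power_savings)
{
        if (power_savings)
                return TOY_SCHED_LOAD_SCALE * weight;
        return TOY_SCHED_LOAD_SCALE +
               (weight - 1) * TOY_SCHED_LOAD_SCALE / 10;
}

int main(void)
{
        unsigned long siblings = 2;     /* e.g. two SMT threads per core */

        printf("default policy: %lu\n", toy_group_power(siblings, 0));  /* 140 */
        printf("power savings:  %lu\n", toy_group_power(siblings, 1));  /* 256 */
        return 0;
}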
@@ -5945,13 +6513,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5945 * Tune cache-hot values: 6513 * Tune cache-hot values:
5946 */ 6514 */
5947 calibrate_migration_costs(cpu_map); 6515 calibrate_migration_costs(cpu_map);
6516
6517 return 0;
6518
6519error:
6520 free_sched_groups(cpu_map);
6521 return -ENOMEM;
5948} 6522}
5949/* 6523/*
5950 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6524 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5951 */ 6525 */
5952static void arch_init_sched_domains(const cpumask_t *cpu_map) 6526static int arch_init_sched_domains(const cpumask_t *cpu_map)
5953{ 6527{
5954 cpumask_t cpu_default_map; 6528 cpumask_t cpu_default_map;
6529 int err;
5955 6530
5956 /* 6531 /*
5957 * Setup mask for cpus without special case scheduling requirements. 6532 * Setup mask for cpus without special case scheduling requirements.
@@ -5960,51 +6535,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5960 */ 6535 */
5961 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6536 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5962 6537
5963 build_sched_domains(&cpu_default_map); 6538 err = build_sched_domains(&cpu_default_map);
6539
6540 return err;
5964} 6541}
5965 6542
5966static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6543static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5967{ 6544{
5968#ifdef CONFIG_NUMA 6545 free_sched_groups(cpu_map);
5969 int i;
5970 int cpu;
5971
5972 for_each_cpu_mask(cpu, *cpu_map) {
5973 struct sched_group *sched_group_allnodes
5974 = sched_group_allnodes_bycpu[cpu];
5975 struct sched_group **sched_group_nodes
5976 = sched_group_nodes_bycpu[cpu];
5977
5978 if (sched_group_allnodes) {
5979 kfree(sched_group_allnodes);
5980 sched_group_allnodes_bycpu[cpu] = NULL;
5981 }
5982
5983 if (!sched_group_nodes)
5984 continue;
5985
5986 for (i = 0; i < MAX_NUMNODES; i++) {
5987 cpumask_t nodemask = node_to_cpumask(i);
5988 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5989
5990 cpus_and(nodemask, nodemask, *cpu_map);
5991 if (cpus_empty(nodemask))
5992 continue;
5993
5994 if (sg == NULL)
5995 continue;
5996 sg = sg->next;
5997next_sg:
5998 oldsg = sg;
5999 sg = sg->next;
6000 kfree(oldsg);
6001 if (oldsg != sched_group_nodes[i])
6002 goto next_sg;
6003 }
6004 kfree(sched_group_nodes);
6005 sched_group_nodes_bycpu[cpu] = NULL;
6006 }
6007#endif
6008} 6546}
6009 6547
6010/* 6548/*
@@ -6029,9 +6567,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6029 * correct sched domains 6567 * correct sched domains
6030 * Call with hotplug lock held 6568 * Call with hotplug lock held
6031 */ 6569 */
6032void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6570int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6033{ 6571{
6034 cpumask_t change_map; 6572 cpumask_t change_map;
6573 int err = 0;
6035 6574
6036 cpus_and(*partition1, *partition1, cpu_online_map); 6575 cpus_and(*partition1, *partition1, cpu_online_map);
6037 cpus_and(*partition2, *partition2, cpu_online_map); 6576 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6040,11 +6579,90 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6040 /* Detach sched domains from all of the affected cpus */ 6579 /* Detach sched domains from all of the affected cpus */
6041 detach_destroy_domains(&change_map); 6580 detach_destroy_domains(&change_map);
6042 if (!cpus_empty(*partition1)) 6581 if (!cpus_empty(*partition1))
6043 build_sched_domains(partition1); 6582 err = build_sched_domains(partition1);
6044 if (!cpus_empty(*partition2)) 6583 if (!err && !cpus_empty(*partition2))
6045 build_sched_domains(partition2); 6584 err = build_sched_domains(partition2);
6585
6586 return err;
6046} 6587}
6047 6588
6589#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6590int arch_reinit_sched_domains(void)
6591{
6592 int err;
6593
6594 lock_cpu_hotplug();
6595 detach_destroy_domains(&cpu_online_map);
6596 err = arch_init_sched_domains(&cpu_online_map);
6597 unlock_cpu_hotplug();
6598
6599 return err;
6600}
6601
6602static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6603{
6604 int ret;
6605
6606 if (buf[0] != '0' && buf[0] != '1')
6607 return -EINVAL;
6608
6609 if (smt)
6610 sched_smt_power_savings = (buf[0] == '1');
6611 else
6612 sched_mc_power_savings = (buf[0] == '1');
6613
6614 ret = arch_reinit_sched_domains();
6615
6616 return ret ? ret : count;
6617}
6618
6619int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6620{
6621 int err = 0;
6622
6623#ifdef CONFIG_SCHED_SMT
6624 if (smt_capable())
6625 err = sysfs_create_file(&cls->kset.kobj,
6626 &attr_sched_smt_power_savings.attr);
6627#endif
6628#ifdef CONFIG_SCHED_MC
6629 if (!err && mc_capable())
6630 err = sysfs_create_file(&cls->kset.kobj,
6631 &attr_sched_mc_power_savings.attr);
6632#endif
6633 return err;
6634}
6635#endif
6636
6637#ifdef CONFIG_SCHED_MC
6638static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6639{
6640 return sprintf(page, "%u\n", sched_mc_power_savings);
6641}
6642static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6643 const char *buf, size_t count)
6644{
6645 return sched_power_savings_store(buf, count, 0);
6646}
6647SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6648 sched_mc_power_savings_store);
6649#endif
6650
6651#ifdef CONFIG_SCHED_SMT
6652static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6653{
6654 return sprintf(page, "%u\n", sched_smt_power_savings);
6655}
6656static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6657 const char *buf, size_t count)
6658{
6659 return sched_power_savings_store(buf, count, 1);
6660}
6661SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6662 sched_smt_power_savings_store);
6663#endif
6664
6665
6048#ifdef CONFIG_HOTPLUG_CPU 6666#ifdef CONFIG_HOTPLUG_CPU
6049/* 6667/*
6050 * Force a reinitialization of the sched domains hierarchy. The domains 6668 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -6098,6 +6716,7 @@ int in_sched_functions(unsigned long addr)
6098{ 6716{
6099 /* Linker adds these: start and end of __sched functions */ 6717 /* Linker adds these: start and end of __sched functions */
6100 extern char __sched_text_start[], __sched_text_end[]; 6718 extern char __sched_text_start[], __sched_text_end[];
6719
6101 return in_lock_functions(addr) || 6720 return in_lock_functions(addr) ||
6102 (addr >= (unsigned long)__sched_text_start 6721 (addr >= (unsigned long)__sched_text_start
6103 && addr < (unsigned long)__sched_text_end); 6722 && addr < (unsigned long)__sched_text_end);
@@ -6105,14 +6724,15 @@ int in_sched_functions(unsigned long addr)
6105 6724
6106void __init sched_init(void) 6725void __init sched_init(void)
6107{ 6726{
6108 runqueue_t *rq;
6109 int i, j, k; 6727 int i, j, k;
6110 6728
6111 for_each_possible_cpu(i) { 6729 for_each_possible_cpu(i) {
6112 prio_array_t *array; 6730 struct prio_array *array;
6731 struct rq *rq;
6113 6732
6114 rq = cpu_rq(i); 6733 rq = cpu_rq(i);
6115 spin_lock_init(&rq->lock); 6734 spin_lock_init(&rq->lock);
6735 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6116 rq->nr_running = 0; 6736 rq->nr_running = 0;
6117 rq->active = rq->arrays; 6737 rq->active = rq->arrays;
6118 rq->expired = rq->arrays + 1; 6738 rq->expired = rq->arrays + 1;
@@ -6126,7 +6746,6 @@ void __init sched_init(void)
6126 rq->push_cpu = 0; 6746 rq->push_cpu = 0;
6127 rq->migration_thread = NULL; 6747 rq->migration_thread = NULL;
6128 INIT_LIST_HEAD(&rq->migration_queue); 6748 INIT_LIST_HEAD(&rq->migration_queue);
6129 rq->cpu = i;
6130#endif 6749#endif
6131 atomic_set(&rq->nr_iowait, 0); 6750 atomic_set(&rq->nr_iowait, 0);
6132 6751
@@ -6141,6 +6760,7 @@ void __init sched_init(void)
6141 } 6760 }
6142 } 6761 }
6143 6762
6763 set_load_weight(&init_task);
6144 /* 6764 /*
6145 * The boot idle thread does lazy MMU switching as well: 6765 * The boot idle thread does lazy MMU switching as well:
6146 */ 6766 */
@@ -6159,7 +6779,7 @@ void __init sched_init(void)
6159#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6779#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6160void __might_sleep(char *file, int line) 6780void __might_sleep(char *file, int line)
6161{ 6781{
6162#if defined(in_atomic) 6782#ifdef in_atomic
6163 static unsigned long prev_jiffy; /* ratelimiting */ 6783 static unsigned long prev_jiffy; /* ratelimiting */
6164 6784
6165 if ((in_atomic() || irqs_disabled()) && 6785 if ((in_atomic() || irqs_disabled()) &&
@@ -6181,17 +6801,18 @@ EXPORT_SYMBOL(__might_sleep);
6181#ifdef CONFIG_MAGIC_SYSRQ 6801#ifdef CONFIG_MAGIC_SYSRQ
6182void normalize_rt_tasks(void) 6802void normalize_rt_tasks(void)
6183{ 6803{
6804 struct prio_array *array;
6184 struct task_struct *p; 6805 struct task_struct *p;
6185 prio_array_t *array;
6186 unsigned long flags; 6806 unsigned long flags;
6187 runqueue_t *rq; 6807 struct rq *rq;
6188 6808
6189 read_lock_irq(&tasklist_lock); 6809 read_lock_irq(&tasklist_lock);
6190 for_each_process (p) { 6810 for_each_process(p) {
6191 if (!rt_task(p)) 6811 if (!rt_task(p))
6192 continue; 6812 continue;
6193 6813
6194 rq = task_rq_lock(p, &flags); 6814 spin_lock_irqsave(&p->pi_lock, flags);
6815 rq = __task_rq_lock(p);
6195 6816
6196 array = p->array; 6817 array = p->array;
6197 if (array) 6818 if (array)
@@ -6202,7 +6823,8 @@ void normalize_rt_tasks(void)
6202 resched_task(rq->curr); 6823 resched_task(rq->curr);
6203 } 6824 }
6204 6825
6205 task_rq_unlock(rq, &flags); 6826 __task_rq_unlock(rq);
6827 spin_unlock_irqrestore(&p->pi_lock, flags);
6206 } 6828 }
6207 read_unlock_irq(&tasklist_lock); 6829 read_unlock_irq(&tasklist_lock);
6208} 6830}
@@ -6226,7 +6848,7 @@ void normalize_rt_tasks(void)
6226 * 6848 *
6227 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6849 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6228 */ 6850 */
6229task_t *curr_task(int cpu) 6851struct task_struct *curr_task(int cpu)
6230{ 6852{
6231 return cpu_curr(cpu); 6853 return cpu_curr(cpu);
6232} 6854}
@@ -6246,7 +6868,7 @@ task_t *curr_task(int cpu)
6246 * 6868 *
6247 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6869 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6248 */ 6870 */
6249void set_curr_task(int cpu, task_t *p) 6871void set_curr_task(int cpu, struct task_struct *p)
6250{ 6872{
6251 cpu_curr(cpu) = p; 6873 cpu_curr(cpu) = p;
6252} 6874}