Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  3136
1 file changed, 1229 insertions(+), 1907 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 13cdab3b4c48..3332bbb5d5cf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
214 145
215static inline unsigned int task_timeslice(struct task_struct *p) 146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
151
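To make the [800ms ... 100ms ... 5ms] mapping concrete, here is a small user-space sketch of the same scaling, assuming the usual priority constants (MAX_RT_PRIO = 100, MAX_PRIO = 140, MAX_USER_PRIO = 40) and HZ = 1000; note that the nice-19 special case added above returns a single jiffy where the old code bottomed out at the 5 ms floor:

#include <stdio.h>

/* Assumed values of the kernel's priority constants (not part of this patch). */
#define MAX_RT_PRIO	100
#define MAX_PRIO	140
#define MAX_USER_PRIO	40
#define HZ		1000
#define MIN_TIMESLICE	((5 * HZ / 1000) > 1 ? (5 * HZ / 1000) : 1)
#define DEF_TIMESLICE	(100 * HZ / 1000)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)

#define SCALE_PRIO(x, prio) \
	((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
	 (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio == NICE_TO_PRIO(19))
		return 1;				/* one jiffy */
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	/* nice -20 -> 800 ticks, nice 0 -> 100 ticks, nice 19 -> 1 tick (HZ=1000) */
	printf("%u %u %u\n",
	       static_prio_timeslice(NICE_TO_PRIO(-20)),
	       static_prio_timeslice(NICE_TO_PRIO(0)),
	       static_prio_timeslice(NICE_TO_PRIO(19)));
	return 0;
}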
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
158
159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
223 171
224struct prio_array { 172struct load_stat {
225 unsigned int nr_active; 173 struct load_weight load;
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 174 u64 load_update_start, load_update_last;
227 struct list_head queue[MAX_PRIO]; 175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
198
199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
201 * (like users, containers etc.)
202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
209
210/* Real-Time classes' related field in a runqueue: */
211struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
229 216
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -307,6 +304,11 @@ struct rq {
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
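check_preempt_curr() is the first visible use of the new scheduling-class indirection: each policy (rt, fair, idletask, included further down as sched_rt.c, sched_fair.c and sched_idletask.c) supplies a table of operations and the core only calls through the current task's table. The real struct sched_class has more hooks than shown here; the sketch below, with hypothetical names, only illustrates the dispatch pattern:

#include <stdio.h>

struct rq;
struct task_struct;

/* Hypothetical, cut-down "class of operations" -- not the real struct sched_class. */
struct sched_class_ops {
	const char *name;
	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p);
};

struct task_struct {
	const struct sched_class_ops *sched_class;
};

struct rq {
	struct task_struct *curr;
};

static void fair_check_preempt(struct rq *rq, struct task_struct *p)
{
	(void)rq; (void)p;
	printf("fair class decides whether p preempts rq->curr\n");
}

static const struct sched_class_ops fair_ops = {
	.name			= "fair",
	.check_preempt_curr	= fair_check_preempt,
};

/* Mirrors the helper added in this patch: defer to the current task's class. */
static void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

int main(void)
{
	struct task_struct curr = { .sched_class = &fair_ops };
	struct task_struct waking = curr;
	struct rq rq = { .curr = &curr };

	check_preempt_curr(&rq, &waking);
	return 0;
}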
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
366
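The per-runqueue clock is a filtered view of sched_clock(): a backward jump only advances it by one nanosecond (counted in clock_warps), a forward jump larger than two ticks is likewise clamped (clock_overflows), and only reasonable deltas are added in full. A self-contained sketch of the same filter shape, fed by an arbitrary raw time source and assuming TICK_NSEC = 1,000,000 (HZ=1000), omitting the clock_max_delta bookkeeping:

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC	1000000LL	/* assumed: 1 ms tick, i.e. HZ=1000 */

struct filtered_clock {
	uint64_t clock, prev_raw;
	unsigned int warps, overflows;
};

/* Same shape as __rq_clock(): clamp backward and oversized forward jumps. */
static uint64_t clock_update(struct filtered_clock *c, uint64_t raw_now)
{
	int64_t delta = (int64_t)(raw_now - c->prev_raw);

	if (delta < 0) {			/* raw source went backwards */
		c->clock++;
		c->warps++;
	} else if (delta > 2 * TICK_NSEC) {	/* suspiciously large jump */
		c->clock++;
		c->overflows++;
	} else {
		c->clock += delta;
	}
	c->prev_raw = raw_now;
	return c->clock;
}

int main(void)
{
	struct filtered_clock c = { 0 };
	uint64_t samples[] = { 100000, 600000, 500000 /* backwards */, 90000000 /* jump */ };

	for (unsigned int i = 0; i < 4; i++)
		printf("raw=%llu clock=%llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)clock_update(&c, samples[i]));
	return 0;
}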
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
@@ -331,6 +379,18 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
334#ifndef prepare_arch_switch 394#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 395# define prepare_arch_switch(next) do { } while (0)
336#endif 396#endif
@@ -460,134 +520,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
461} 521}
462 522
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
591/* 523/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 524 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 525 */
@@ -603,177 +535,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 535 return rq;
604} 536}
605 537
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 538/*
608 * Called when a process is dequeued from the active array and given 539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 540 */
622static inline void sched_info_dequeued(struct task_struct *t) 541void sched_clock_unstable_event(void)
623{ 542{
624 t->sched_info.last_queued = 0; 543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
625} 550}
626 551
627/* 552/*
628 * Called when a task finally hits the cpu. We can now calculate how 553 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 554 *
630 * can keep stats on how long its timeslice is. 555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
631 */ 558 */
632static void sched_info_arrive(struct task_struct *t) 559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
633{ 566{
634 unsigned long now = jiffies, delta_jiffies = 0; 567 int cpu;
635 568
636 if (t->sched_info.last_queued) 569 assert_spin_locked(&task_rq(p)->lock);
637 delta_jiffies = now - t->sched_info.last_queued; 570
638 sched_info_dequeued(t); 571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
639 t->sched_info.run_delay += delta_jiffies; 572 return;
640 t->sched_info.last_arrival = now;
641 t->sched_info.pcnt++;
642 573
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
575
576 cpu = task_cpu(p);
577 if (cpu == smp_processor_id())
578 return;
579
580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
644} 584}
645 585
646/* 586static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired
648 * array. The time is noted and later used to determine how long we
649 * had to wait for us to reach the cpu. Since the expired queue will
650 * become the active queue after active queue is empty, without dequeuing
651 * and requeuing any tasks, we are interested in queuing to either. It
652 * is unusual but not impossible for tasks to be dequeued and immediately
653 * requeued in the same or another array: this can happen in sched_yield(),
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
655 * to runqueue.
656 *
657 * This function is only called from enqueue_task(), but also only updates
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 587{
663 if (unlikely(sched_info_on())) 588 struct rq *rq = cpu_rq(cpu);
664 if (!t->sched_info.last_queued) 589 unsigned long flags;
665 t->sched_info.last_queued = jiffies; 590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
666} 601}
602#endif
667 603
668/* 604static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 605{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 606#if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
675 610
676 t->sched_info.cpu_time += delta_jiffies; 611 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 612#else
613 return divident / divisor;
614#endif
678} 615}
679 616
680/* 617#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 618# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 619#else
683 * the idle task.) We are only called when prev != next. 620# define WMULT_CONST (1UL << 32)
684 */ 621#endif
685static inline void 622
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
687{ 628{
688 struct rq *rq = task_rq(prev); 629 u64 tmp;
630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
689 633
634 tmp = (u64)delta_exec * weight;
690 /* 635 /*
691 * prev now departs the cpu. It's not interesting to record 636 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 637 */
695 if (prev != rq->idle) 638 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
697 644
698 if (next != rq->idle) 645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700} 646}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 647
712/* 648static inline unsigned long
713 * Adding/removing a task to/from a priority array: 649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 650{
717 array->nr_active--; 651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 652}
722 653
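calc_delta_mine() avoids a 64-bit division per update by caching inv_weight = 2^32 / weight and turning the division into a multiply-and-shift; calc_delta_fair() then scales a wall-clock delta by NICE_0_LOAD / lw->weight. A rough user-space rendition of the common (non-overflow) path, without the sysctl_sched_runtime_limit clamp, with one worked number: 1 ms of execution against a runqueue loaded with two nice-0 tasks comes out as ~0.5 ms of fair clock.

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD	1024UL		/* SCHED_LOAD_SCALE in this patch */
#define WMULT_SHIFT	32

struct load_weight {
	unsigned long weight;
	uint64_t inv_weight;		/* 2^32 / weight, cached lazily */
};

/* Simplified calc_delta_mine(): only the non-overflow path, no runtime clamp. */
static unsigned long calc_delta(unsigned long delta_exec, unsigned long weight,
				struct load_weight *lw)
{
	if (!lw->inv_weight)
		lw->inv_weight = ((uint64_t)1 << WMULT_SHIFT) / lw->weight;

	return (unsigned long)(((uint64_t)delta_exec * weight * lw->inv_weight)
			       >> WMULT_SHIFT);
}

int main(void)
{
	/* Two runnable nice-0 tasks: total load 2048. */
	struct load_weight rq_load = { .weight = 2 * NICE_0_LOAD };

	/* 1,000,000 ns of real time -> ~500,000 ns of fair clock. */
	printf("%lu\n", calc_delta(1000000UL, NICE_0_LOAD, &rq_load));
	return 0;
}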
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 654static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 655{
725 sched_info_queued(p); 656 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 657 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 658}
731 659
732/* 660static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 661{
738 list_move_tail(&p->run_list, array->queue + p->prio); 662 lw->weight -= dec;
663 lw->inv_weight = 0;
739} 664}
740 665
741static inline void 666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 667{
744 list_add(&p->run_list, array->queue + p->prio); 668 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 669 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 671 ls->delta_stat = 0;
672 }
748} 673}
749 674
750/* 675/*
751 * __normal_prio - return the priority that is based on the static 676 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
755 * into the -5 ... 0 ... +5 bonus/penalty range.
756 * 677 *
757 * We use 25% of the full 0...39 priority range so that: 678 * delta_fair clock advances at a rate inversely proportional to
679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
758 * 682 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
761 * 686 *
762 * Both properties are important to certain workloads. 687 * This function is called /before/ updating rq->ls.load
688 * and when switching tasks.
763 */ 689 */
764 690static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 691{
767 int bonus, prio; 692 struct load_stat *ls = &rq->ls;
693 u64 start;
768 694
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 695 start = ls->load_update_start;
770 696 ls->load_update_start = now;
771 prio = p->static_prio - bonus; 697 ls->delta_stat += now - start;
772 if (prio < MAX_RT_PRIO) 698 /*
773 prio = MAX_RT_PRIO; 699 * Stagger updates to ls->delta_fair. Very frequent updates
774 if (prio > MAX_PRIO-1) 700 * can be expensive.
775 prio = MAX_PRIO-1; 701 */
776 return prio; 702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
777} 704}
778 705
779/* 706/*
@@ -791,53 +718,146 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 718 * this code will need modification
792 */ 719 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 721#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 723#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 724 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 727
801static void set_load_weight(struct task_struct *p) 728#define WEIGHT_IDLEPRIO 2
802{ 729#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 730
804#ifdef CONFIG_SMP 731/*
805 if (p == task_rq(p)->migration_thread) 732 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 735 * that remained on nice 0.
809 * adversely. 736 *
810 */ 737 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 739 * it's +10% CPU usage.
813#endif 740 */
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 741static const int prio_to_weight[40] = {
815 } else 742/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 743/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817} 744/* 0 */ NICE_0_LOAD /* 1024 */,
745/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
746/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
747};
748
749static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573,
751 184467, 230589, 288233, 360285, 450347,
752 562979, 703746, 879575, 1099582, 1374389,
753 1717986, 2147483, 2684354, 3355443, 4194304,
754 5244160, 6557201, 8196502, 10250518, 12782640,
755 16025997, 19976592, 24970740, 31350126, 39045157,
756 49367440, 61356675, 76695844, 95443717, 119304647,
757 148102320, 186737708, 238609294, 286331153,
758};
818 759
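The "10% effect" can be checked directly against the table: a CPU-bound nice-0 task (weight 1024) running next to a nice-1 task (weight 819) gets 1024/1843 ~ 55.6% of the CPU and the nice-1 task ~ 44.4%, i.e. each moves roughly 10% relative to the 50/50 split they would share at equal nice. A throwaway sketch that prints the expected shares for a pair of nice levels, reusing the weights above:

#include <stdio.h>

/* Weights copied from prio_to_weight[] above, indexed by nice+20. */
static const int nice_weight[40] = {
	88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
	 9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
	 1024,   819,   655,   524,   419,   336,   268,   215,   172,   137,
	  110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

/* Expected CPU share (in %) of two always-runnable tasks at the given nice levels. */
static void shares(int nice_a, int nice_b)
{
	double wa = nice_weight[nice_a + 20], wb = nice_weight[nice_b + 20];

	printf("nice %3d vs %3d: %.1f%% / %.1f%%\n",
	       nice_a, nice_b, 100.0 * wa / (wa + wb), 100.0 * wb / (wa + wb));
}

int main(void)
{
	shares(0, 0);	/* 50.0% / 50.0% */
	shares(0, 1);	/* ~55.6% / ~44.4%: the ~10% effect per nice level */
	shares(0, 5);	/* ~75.3% / ~24.7% */
	shares(-5, 5);	/* heavier skew: ~90.3% / ~9.7% */
	return 0;
}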
819static inline void 760static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 761inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 762{
822 rq->raw_weighted_load += p->load_weight; 763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
823} 765}
824 766
825static inline void 767static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 768dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 769{
828 rq->raw_weighted_load -= p->load_weight; 770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 772}
830 773
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 774static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 775{
833 rq->nr_running++; 776 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 777 inc_load(rq, p, now);
835} 778}
836 779
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 780static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 781{
839 rq->nr_running--; 782 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 783 dec_load(rq, p, now);
784}
785
786static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
787
788/*
789 * runqueue iterator, to support SMP load-balancing between different
790 * scheduling classes, without having to expose their internal data
791 * structures to the load-balancing proper:
792 */
793struct rq_iterator {
794 void *arg;
795 struct task_struct *(*start)(void *);
796 struct task_struct *(*next)(void *);
797};
798
799static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
805
806#include "sched_stats.h"
807#include "sched_rt.c"
808#include "sched_fair.c"
809#include "sched_idletask.c"
810#ifdef CONFIG_SCHED_DEBUG
811# include "sched_debug.c"
812#endif
813
814#define sched_class_highest (&rt_sched_class)
815
816static void set_load_weight(struct task_struct *p)
817{
818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
820
821 if (task_has_rt_policy(p)) {
822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
825 }
826
827 /*
828 * SCHED_IDLE tasks get minimal weight:
829 */
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
834 }
835
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
838}
839
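For concreteness, the three branches of set_load_weight() resolve as follows, taking the values from the tables earlier in this patch and assuming the usual MAX_RT_PRIO of 100 (so static_prio 120 is nice 0):

/*
 * Worked examples for set_load_weight():
 *
 *   nice  0, SCHED_NORMAL: se.load.weight = prio_to_weight[120 - 100] = 1024
 *   nice 19, SCHED_NORMAL: se.load.weight = prio_to_weight[139 - 100] =   15
 *   SCHED_IDLE:            se.load.weight = WEIGHT_IDLEPRIO           =    2
 *   SCHED_FIFO/SCHED_RR:   se.load.weight = 2 * prio_to_weight[0]     = 177636
 *
 * i.e. an RT task outweighs any single non-RT task for load-balancing purposes.
 */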
840static void
841enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
842{
843 sched_info_queued(p);
844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
846}
847
848static void
849dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
850{
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
853}
854
855/*
856 * __normal_prio - return the priority that is based on the static prio
857 */
858static inline int __normal_prio(struct task_struct *p)
859{
860 return p->static_prio;
841} 861}
842 862
843/* 863/*
@@ -851,7 +871,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 871{
852 int prio; 872 int prio;
853 873
854 if (has_rt_policy(p)) 874 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 875 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 876 else
857 prio = __normal_prio(p); 877 prio = __normal_prio(p);
@@ -879,221 +899,46 @@ static int effective_prio(struct task_struct *p)
879} 899}
880 900
881/* 901/*
882 * __activate_task - move a task to the runqueue. 902 * activate_task - move a task to the runqueue.
883 */
884static void __activate_task(struct task_struct *p, struct rq *rq)
885{
886 struct prio_array *target = rq->active;
887
888 if (batch_task(p))
889 target = rq->expired;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */ 903 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) 904static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
898{ 905{
899 enqueue_task_head(p, rq->active); 906 u64 now = rq_clock(rq);
900 inc_nr_running(p, rq);
901}
902 907
903/* 908 if (p->state == TASK_UNINTERRUPTIBLE)
904 * Recalculate p->normal_prio and p->prio after having slept, 909 rq->nr_uninterruptible--;
905 * updating the sleep-average too:
906 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now)
908{
909 /* Caller must always ensure 'now >= p->timestamp' */
910 unsigned long sleep_time = now - p->timestamp;
911
912 if (batch_task(p))
913 sleep_time = 0;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954
955 /*
956 * This code gives a bonus to interactive tasks.
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 910
970 return effective_prio(p); 911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
971} 913}
972 914
973/* 915/*
974 * activate_task - move a task to the runqueue and do priority recalculation 916 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 917 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 918static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 919{
981 unsigned long long now; 920 u64 now = rq_clock(rq);
982
983 if (rt_task(p))
984 goto out;
985
986 now = sched_clock();
987#ifdef CONFIG_SMP
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995 921
996 /* 922 if (p->state == TASK_UNINTERRUPTIBLE)
997 * Sleep time is in units of nanosecs, so shift by 20 to get a 923 rq->nr_uninterruptible--;
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006
1007 p->prio = recalc_task_prio(p, now);
1008 924
1009 /* 925 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 926 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 927}
1035 928
1036/* 929/*
1037 * deactivate_task - remove a task from the runqueue. 930 * deactivate_task - remove a task from the runqueue.
1038 */ 931 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 932static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{ 933{
1041 dec_nr_running(p, rq); 934 u64 now = rq_clock(rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045 935
1046/* 936 if (p->state == TASK_UNINTERRUPTIBLE)
1047 * resched_task - mark a task 'to be rescheduled now'. 937 rq->nr_uninterruptible++;
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054
1055#ifndef tsk_is_polling
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{
1061 int cpu;
1062
1063 assert_spin_locked(&task_rq(p)->lock);
1064
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1066 return;
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078}
1079
1080static void resched_cpu(int cpu)
1081{
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084 938
1085 if (!spin_trylock_irqsave(&rq->lock, flags)) 939 dequeue_task(rq, p, sleep, now);
1086 return; 940 dec_nr_running(p, rq, now);
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089} 941}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095}
1096#endif
1097 942
1098/** 943/**
1099 * task_curr - is this task currently executing on a CPU? 944 * task_curr - is this task currently executing on a CPU?
@@ -1107,10 +952,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 952/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 953unsigned long weighted_cpuload(const int cpu)
1109{ 954{
1110 return cpu_rq(cpu)->raw_weighted_load; 955 return cpu_rq(cpu)->ls.load.weight;
1111} 956}
1112 957
958static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
959{
1113#ifdef CONFIG_SMP 960#ifdef CONFIG_SMP
961 task_thread_info(p)->cpu = cpu;
962 set_task_cfs_rq(p);
963#endif
964}
965
966#ifdef CONFIG_SMP
967
968void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
969{
970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
973
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
987
988 __set_task_cpu(p, new_cpu);
989}
990
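Subtracting clock_offset (and fair_clock_offset) simply re-expresses the task's saved timestamps in the destination runqueue's clock, so that elapsed times stay meaningful across the migration. A worked example with hypothetical numbers:

/*
 * Hypothetical migration:
 *
 *   old_rq->clock = 5,000,000 ns     new_rq->clock = 3,000,000 ns
 *   clock_offset  = 5,000,000 - 3,000,000 = 2,000,000 ns
 *
 *   p->se.wait_start was recorded as 4,200,000 (old clock), i.e. the task
 *   has already waited 800,000 ns.  After the adjustment:
 *
 *   p->se.wait_start = 4,200,000 - 2,000,000 = 2,200,000 (new clock)
 *
 *   and new_rq->clock - wait_start is still 800,000 ns of wait time.
 */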
1114struct migration_req { 991struct migration_req {
1115 struct list_head list; 992 struct list_head list;
1116 993
@@ -1133,7 +1010,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1010 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1011 * it is sufficient to simply update the task's cpu field.
1135 */ 1012 */
1136 if (!p->array && !task_running(rq, p)) { 1013 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1014 set_task_cpu(p, dest_cpu);
1138 return 0; 1015 return 0;
1139 } 1016 }
@@ -1158,22 +1035,72 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1035void wait_task_inactive(struct task_struct *p)
1159{ 1036{
1160 unsigned long flags; 1037 unsigned long flags;
1038 int running, on_rq;
1161 struct rq *rq; 1039 struct rq *rq;
1162 int preempted;
1163 1040
1164repeat: 1041repeat:
1042 /*
1043 * We do the initial early heuristics without holding
1044 * any task-queue locks at all. We'll only try to get
1045 * the runqueue lock when things look like they will
1046 * work out!
1047 */
1048 rq = task_rq(p);
1049
1050 /*
1051 * If the task is actively running on another CPU
1052 * still, just relax and busy-wait without holding
1053 * any locks.
1054 *
1055 * NOTE! Since we don't hold any locks, it's not
1056 * even sure that "rq" stays as the right runqueue!
1057 * But we don't care, since "task_running()" will
1058 * return false if the runqueue has changed and p
1059 * is actually now running somewhere else!
1060 */
1061 while (task_running(rq, p))
1062 cpu_relax();
1063
1064 /*
1065 * Ok, time to look more closely! We need the rq
1066 * lock now, to be *sure*. If we're wrong, we'll
1067 * just go back and repeat.
1068 */
1165 rq = task_rq_lock(p, &flags); 1069 rq = task_rq_lock(p, &flags);
1166 /* Must be off runqueue entirely, not preempted. */ 1070 running = task_running(rq, p);
1167 if (unlikely(p->array || task_running(rq, p))) { 1071 on_rq = p->se.on_rq;
1168 /* If it's preempted, we yield. It could be a while. */ 1072 task_rq_unlock(rq, &flags);
1169 preempted = !task_running(rq, p); 1073
1170 task_rq_unlock(rq, &flags); 1074 /*
1075 * Was it really running after all now that we
1076 * checked with the proper locks actually held?
1077 *
1078 * Oops. Go back and try again..
1079 */
1080 if (unlikely(running)) {
1171 cpu_relax(); 1081 cpu_relax();
1172 if (preempted)
1173 yield();
1174 goto repeat; 1082 goto repeat;
1175 } 1083 }
1176 task_rq_unlock(rq, &flags); 1084
1085 /*
1086 * It's not enough that it's not actively running,
1087 * it must be off the runqueue _entirely_, and not
1088 * preempted!
1089 *
1090 * So if it was still runnable (but just not actively
1091 * running right now), it's preempted, and we should
1092 * yield - it could be a while.
1093 */
1094 if (unlikely(on_rq)) {
1095 yield();
1096 goto repeat;
1097 }
1098
1099 /*
1100 * Ahh, all good. It wasn't running, and it wasn't
1101 * runnable, which means that it will never become
1102 * running in the future either. We're all done!
1103 */
1177} 1104}
1178 1105
1179/*** 1106/***
@@ -1210,11 +1137,12 @@ void kick_process(struct task_struct *p)
1210static inline unsigned long source_load(int cpu, int type) 1137static inline unsigned long source_load(int cpu, int type)
1211{ 1138{
1212 struct rq *rq = cpu_rq(cpu); 1139 struct rq *rq = cpu_rq(cpu);
1140 unsigned long total = weighted_cpuload(cpu);
1213 1141
1214 if (type == 0) 1142 if (type == 0)
1215 return rq->raw_weighted_load; 1143 return total;
1216 1144
1217 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1145 return min(rq->cpu_load[type-1], total);
1218} 1146}
1219 1147
1220/* 1148/*
@@ -1224,11 +1152,12 @@ static inline unsigned long source_load(int cpu, int type)
1224static inline unsigned long target_load(int cpu, int type) 1152static inline unsigned long target_load(int cpu, int type)
1225{ 1153{
1226 struct rq *rq = cpu_rq(cpu); 1154 struct rq *rq = cpu_rq(cpu);
1155 unsigned long total = weighted_cpuload(cpu);
1227 1156
1228 if (type == 0) 1157 if (type == 0)
1229 return rq->raw_weighted_load; 1158 return total;
1230 1159
1231 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1160 return max(rq->cpu_load[type-1], total);
1232} 1161}
1233 1162
1234/* 1163/*
@@ -1237,9 +1166,10 @@ static inline unsigned long target_load(int cpu, int type)
1237static inline unsigned long cpu_avg_load_per_task(int cpu) 1166static inline unsigned long cpu_avg_load_per_task(int cpu)
1238{ 1167{
1239 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu);
1240 unsigned long n = rq->nr_running; 1170 unsigned long n = rq->nr_running;
1241 1171
1242 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1172 return n ? total / n : SCHED_LOAD_SCALE;
1243} 1173}
1244 1174
1245/* 1175/*
@@ -1341,9 +1271,9 @@ static int sched_balance_self(int cpu, int flag)
1341 struct sched_domain *tmp, *sd = NULL; 1271 struct sched_domain *tmp, *sd = NULL;
1342 1272
1343 for_each_domain(cpu, tmp) { 1273 for_each_domain(cpu, tmp) {
1344 /* 1274 /*
1345 * If power savings logic is enabled for a domain, stop there. 1275 * If power savings logic is enabled for a domain, stop there.
1346 */ 1276 */
1347 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1348 break; 1278 break;
1349 if (tmp->flags & flag) 1279 if (tmp->flags & flag)
@@ -1426,9 +1356,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1426 if (idle_cpu(i)) 1356 if (idle_cpu(i))
1427 return i; 1357 return i;
1428 } 1358 }
1429 } 1359 } else {
1430 else
1431 break; 1360 break;
1361 }
1432 } 1362 }
1433 return cpu; 1363 return cpu;
1434} 1364}
@@ -1470,7 +1400,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1470 if (!(old_state & state)) 1400 if (!(old_state & state))
1471 goto out; 1401 goto out;
1472 1402
1473 if (p->array) 1403 if (p->se.on_rq)
1474 goto out_running; 1404 goto out_running;
1475 1405
1476 cpu = task_cpu(p); 1406 cpu = task_cpu(p);
@@ -1525,11 +1455,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1525 * of the current CPU: 1455 * of the current CPU:
1526 */ 1456 */
1527 if (sync) 1457 if (sync)
1528 tl -= current->load_weight; 1458 tl -= current->se.load.weight;
1529 1459
1530 if ((tl <= load && 1460 if ((tl <= load &&
1531 tl + target_load(cpu, idx) <= tl_per_task) || 1461 tl + target_load(cpu, idx) <= tl_per_task) ||
1532 100*(tl + p->load_weight) <= imbalance*load) { 1462 100*(tl + p->se.load.weight) <= imbalance*load) {
1533 /* 1463 /*
1534 * This domain has SD_WAKE_AFFINE and 1464 * This domain has SD_WAKE_AFFINE and
1535 * p is cache cold in this domain, and 1465 * p is cache cold in this domain, and
@@ -1563,7 +1493,7 @@ out_set_cpu:
1563 old_state = p->state; 1493 old_state = p->state;
1564 if (!(old_state & state)) 1494 if (!(old_state & state))
1565 goto out; 1495 goto out;
1566 if (p->array) 1496 if (p->se.on_rq)
1567 goto out_running; 1497 goto out_running;
1568 1498
1569 this_cpu = smp_processor_id(); 1499 this_cpu = smp_processor_id();
@@ -1572,25 +1502,7 @@ out_set_cpu:
1572 1502
1573out_activate: 1503out_activate:
1574#endif /* CONFIG_SMP */ 1504#endif /* CONFIG_SMP */
1575 if (old_state == TASK_UNINTERRUPTIBLE) { 1505 activate_task(rq, p, 1);
1576 rq->nr_uninterruptible--;
1577 /*
1578 * Tasks on involuntary sleep don't earn
1579 * sleep_avg beyond just interactive state.
1580 */
1581 p->sleep_type = SLEEP_NONINTERACTIVE;
1582 } else
1583
1584 /*
1585 * Tasks that have marked their sleep as noninteractive get
1586 * woken up with their sleep average not weighted in an
1587 * interactive way.
1588 */
1589 if (old_state & TASK_NONINTERACTIVE)
1590 p->sleep_type = SLEEP_NONINTERACTIVE;
1591
1592
1593 activate_task(p, rq, cpu == this_cpu);
1594 /* 1506 /*
1595 * Sync wakeups (i.e. those types of wakeups where the waker 1507 * Sync wakeups (i.e. those types of wakeups where the waker
1596 * has indicated that it will leave the CPU in short order) 1508 * has indicated that it will leave the CPU in short order)
@@ -1599,10 +1511,8 @@ out_activate:
1599 * the waker guarantees that the freshly woken up task is going 1511 * the waker guarantees that the freshly woken up task is going
1600 * to be considered on this CPU.) 1512 * to be considered on this CPU.)
1601 */ 1513 */
1602 if (!sync || cpu != this_cpu) { 1514 if (!sync || cpu != this_cpu)
1603 if (TASK_PREEMPTS_CURR(p, rq)) 1515 check_preempt_curr(rq, p);
1604 resched_task(rq->curr);
1605 }
1606 success = 1; 1516 success = 1;
1607 1517
1608out_running: 1518out_running:
@@ -1625,19 +1535,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1625 return try_to_wake_up(p, state, 0); 1535 return try_to_wake_up(p, state, 0);
1626} 1536}
1627 1537
1628static void task_running_tick(struct rq *rq, struct task_struct *p);
1629/* 1538/*
1630 * Perform scheduler related setup for a newly forked process p. 1539 * Perform scheduler related setup for a newly forked process p.
1631 * p is forked by current. 1540 * p is forked by current.
1632 */ 1541 *
1633void fastcall sched_fork(struct task_struct *p, int clone_flags) 1542 * __sched_fork() is basic setup used by init_idle() too:
1634{ 1543 */
1635 int cpu = get_cpu(); 1544static void __sched_fork(struct task_struct *p)
1545{
1546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
1636 1565
1637#ifdef CONFIG_SMP 1566 INIT_LIST_HEAD(&p->run_list);
1638 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1567 p->se.on_rq = 0;
1639#endif
1640 set_task_cpu(p, cpu);
1641 1568
1642 /* 1569 /*
1643 * We mark the process as running here, but have not actually 1570 * We mark the process as running here, but have not actually
@@ -1646,16 +1573,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1646 * event cannot wake it up and insert it on the runqueue either. 1573 * event cannot wake it up and insert it on the runqueue either.
1647 */ 1574 */
1648 p->state = TASK_RUNNING; 1575 p->state = TASK_RUNNING;
1576}
1577
1578/*
1579 * fork()/clone()-time setup:
1580 */
1581void sched_fork(struct task_struct *p, int clone_flags)
1582{
1583 int cpu = get_cpu();
1584
1585 __sched_fork(p);
1586
1587#ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589#endif
1590 __set_task_cpu(p, cpu);
1649 1591
1650 /* 1592 /*
1651 * Make sure we do not leak PI boosting priority to the child: 1593 * Make sure we do not leak PI boosting priority to the child:
1652 */ 1594 */
1653 p->prio = current->normal_prio; 1595 p->prio = current->normal_prio;
1654 1596
1655 INIT_LIST_HEAD(&p->run_list);
1656 p->array = NULL;
1657#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1658 if (unlikely(sched_info_on())) 1598 if (likely(sched_info_on()))
1659 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1660#endif 1600#endif
1661#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1601#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1665,34 +1605,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1665 /* Want to start with kernel preemption disabled. */ 1605 /* Want to start with kernel preemption disabled. */
1666 task_thread_info(p)->preempt_count = 1; 1606 task_thread_info(p)->preempt_count = 1;
1667#endif 1607#endif
1668 /*
1669 * Share the timeslice between parent and child, thus the
1670 * total amount of pending timeslices in the system doesn't change,
1671 * resulting in more scheduling fairness.
1672 */
1673 local_irq_disable();
1674 p->time_slice = (current->time_slice + 1) >> 1;
1675 /*
1676 * The remainder of the first timeslice might be recovered by
1677 * the parent if the child exits early enough.
1678 */
1679 p->first_time_slice = 1;
1680 current->time_slice >>= 1;
1681 p->timestamp = sched_clock();
1682 if (unlikely(!current->time_slice)) {
1683 /*
1684 * This case is rare, it happens when the parent has only
1685 * a single jiffy left from its timeslice. Taking the
1686 * runqueue lock is not a problem.
1687 */
1688 current->time_slice = 1;
1689 task_running_tick(cpu_rq(cpu), current);
1690 }
1691 local_irq_enable();
1692 put_cpu(); 1608 put_cpu();
1693} 1609}
1694 1610
1695/* 1611/*
1612 * After fork, child runs first. (default) If set to 0 then
1613 * parent will (try to) run first.
1614 */
1615unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617/*
1696 * wake_up_new_task - wake up a newly created task for the first time. 1618 * wake_up_new_task - wake up a newly created task for the first time.
1697 * 1619 *
1698 * This function will do some initial scheduler statistics housekeeping 1620 * This function will do some initial scheduler statistics housekeeping
@@ -1701,107 +1623,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1701 */ 1623 */
1702void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1624void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1703{ 1625{
1704 struct rq *rq, *this_rq;
1705 unsigned long flags; 1626 unsigned long flags;
1706 int this_cpu, cpu; 1627 struct rq *rq;
1628 int this_cpu;
1707 1629
1708 rq = task_rq_lock(p, &flags); 1630 rq = task_rq_lock(p, &flags);
1709 BUG_ON(p->state != TASK_RUNNING); 1631 BUG_ON(p->state != TASK_RUNNING);
1710 this_cpu = smp_processor_id(); 1632 this_cpu = smp_processor_id(); /* parent's CPU */
1711 cpu = task_cpu(p);
1712
1713 /*
1714 * We decrease the sleep average of forking parents
1715 * and children as well, to keep max-interactive tasks
1716 * from forking tasks that are max-interactive. The parent
1717 * (current) is done further down, under its lock.
1718 */
1719 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1720 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1721 1633
1722 p->prio = effective_prio(p); 1634 p->prio = effective_prio(p);
1723 1635
1724 if (likely(cpu == this_cpu)) { 1636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1725 if (!(clone_flags & CLONE_VM)) { 1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1726 /* 1638 activate_task(rq, p, 0);
1727 * The VM isn't cloned, so we're in a good position to
1728 * do child-runs-first in anticipation of an exec. This
1729 * usually avoids a lot of COW overhead.
1730 */
1731 if (unlikely(!current->array))
1732 __activate_task(p, rq);
1733 else {
1734 p->prio = current->prio;
1735 p->normal_prio = current->normal_prio;
1736 list_add_tail(&p->run_list, &current->run_list);
1737 p->array = current->array;
1738 p->array->nr_active++;
1739 inc_nr_running(p, rq);
1740 }
1741 set_need_resched();
1742 } else
1743 /* Run child last */
1744 __activate_task(p, rq);
1745 /*
1746 * We skip the following code due to cpu == this_cpu
1747 *
1748 * task_rq_unlock(rq, &flags);
1749 * this_rq = task_rq_lock(current, &flags);
1750 */
1751 this_rq = rq;
1752 } else { 1639 } else {
1753 this_rq = cpu_rq(this_cpu);
1754
1755 /* 1640 /*
1756 * Not the local CPU - must adjust timestamp. This should 1641 * Let the scheduling class do new task startup
1757 * get optimised away in the !CONFIG_SMP case. 1642 * management (if any):
1758 */ 1643 */
1759 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1644 p->sched_class->task_new(rq, p);
1760 + rq->most_recent_timestamp;
1761 __activate_task(p, rq);
1762 if (TASK_PREEMPTS_CURR(p, rq))
1763 resched_task(rq->curr);
1764
1765 /*
1766 * Parent and child are on different CPUs, now get the
1767 * parent runqueue to update the parent's ->sleep_avg:
1768 */
1769 task_rq_unlock(rq, &flags);
1770 this_rq = task_rq_lock(current, &flags);
1771 }
1772 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1773 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1774 task_rq_unlock(this_rq, &flags);
1775}
1776
1777/*
1778 * Potentially available exiting-child timeslices are
1779 * retrieved here - this way the parent does not get
1780 * penalized for creating too many threads.
1781 *
1782 * (this cannot be used to 'generate' timeslices
1783 * artificially, because any timeslice recovered here
1784 * was given away by the parent in the first place.)
1785 */
1786void fastcall sched_exit(struct task_struct *p)
1787{
1788 unsigned long flags;
1789 struct rq *rq;
1790
1791 /*
1792 * If the child was a (relative-) CPU hog then decrease
1793 * the sleep_avg of the parent as well.
1794 */
1795 rq = task_rq_lock(p->parent, &flags);
1796 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1797 p->parent->time_slice += p->time_slice;
1798 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1799 p->parent->time_slice = task_timeslice(p);
1800 } 1645 }
1801 if (p->sleep_avg < p->parent->sleep_avg) 1646 check_preempt_curr(rq, p);
1802 p->parent->sleep_avg = p->parent->sleep_avg /
1803 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1804 (EXIT_WEIGHT + 1);
1805 task_rq_unlock(rq, &flags); 1647 task_rq_unlock(rq, &flags);
1806} 1648}
1807 1649
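A minimal user-space sketch of the branch in wake_up_new_task() above (illustrative only; the helper name and arguments are invented): either the child is activated like any other task, or the class's task_new() hook gets to place it so that it runs before the parent.

#include <stdio.h>

static int child_runs_first = 1;        /* models sysctl_sched_child_runs_first */

/* returns 1 when the generic activate_task() path is taken */
static int use_generic_activation(int clone_vm, int same_cpu, int parent_on_rq)
{
        return !child_runs_first || clone_vm || !same_cpu || !parent_on_rq;
}

int main(void)
{
        /* plain fork(): no CLONE_VM, same CPU, parent still queued ->
         * the class-specific task_new() path is used instead (prints 0) */
        printf("generic activation: %d\n", use_generic_activation(0, 1, 1));
        return 0;
}

With sysctl_sched_child_runs_first cleared, the same argument combination would fall back to the generic activation path.
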
@@ -1866,7 +1708,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1866 /* 1708 /*
1867 * Remove function-return probe instances associated with this 1709 * Remove function-return probe instances associated with this
1868 * task and put them back on the free list. 1710 * task and put them back on the free list.
1869 */ 1711 */
1870 kprobe_flush_task(prev); 1712 kprobe_flush_task(prev);
1871 put_task_struct(prev); 1713 put_task_struct(prev);
1872 } 1714 }
@@ -1894,13 +1736,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1894 * context_switch - switch to the new MM and the new 1736 * context_switch - switch to the new MM and the new
1895 * thread's register state. 1737 * thread's register state.
1896 */ 1738 */
1897static inline struct task_struct * 1739static inline void
1898context_switch(struct rq *rq, struct task_struct *prev, 1740context_switch(struct rq *rq, struct task_struct *prev,
1899 struct task_struct *next) 1741 struct task_struct *next)
1900{ 1742{
1901 struct mm_struct *mm = next->mm; 1743 struct mm_struct *mm, *oldmm;
1902 struct mm_struct *oldmm = prev->active_mm;
1903 1744
1745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
1904 /* 1748 /*
1905 * For paravirt, this is coupled with an exit in switch_to to 1749 * For paravirt, this is coupled with an exit in switch_to to
1906 * combine the page table reload and the switch backend into 1750 * combine the page table reload and the switch backend into
@@ -1908,16 +1752,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1908 */ 1752 */
1909 arch_enter_lazy_cpu_mode(); 1753 arch_enter_lazy_cpu_mode();
1910 1754
1911 if (!mm) { 1755 if (unlikely(!mm)) {
1912 next->active_mm = oldmm; 1756 next->active_mm = oldmm;
1913 atomic_inc(&oldmm->mm_count); 1757 atomic_inc(&oldmm->mm_count);
1914 enter_lazy_tlb(oldmm, next); 1758 enter_lazy_tlb(oldmm, next);
1915 } else 1759 } else
1916 switch_mm(oldmm, mm, next); 1760 switch_mm(oldmm, mm, next);
1917 1761
1918 if (!prev->mm) { 1762 if (unlikely(!prev->mm)) {
1919 prev->active_mm = NULL; 1763 prev->active_mm = NULL;
1920 WARN_ON(rq->prev_mm);
1921 rq->prev_mm = oldmm; 1764 rq->prev_mm = oldmm;
1922 } 1765 }
1923 /* 1766 /*
@@ -1933,7 +1776,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1933 /* Here we just switch the register state and the stack. */ 1776 /* Here we just switch the register state and the stack. */
1934 switch_to(prev, next, prev); 1777 switch_to(prev, next, prev);
1935 1778
1936 return prev; 1779 barrier();
1780 /*
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1784 */
1785 finish_task_switch(this_rq(), prev);
1937} 1786}
1938 1787
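A plain-C model of the mm hand-off in context_switch() above (struct and function names here are made up, not the kernel API): a kernel thread borrows the previous task's active_mm and takes a reference, and when the previous task was itself borrowing, the drop is deferred until after the switch, which is what rq->prev_mm and finish_task_switch() arrange.

#include <stdio.h>
#include <stddef.h>

struct mm { int count; };
struct task { struct mm *mm, *active_mm; };

/* returns the mm whose borrowed reference must be dropped after the switch */
static struct mm *model_context_switch(struct task *prev, struct task *next)
{
        struct mm *oldmm = prev->active_mm;
        struct mm *drop_later = NULL;

        if (!next->mm) {                        /* kernel thread: borrow oldmm */
                next->active_mm = oldmm;
                oldmm->count++;
        } else {
                next->active_mm = next->mm;     /* switch_mm() in the real code */
        }
        if (!prev->mm) {                        /* prev was borrowing oldmm */
                prev->active_mm = NULL;
                drop_later = oldmm;             /* rq->prev_mm in the real code */
        }
        return drop_later;
}

int main(void)
{
        struct mm init_mm = { 1 };
        struct task user = { &init_mm, &init_mm };
        struct task kthread = { NULL, NULL };
        struct mm *pending;

        pending = model_context_switch(&user, &kthread);       /* user -> kthread */
        printf("refcount %d, deferred drop: %s\n",
               init_mm.count, pending ? "yes" : "no");          /* 2, no */

        pending = model_context_switch(&kthread, &user);       /* kthread -> user */
        printf("refcount %d, deferred drop: %s\n",
               init_mm.count, pending ? "yes" : "no");          /* 2, yes */
        return 0;
}
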
1939/* 1788/*
@@ -2006,17 +1855,65 @@ unsigned long nr_active(void)
2006 return running + uninterruptible; 1855 return running + uninterruptible;
2007} 1856}
2008 1857
2009#ifdef CONFIG_SMP
2010
2011/* 1858/*
2012 * Is this task likely cache-hot: 1859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
2013 */ 1861 */
2014static inline int 1862static void update_cpu_load(struct rq *this_rq)
2015task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2016{ 1863{
2017 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1870
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1874
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1877
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1880
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1883
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1886
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1889
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1892
1893 idle_delta64 = sample_interval64 - exec_delta64;
1894
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898 this_load = (unsigned long)tmp64;
1899
1900do_avg:
1901
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1905
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1910
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 }
2018} 1913}
2019 1914
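A stand-alone sketch of the cpu_load[] averaging loop above (user-space C; the index count and the constant sample load are assumptions for illustration): each index keeps a longer-lived average, weighting the new sample by 1/2^i.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

static void update_cpu_load_model(unsigned long this_load)
{
        int i;
        unsigned long scale;

        for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                /* scale is 1 << i: old*(2^i - 1)/2^i + new/2^i */
                cpu_load[i] = (cpu_load[i] * (scale - 1) + this_load) >> i;
        }
}

int main(void)
{
        int tick, i;

        for (tick = 0; tick < 10; tick++)
                update_cpu_load_model(1024);    /* constant NICE_0-sized load */
        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
        return 0;
}

The higher indices approach the input more slowly, which is the point of keeping several averaging horizons for the balancing heuristics.
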
1915#ifdef CONFIG_SMP
1916
2020/* 1917/*
2021 * double_rq_lock - safely lock two runqueues 1918 * double_rq_lock - safely lock two runqueues
2022 * 1919 *
@@ -2133,23 +2030,17 @@ void sched_exec(void)
2133 * pull_task - move a task from a remote runqueue to the local runqueue. 2030 * pull_task - move a task from a remote runqueue to the local runqueue.
2134 * Both runqueues must be locked. 2031 * Both runqueues must be locked.
2135 */ 2032 */
2136static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2033static void pull_task(struct rq *src_rq, struct task_struct *p,
2137 struct task_struct *p, struct rq *this_rq, 2034 struct rq *this_rq, int this_cpu)
2138 struct prio_array *this_array, int this_cpu)
2139{ 2035{
2140 dequeue_task(p, src_array); 2036 deactivate_task(src_rq, p, 0);
2141 dec_nr_running(p, src_rq);
2142 set_task_cpu(p, this_cpu); 2037 set_task_cpu(p, this_cpu);
2143 inc_nr_running(p, this_rq); 2038 activate_task(this_rq, p, 0);
2144 enqueue_task(p, this_array);
2145 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2146 + this_rq->most_recent_timestamp;
2147 /* 2039 /*
2148 * Note that idle threads have a prio of MAX_PRIO, for this test 2040 * Note that idle threads have a prio of MAX_PRIO, for this test
2149 * to be always true for them. 2041 * to be always true for them.
2150 */ 2042 */
2151 if (TASK_PREEMPTS_CURR(p, this_rq)) 2043 check_preempt_curr(this_rq, p);
2152 resched_task(this_rq->curr);
2153} 2044}
2154 2045

2155/* 2046/*
@@ -2157,7 +2048,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2157 */ 2048 */
2158static 2049static
2159int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2050int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2160 struct sched_domain *sd, enum idle_type idle, 2051 struct sched_domain *sd, enum cpu_idle_type idle,
2161 int *all_pinned) 2052 int *all_pinned)
2162{ 2053{
2163 /* 2054 /*
@@ -2174,132 +2065,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2174 return 0; 2065 return 0;
2175 2066
2176 /* 2067 /*
2177 * Aggressive migration if: 2068 * Aggressive migration if too many balance attempts have failed:
2178 * 1) task is cache cold, or
2179 * 2) too many balance attempts have failed.
2180 */ 2069 */
2181 2070 if (sd->nr_balance_failed > sd->cache_nice_tries)
2182 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2183#ifdef CONFIG_SCHEDSTATS
2184 if (task_hot(p, rq->most_recent_timestamp, sd))
2185 schedstat_inc(sd, lb_hot_gained[idle]);
2186#endif
2187 return 1; 2071 return 1;
2188 }
2189 2072
2190 if (task_hot(p, rq->most_recent_timestamp, sd))
2191 return 0;
2192 return 1; 2073 return 1;
2193} 2074}
2194 2075
2195#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2196
2197/*
2198 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2199 * load from busiest to this_rq, as part of a balancing operation within
2200 * "domain". Returns the number of tasks moved.
2201 *
2202 * Called with both runqueues locked.
2203 */
2204static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2205 unsigned long max_nr_move, unsigned long max_load_move, 2077 unsigned long max_nr_move, unsigned long max_load_move,
2206 struct sched_domain *sd, enum idle_type idle, 2078 struct sched_domain *sd, enum cpu_idle_type idle,
2207 int *all_pinned) 2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2208{ 2082{
2209 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2083 int pulled = 0, pinned = 0, skip_for_load;
2210 best_prio_seen, skip_for_load; 2084 struct task_struct *p;
2211 struct prio_array *array, *dst_array; 2085 long rem_load_move = max_load_move;
2212 struct list_head *head, *curr;
2213 struct task_struct *tmp;
2214 long rem_load_move;
2215 2086
2216 if (max_nr_move == 0 || max_load_move == 0) 2087 if (max_nr_move == 0 || max_load_move == 0)
2217 goto out; 2088 goto out;
2218 2089
2219 rem_load_move = max_load_move;
2220 pinned = 1; 2090 pinned = 1;
2221 this_best_prio = rq_best_prio(this_rq);
2222 best_prio = rq_best_prio(busiest);
2223 /*
2224 * Enable handling of the case where there is more than one task
2225 * with the best priority. If the current running task is one
2226 * of those with prio==best_prio we know it won't be moved
2227 * and therefore it's safe to override the skip (based on load) of
2228 * any task we find with that prio.
2229 */
2230 best_prio_seen = best_prio == busiest->curr->prio;
2231 2091
2232 /* 2092 /*
2233 * We first consider expired tasks. Those will likely not be 2093 * Start the load-balancing iterator:
2234 * executed in the near future, and they are most likely to
2235 * be cache-cold, thus switching CPUs has the least effect
2236 * on them.
2237 */ 2094 */
2238 if (busiest->expired->nr_active) { 2095 p = iterator->start(iterator->arg);
2239 array = busiest->expired; 2096next:
2240 dst_array = this_rq->expired; 2097 if (!p)
2241 } else {
2242 array = busiest->active;
2243 dst_array = this_rq->active;
2244 }
2245
2246new_array:
2247 /* Start searching at priority 0: */
2248 idx = 0;
2249skip_bitmap:
2250 if (!idx)
2251 idx = sched_find_first_bit(array->bitmap);
2252 else
2253 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2254 if (idx >= MAX_PRIO) {
2255 if (array == busiest->expired && busiest->active->nr_active) {
2256 array = busiest->active;
2257 dst_array = this_rq->active;
2258 goto new_array;
2259 }
2260 goto out; 2098 goto out;
2261 }
2262
2263 head = array->queue + idx;
2264 curr = head->prev;
2265skip_queue:
2266 tmp = list_entry(curr, struct task_struct, run_list);
2267
2268 curr = curr->prev;
2269
2270 /* 2099 /*
2271 * To help distribute high priority tasks across CPUs we don't 2100 * To help distribute high priority tasks across CPUs we don't
2272 * skip a task if it will be the highest priority task (i.e. smallest 2101 * skip a task if it will be the highest priority task (i.e. smallest
2273 * prio value) on its new queue regardless of its load weight 2102 * prio value) on its new queue regardless of its load weight
2274 */ 2103 */
2275 skip_for_load = tmp->load_weight > rem_load_move; 2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2276 if (skip_for_load && idx < this_best_prio) 2105 SCHED_LOAD_SCALE_FUZZ;
2277 skip_for_load = !best_prio_seen && idx == best_prio; 2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2278 if (skip_for_load || 2108 if (skip_for_load ||
2279 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2280 2110
2281 best_prio_seen |= idx == best_prio; 2111 best_prio_seen |= p->prio == best_prio;
2282 if (curr != head) 2112 p = iterator->next(iterator->arg);
2283 goto skip_queue; 2113 goto next;
2284 idx++;
2285 goto skip_bitmap;
2286 } 2114 }
2287 2115
2288 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2116 pull_task(busiest, p, this_rq, this_cpu);
2289 pulled++; 2117 pulled++;
2290 rem_load_move -= tmp->load_weight; 2118 rem_load_move -= p->se.load.weight;
2291 2119
2292 /* 2120 /*
2293 * We only want to steal up to the prescribed number of tasks 2121 * We only want to steal up to the prescribed number of tasks
2294 * and the prescribed amount of weighted load. 2122 * and the prescribed amount of weighted load.
2295 */ 2123 */
2296 if (pulled < max_nr_move && rem_load_move > 0) { 2124 if (pulled < max_nr_move && rem_load_move > 0) {
2297 if (idx < this_best_prio) 2125 if (p->prio < this_best_prio)
2298 this_best_prio = idx; 2126 this_best_prio = p->prio;
2299 if (curr != head) 2127 p = iterator->next(iterator->arg);
2300 goto skip_queue; 2128 goto next;
2301 idx++;
2302 goto skip_bitmap;
2303 } 2129 }
2304out: 2130out:
2305 /* 2131 /*
@@ -2311,18 +2137,48 @@ out:
2311 2137
2312 if (all_pinned) 2138 if (all_pinned)
2313 *all_pinned = pinned; 2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2314 return pulled; 2141 return pulled;
2315} 2142}
2316 2143
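A toy restatement of the skip test used above (not the kernel function; the FUZZ value and the numbers in main() are invented): a task is normally skipped once half its weight exceeds the remaining load budget, unless it would become the highest-priority task on the destination runqueue.

#include <stdio.h>

#define LOAD_SCALE_FUZZ (1024 / 8)      /* assumed stand-in for SCHED_LOAD_SCALE_FUZZ */

static int skip_candidate(unsigned long weight, long rem_load_move,
                          int prio, int this_best_prio,
                          int best_prio, int best_prio_seen)
{
        int skip = (long)(weight >> 1) > rem_load_move + LOAD_SCALE_FUZZ;

        if (skip && prio < this_best_prio)
                skip = !best_prio_seen && prio == best_prio;
        return skip;
}

int main(void)
{
        /* heavy task, small budget, but it would top the destination's
         * priorities and the source's best-prio task is already covered */
        printf("skip: %d\n", skip_candidate(3072, 100, 100, 120, 100, 1));
        return 0;
}

The best_prio_seen term mirrors the original move_tasks() logic: until a best-priority task has been accounted for, a same-priority candidate is still skipped so the source runqueue's running task is not left behind alone.
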
2317/* 2144/*
2145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
2148 *
2149 * Called with both runqueues locked.
2150 */
2151static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152 unsigned long max_nr_move, unsigned long max_load_move,
2153 struct sched_domain *sd, enum cpu_idle_type idle,
2154 int *all_pinned)
2155{
2156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
2159
2160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
2169
2170 return total_nr_moved;
2171}
2172
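A sketch of the class-by-class loop that move_tasks() now runs (stub types and the two toy classes below are assumptions, not the kernel API): each scheduling class is offered the remaining task and load budgets in turn, highest-priority class first.

#include <stdio.h>

struct stub_class {
        const struct stub_class *next;
        /* returns tasks moved, stores the weighted load it moved */
        unsigned long (*load_balance)(unsigned long max_nr, long max_load,
                                      unsigned long *load_moved);
};

static unsigned long rt_balance(unsigned long max_nr, long max_load,
                                unsigned long *load_moved)
{
        (void)max_nr; (void)max_load;
        *load_moved = 0;                /* pretend no RT task is movable */
        return 0;
}

static unsigned long fair_balance(unsigned long max_nr, long max_load,
                                  unsigned long *load_moved)
{
        (void)max_nr;
        *load_moved = max_load > 2048 ? 2048 : (unsigned long)max_load;
        return *load_moved ? 2 : 0;     /* say two tasks carried that load */
}

static const struct stub_class fair = { NULL,  fair_balance };
static const struct stub_class rt   = { &fair, rt_balance };

int main(void)
{
        const struct stub_class *class = &rt;   /* highest class first */
        unsigned long nr = 32, moved, total = 0;
        long rem_load = 4096;

        do {
                unsigned long n = class->load_balance(nr, rem_load, &moved);

                total += n;
                nr -= n;
                rem_load -= (long)moved;
                class = class->next;
        } while (class && nr && rem_load > 0);

        printf("moved %lu tasks, %ld load budget left\n", total, rem_load);
        return 0;
}
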
2173/*
2318 * find_busiest_group finds and returns the busiest CPU group within the 2174 * find_busiest_group finds and returns the busiest CPU group within the
2319 * domain. It calculates and returns the amount of weighted load which 2175 * domain. It calculates and returns the amount of weighted load which
2320 * should be moved to restore balance via the imbalance parameter. 2176 * should be moved to restore balance via the imbalance parameter.
2321 */ 2177 */
2322static struct sched_group * 2178static struct sched_group *
2323find_busiest_group(struct sched_domain *sd, int this_cpu, 2179find_busiest_group(struct sched_domain *sd, int this_cpu,
2324 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2180 unsigned long *imbalance, enum cpu_idle_type idle,
2325 cpumask_t *cpus, int *balance) 2181 int *sd_idle, cpumask_t *cpus, int *balance)
2326{ 2182{
2327 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2328 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2340,9 +2196,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2340 max_load = this_load = total_load = total_pwr = 0; 2196 max_load = this_load = total_load = total_pwr = 0;
2341 busiest_load_per_task = busiest_nr_running = 0; 2197 busiest_load_per_task = busiest_nr_running = 0;
2342 this_load_per_task = this_nr_running = 0; 2198 this_load_per_task = this_nr_running = 0;
2343 if (idle == NOT_IDLE) 2199 if (idle == CPU_NOT_IDLE)
2344 load_idx = sd->busy_idx; 2200 load_idx = sd->busy_idx;
2345 else if (idle == NEWLY_IDLE) 2201 else if (idle == CPU_NEWLY_IDLE)
2346 load_idx = sd->newidle_idx; 2202 load_idx = sd->newidle_idx;
2347 else 2203 else
2348 load_idx = sd->idle_idx; 2204 load_idx = sd->idle_idx;
@@ -2386,7 +2242,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2386 2242
2387 avg_load += load; 2243 avg_load += load;
2388 sum_nr_running += rq->nr_running; 2244 sum_nr_running += rq->nr_running;
2389 sum_weighted_load += rq->raw_weighted_load; 2245 sum_weighted_load += weighted_cpuload(i);
2390 } 2246 }
2391 2247
2392 /* 2248 /*
@@ -2426,8 +2282,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2426 * Busy processors will not participate in power savings 2282 * Busy processors will not participate in power savings
2427 * balance. 2283 * balance.
2428 */ 2284 */
2429 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2285 if (idle == CPU_NOT_IDLE ||
2430 goto group_next; 2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
2431 2288
2432 /* 2289 /*
2433 * If the local group is idle or completely loaded 2290 * If the local group is idle or completely loaded
@@ -2437,42 +2294,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 !this_nr_running)) 2294 !this_nr_running))
2438 power_savings_balance = 0; 2295 power_savings_balance = 0;
2439 2296
2440 /* 2297 /*
2441 * If a group is already running at full capacity or idle, 2298 * If a group is already running at full capacity or idle,
2442 * don't include that group in power savings calculations 2299 * don't include that group in power savings calculations
2443 */ 2300 */
2444 if (!power_savings_balance || sum_nr_running >= group_capacity 2301 if (!power_savings_balance || sum_nr_running >= group_capacity
2445 || !sum_nr_running) 2302 || !sum_nr_running)
2446 goto group_next; 2303 goto group_next;
2447 2304
2448 /* 2305 /*
2449 * Calculate the group which has the least non-idle load. 2306 * Calculate the group which has the least non-idle load.
2450 * This is the group from where we need to pick up the load 2307 * This is the group from where we need to pick up the load
2451 * for saving power 2308 * for saving power
2452 */ 2309 */
2453 if ((sum_nr_running < min_nr_running) || 2310 if ((sum_nr_running < min_nr_running) ||
2454 (sum_nr_running == min_nr_running && 2311 (sum_nr_running == min_nr_running &&
2455 first_cpu(group->cpumask) < 2312 first_cpu(group->cpumask) <
2456 first_cpu(group_min->cpumask))) { 2313 first_cpu(group_min->cpumask))) {
2457 group_min = group; 2314 group_min = group;
2458 min_nr_running = sum_nr_running; 2315 min_nr_running = sum_nr_running;
2459 min_load_per_task = sum_weighted_load / 2316 min_load_per_task = sum_weighted_load /
2460 sum_nr_running; 2317 sum_nr_running;
2461 } 2318 }
2462 2319
2463 /* 2320 /*
2464 * Calculate the group which is almost near its 2321 * Calculate the group which is almost near its
2465 * capacity but still has some space to pick up some load 2322 * capacity but still has some space to pick up some load
2466 * from other group and save more power 2323 * from other group and save more power
2467 */ 2324 */
2468 if (sum_nr_running <= group_capacity - 1) { 2325 if (sum_nr_running <= group_capacity - 1) {
2469 if (sum_nr_running > leader_nr_running || 2326 if (sum_nr_running > leader_nr_running ||
2470 (sum_nr_running == leader_nr_running && 2327 (sum_nr_running == leader_nr_running &&
2471 first_cpu(group->cpumask) > 2328 first_cpu(group->cpumask) >
2472 first_cpu(group_leader->cpumask))) { 2329 first_cpu(group_leader->cpumask))) {
2473 group_leader = group; 2330 group_leader = group;
2474 leader_nr_running = sum_nr_running; 2331 leader_nr_running = sum_nr_running;
2475 } 2332 }
2476 } 2333 }
2477group_next: 2334group_next:
2478#endif 2335#endif
@@ -2527,7 +2384,7 @@ group_next:
2527 * a think about bumping its value to force at least one task to be 2384 * a think about bumping its value to force at least one task to be
2528 * moved 2385 * moved
2529 */ 2386 */
2530 if (*imbalance < busiest_load_per_task) { 2387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2531 unsigned long tmp, pwr_now, pwr_move; 2388 unsigned long tmp, pwr_now, pwr_move;
2532 unsigned int imbn; 2389 unsigned int imbn;
2533 2390
@@ -2541,7 +2398,8 @@ small_imbalance:
2541 } else 2398 } else
2542 this_load_per_task = SCHED_LOAD_SCALE; 2399 this_load_per_task = SCHED_LOAD_SCALE;
2543 2400
2544 if (max_load - this_load >= busiest_load_per_task * imbn) { 2401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
2545 *imbalance = busiest_load_per_task; 2403 *imbalance = busiest_load_per_task;
2546 return busiest; 2404 return busiest;
2547 } 2405 }
@@ -2588,7 +2446,7 @@ small_imbalance:
2588 2446
2589out_balanced: 2447out_balanced:
2590#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2448#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2591 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2592 goto ret; 2450 goto ret;
2593 2451
2594 if (this == group_leader && group_leader != group_min) { 2452 if (this == group_leader && group_leader != group_min) {
@@ -2605,7 +2463,7 @@ ret:
2605 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2606 */ 2464 */
2607static struct rq * 2465static struct rq *
2608find_busiest_queue(struct sched_group *group, enum idle_type idle, 2466find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2609 unsigned long imbalance, cpumask_t *cpus) 2467 unsigned long imbalance, cpumask_t *cpus)
2610{ 2468{
2611 struct rq *busiest = NULL, *rq; 2469 struct rq *busiest = NULL, *rq;
@@ -2613,17 +2471,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2613 int i; 2471 int i;
2614 2472
2615 for_each_cpu_mask(i, group->cpumask) { 2473 for_each_cpu_mask(i, group->cpumask) {
2474 unsigned long wl;
2616 2475
2617 if (!cpu_isset(i, *cpus)) 2476 if (!cpu_isset(i, *cpus))
2618 continue; 2477 continue;
2619 2478
2620 rq = cpu_rq(i); 2479 rq = cpu_rq(i);
2480 wl = weighted_cpuload(i);
2621 2481
2622 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2482 if (rq->nr_running == 1 && wl > imbalance)
2623 continue; 2483 continue;
2624 2484
2625 if (rq->raw_weighted_load > max_load) { 2485 if (wl > max_load) {
2626 max_load = rq->raw_weighted_load; 2486 max_load = wl;
2627 busiest = rq; 2487 busiest = rq;
2628 } 2488 }
2629 } 2489 }
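An illustrative selection loop matching the rule above (the per-cpu numbers are invented): runqueues whose single task already exceeds the imbalance are skipped, otherwise the largest weighted_cpuload() wins.

#include <stdio.h>

struct cpu_stat { unsigned long weighted_load; unsigned int nr_running; };

static int find_busiest(const struct cpu_stat *cpus, int n,
                        unsigned long imbalance)
{
        unsigned long max_load = 0;
        int i, busiest = -1;

        for (i = 0; i < n; i++) {
                unsigned long wl = cpus[i].weighted_load;

                if (cpus[i].nr_running == 1 && wl > imbalance)
                        continue;       /* nothing could be pulled anyway */
                if (wl > max_load) {
                        max_load = wl;
                        busiest = i;
                }
        }
        return busiest;
}

int main(void)
{
        struct cpu_stat cpus[] = {
                { 4096, 1 },    /* one huge task: skipped */
                { 3072, 3 },
                { 1024, 1 },
        };

        printf("busiest cpu: %d\n", find_busiest(cpus, 3, 2048));
        return 0;
}
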
@@ -2647,7 +2507,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2647 * tasks if there is an imbalance. 2507 * tasks if there is an imbalance.
2648 */ 2508 */
2649static int load_balance(int this_cpu, struct rq *this_rq, 2509static int load_balance(int this_cpu, struct rq *this_rq,
2650 struct sched_domain *sd, enum idle_type idle, 2510 struct sched_domain *sd, enum cpu_idle_type idle,
2651 int *balance) 2511 int *balance)
2652{ 2512{
2653 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2660,10 +2520,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2660 /* 2520 /*
2661 * When power savings policy is enabled for the parent domain, idle 2521 * When power savings policy is enabled for the parent domain, idle
2662 * sibling can pick up load irrespective of busy siblings. In this case, 2522 * sibling can pick up load irrespective of busy siblings. In this case,
2663 * let the state of idle sibling percolate up as IDLE, instead of 2523 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2664 * portraying it as NOT_IDLE. 2524 * portraying it as CPU_NOT_IDLE.
2665 */ 2525 */
2666 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2667 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2668 sd_idle = 1; 2528 sd_idle = 1;
2669 2529
@@ -2797,7 +2657,7 @@ out_one_pinned:
2797 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2798 * tasks if there is an imbalance. 2658 * tasks if there is an imbalance.
2799 * 2659 *
2800 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2801 * this_rq is locked. 2661 * this_rq is locked.
2802 */ 2662 */
2803static int 2663static int
@@ -2814,31 +2674,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2814 * When power savings policy is enabled for the parent domain, idle 2674 * When power savings policy is enabled for the parent domain, idle
2815 * sibling can pick up load irrespective of busy siblings. In this case, 2675 * sibling can pick up load irrespective of busy siblings. In this case,
2816 * let the state of idle sibling percolate up as IDLE, instead of 2676 * let the state of idle sibling percolate up as IDLE, instead of
2817 * portraying it as NOT_IDLE. 2677 * portraying it as CPU_NOT_IDLE.
2818 */ 2678 */
2819 if (sd->flags & SD_SHARE_CPUPOWER && 2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2820 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2821 sd_idle = 1; 2681 sd_idle = 1;
2822 2682
2823 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2824redo: 2684redo:
2825 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2826 &sd_idle, &cpus, NULL); 2686 &sd_idle, &cpus, NULL);
2827 if (!group) { 2687 if (!group) {
2828 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2829 goto out_balanced; 2689 goto out_balanced;
2830 } 2690 }
2831 2691
2832 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2833 &cpus); 2693 &cpus);
2834 if (!busiest) { 2694 if (!busiest) {
2835 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2836 goto out_balanced; 2696 goto out_balanced;
2837 } 2697 }
2838 2698
2839 BUG_ON(busiest == this_rq); 2699 BUG_ON(busiest == this_rq);
2840 2700
2841 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2842 2702
2843 nr_moved = 0; 2703 nr_moved = 0;
2844 if (busiest->nr_running > 1) { 2704 if (busiest->nr_running > 1) {
@@ -2846,7 +2706,7 @@ redo:
2846 double_lock_balance(this_rq, busiest); 2706 double_lock_balance(this_rq, busiest);
2847 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2848 minus_1_or_zero(busiest->nr_running), 2708 minus_1_or_zero(busiest->nr_running),
2849 imbalance, sd, NEWLY_IDLE, NULL); 2709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2850 spin_unlock(&busiest->lock); 2710 spin_unlock(&busiest->lock);
2851 2711
2852 if (!nr_moved) { 2712 if (!nr_moved) {
@@ -2857,7 +2717,7 @@ redo:
2857 } 2717 }
2858 2718
2859 if (!nr_moved) { 2719 if (!nr_moved) {
2860 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2861 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2862 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2863 return -1; 2723 return -1;
@@ -2867,7 +2727,7 @@ redo:
2867 return nr_moved; 2727 return nr_moved;
2868 2728
2869out_balanced: 2729out_balanced:
2870 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2871 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2872 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2873 return -1; 2733 return -1;
@@ -2883,28 +2743,33 @@ out_balanced:
2883static void idle_balance(int this_cpu, struct rq *this_rq) 2743static void idle_balance(int this_cpu, struct rq *this_rq)
2884{ 2744{
2885 struct sched_domain *sd; 2745 struct sched_domain *sd;
2886 int pulled_task = 0; 2746 int pulled_task = -1;
2887 unsigned long next_balance = jiffies + 60 * HZ; 2747 unsigned long next_balance = jiffies + HZ;
2888 2748
2889 for_each_domain(this_cpu, sd) { 2749 for_each_domain(this_cpu, sd) {
2890 if (sd->flags & SD_BALANCE_NEWIDLE) { 2750 unsigned long interval;
2751
2752 if (!(sd->flags & SD_LOAD_BALANCE))
2753 continue;
2754
2755 if (sd->flags & SD_BALANCE_NEWIDLE)
2891 /* If we've pulled tasks over stop searching: */ 2756 /* If we've pulled tasks over stop searching: */
2892 pulled_task = load_balance_newidle(this_cpu, 2757 pulled_task = load_balance_newidle(this_cpu,
2893 this_rq, sd); 2758 this_rq, sd);
2894 if (time_after(next_balance, 2759
2895 sd->last_balance + sd->balance_interval)) 2760 interval = msecs_to_jiffies(sd->balance_interval);
2896 next_balance = sd->last_balance 2761 if (time_after(next_balance, sd->last_balance + interval))
2897 + sd->balance_interval; 2762 next_balance = sd->last_balance + interval;
2898 if (pulled_task) 2763 if (pulled_task)
2899 break; 2764 break;
2900 }
2901 } 2765 }
2902 if (!pulled_task) 2766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2903 /* 2767 /*
2904 * We are going idle. next_balance may be set based on 2768 * We are going idle. next_balance may be set based on
2905 * a busy processor. So reset next_balance. 2769 * a busy processor. So reset next_balance.
2906 */ 2770 */
2907 this_rq->next_balance = next_balance; 2771 this_rq->next_balance = next_balance;
2772 }
2908} 2773}
2909 2774
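A small model of how idle_balance() above derives next_balance (HZ and the intervals are assumptions): take the earliest last_balance + interval across the domains, starting from a default of one second ahead.

#include <stdio.h>

#define HZ 250

struct domain { unsigned long last_balance; unsigned int interval_ms; };

static unsigned long msecs_to_ticks(unsigned int ms)
{
        return (ms * HZ + 999) / 1000;          /* round up like msecs_to_jiffies */
}

int main(void)
{
        struct domain sds[] = { { 1000, 8 }, { 1000, 64 } };
        unsigned long jiffies = 1010, next = jiffies + HZ;
        int i;

        for (i = 0; i < 2; i++) {
                unsigned long t = sds[i].last_balance +
                                  msecs_to_ticks(sds[i].interval_ms);

                if (next > t)   /* time_after(next, t), ignoring wraparound */
                        next = t;
        }
        printf("next balance at tick %lu\n", next);
        return 0;
}
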
2910/* 2775/*
@@ -2948,7 +2813,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2948 schedstat_inc(sd, alb_cnt); 2813 schedstat_inc(sd, alb_cnt);
2949 2814
2950 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2951 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2952 NULL)) 2817 NULL))
2953 schedstat_inc(sd, alb_pushed); 2818 schedstat_inc(sd, alb_pushed);
2954 else 2819 else
@@ -2957,32 +2822,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2957 spin_unlock(&target_rq->lock); 2822 spin_unlock(&target_rq->lock);
2958} 2823}
2959 2824
2960static void update_load(struct rq *this_rq)
2961{
2962 unsigned long this_load;
2963 unsigned int i, scale;
2964
2965 this_load = this_rq->raw_weighted_load;
2966
2967 /* Update our load: */
2968 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
2969 unsigned long old_load, new_load;
2970
2971 /* scale is effectively 1 << i now, and >> i divides by scale */
2972
2973 old_load = this_rq->cpu_load[i];
2974 new_load = this_load;
2975 /*
2976 * Round up the averaging division if load is increasing. This
2977 * prevents us from getting stuck on 9 if the load is 10, for
2978 * example.
2979 */
2980 if (new_load > old_load)
2981 new_load += scale-1;
2982 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2983 }
2984}
2985
2986#ifdef CONFIG_NO_HZ 2825#ifdef CONFIG_NO_HZ
2987static struct { 2826static struct {
2988 atomic_t load_balancer; 2827 atomic_t load_balancer;
@@ -3065,7 +2904,7 @@ static DEFINE_SPINLOCK(balancing);
3065 * 2904 *
3066 * Balancing parameters are set up in arch_init_sched_domains. 2905 * Balancing parameters are set up in arch_init_sched_domains.
3067 */ 2906 */
3068static inline void rebalance_domains(int cpu, enum idle_type idle) 2907static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3069{ 2908{
3070 int balance = 1; 2909 int balance = 1;
3071 struct rq *rq = cpu_rq(cpu); 2910 struct rq *rq = cpu_rq(cpu);
@@ -3079,13 +2918,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3079 continue; 2918 continue;
3080 2919
3081 interval = sd->balance_interval; 2920 interval = sd->balance_interval;
3082 if (idle != SCHED_IDLE) 2921 if (idle != CPU_IDLE)
3083 interval *= sd->busy_factor; 2922 interval *= sd->busy_factor;
3084 2923
3085 /* scale ms to jiffies */ 2924 /* scale ms to jiffies */
3086 interval = msecs_to_jiffies(interval); 2925 interval = msecs_to_jiffies(interval);
3087 if (unlikely(!interval)) 2926 if (unlikely(!interval))
3088 interval = 1; 2927 interval = 1;
2928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2930
3089 2931
3090 if (sd->flags & SD_SERIALIZE) { 2932 if (sd->flags & SD_SERIALIZE) {
3091 if (!spin_trylock(&balancing)) 2933 if (!spin_trylock(&balancing))
@@ -3099,7 +2941,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3099 * longer idle, or one of our SMT siblings is 2941 * longer idle, or one of our SMT siblings is
3100 * not idle. 2942 * not idle.
3101 */ 2943 */
3102 idle = NOT_IDLE; 2944 idle = CPU_NOT_IDLE;
3103 } 2945 }
3104 sd->last_balance = jiffies; 2946 sd->last_balance = jiffies;
3105 } 2947 }
@@ -3127,11 +2969,12 @@ out:
3127 */ 2969 */
3128static void run_rebalance_domains(struct softirq_action *h) 2970static void run_rebalance_domains(struct softirq_action *h)
3129{ 2971{
3130 int local_cpu = smp_processor_id(); 2972 int this_cpu = smp_processor_id();
3131 struct rq *local_rq = cpu_rq(local_cpu); 2973 struct rq *this_rq = cpu_rq(this_cpu);
3132 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
3133 2976
3134 rebalance_domains(local_cpu, idle); 2977 rebalance_domains(this_cpu, idle);
3135 2978
3136#ifdef CONFIG_NO_HZ 2979#ifdef CONFIG_NO_HZ
3137 /* 2980 /*
@@ -3139,13 +2982,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3139 * balancing on behalf of the other idle cpus whose ticks are 2982 * balancing on behalf of the other idle cpus whose ticks are
3140 * stopped. 2983 * stopped.
3141 */ 2984 */
3142 if (local_rq->idle_at_tick && 2985 if (this_rq->idle_at_tick &&
3143 atomic_read(&nohz.load_balancer) == local_cpu) { 2986 atomic_read(&nohz.load_balancer) == this_cpu) {
3144 cpumask_t cpus = nohz.cpu_mask; 2987 cpumask_t cpus = nohz.cpu_mask;
3145 struct rq *rq; 2988 struct rq *rq;
3146 int balance_cpu; 2989 int balance_cpu;
3147 2990
3148 cpu_clear(local_cpu, cpus); 2991 cpu_clear(this_cpu, cpus);
3149 for_each_cpu_mask(balance_cpu, cpus) { 2992 for_each_cpu_mask(balance_cpu, cpus) {
3150 /* 2993 /*
3151 * If this cpu gets work to do, stop the load balancing 2994 * If this cpu gets work to do, stop the load balancing
@@ -3158,8 +3001,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3158 rebalance_domains(balance_cpu, SCHED_IDLE); 3001 rebalance_domains(balance_cpu, SCHED_IDLE);
3159 3002
3160 rq = cpu_rq(balance_cpu); 3003 rq = cpu_rq(balance_cpu);
3161 if (time_after(local_rq->next_balance, rq->next_balance)) 3004 if (time_after(this_rq->next_balance, rq->next_balance))
3162 local_rq->next_balance = rq->next_balance; 3005 this_rq->next_balance = rq->next_balance;
3163 } 3006 }
3164 } 3007 }
3165#endif 3008#endif
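A sketch of the nohz hand-off above under assumed values (four CPUs, made-up next_balance ticks): the chosen idle load balancer rebalances on behalf of the other tickless CPUs and adopts the earliest of their next_balance deadlines.

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
        unsigned long next_balance[NR_CPUS] = { 500, 420, 470, 455 };
        int this_cpu = 0, cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu == this_cpu)
                        continue;
                /* rebalance_domains(cpu, CPU_IDLE) would run here */
                if (next_balance[this_cpu] > next_balance[cpu])
                        next_balance[this_cpu] = next_balance[cpu];
        }
        printf("owner's next_balance: %lu\n", next_balance[this_cpu]);
        return 0;
}
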
@@ -3172,9 +3015,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3172 * idle load balancing owner or decide to stop the periodic load balancing, 3015 * idle load balancing owner or decide to stop the periodic load balancing,
3173 * if the whole system is idle. 3016 * if the whole system is idle.
3174 */ 3017 */
3175static inline void trigger_load_balance(int cpu) 3018static inline void trigger_load_balance(struct rq *rq, int cpu)
3176{ 3019{
3177 struct rq *rq = cpu_rq(cpu);
3178#ifdef CONFIG_NO_HZ 3020#ifdef CONFIG_NO_HZ
3179 /* 3021 /*
3180 * If we were in the nohz mode recently and busy at the current 3022 * If we were in the nohz mode recently and busy at the current
@@ -3226,13 +3068,29 @@ static inline void trigger_load_balance(int cpu)
3226 if (time_after_eq(jiffies, rq->next_balance)) 3068 if (time_after_eq(jiffies, rq->next_balance))
3227 raise_softirq(SCHED_SOFTIRQ); 3069 raise_softirq(SCHED_SOFTIRQ);
3228} 3070}
3229#else 3071
3072#else /* CONFIG_SMP */
3073
3230/* 3074/*
3231 * on UP we do not need to balance between CPUs: 3075 * on UP we do not need to balance between CPUs:
3232 */ 3076 */
3233static inline void idle_balance(int cpu, struct rq *rq) 3077static inline void idle_balance(int cpu, struct rq *rq)
3234{ 3078{
3235} 3079}
3080
3081/* Avoid "used but not defined" warning on UP */
3082static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3088{
3089 *load_moved = 0;
3090
3091 return 0;
3092}
3093
3236#endif 3094#endif
3237 3095
3238DEFINE_PER_CPU(struct kernel_stat, kstat); 3096DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3240,54 +3098,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3240EXPORT_PER_CPU_SYMBOL(kstat); 3098EXPORT_PER_CPU_SYMBOL(kstat);
3241 3099
3242/* 3100/*
3243 * This is called on clock ticks and on context switches. 3101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3244 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3102 * that have not yet been banked in case the task is currently running.
3245 */ 3103 */
3246static inline void 3104unsigned long long task_sched_runtime(struct task_struct *p)
3247update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3248{
3249 p->sched_time += now - p->last_ran;
3250 p->last_ran = rq->most_recent_timestamp = now;
3251}
3252
3253/*
3254 * Return current->sched_time plus any more ns on the sched_clock
3255 * that have not yet been banked.
3256 */
3257unsigned long long current_sched_time(const struct task_struct *p)
3258{ 3105{
3259 unsigned long long ns;
3260 unsigned long flags; 3106 unsigned long flags;
3107 u64 ns, delta_exec;
3108 struct rq *rq;
3261 3109
3262 local_irq_save(flags); 3110 rq = task_rq_lock(p, &flags);
3263 ns = p->sched_time + sched_clock() - p->last_ran; 3111 ns = p->se.sum_exec_runtime;
3264 local_irq_restore(flags); 3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3116 }
3117 task_rq_unlock(rq, &flags);
3265 3118
3266 return ns; 3119 return ns;
3267} 3120}
3268 3121
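The accounting rule above in stand-alone form (field names shortened, values invented): the banked sum_exec_runtime plus, for a task currently on the CPU, the not-yet-banked slice since exec_start.

#include <stdio.h>

typedef unsigned long long u64;

struct task_stat { u64 sum_exec_runtime, exec_start; int running; };

static u64 task_runtime(const struct task_stat *t, u64 now)
{
        u64 ns = t->sum_exec_runtime;

        if (t->running && now > t->exec_start)
                ns += now - t->exec_start;      /* in-flight delta_exec */
        return ns;
}

int main(void)
{
        struct task_stat t = { 5000000ULL, 7000000ULL, 1 };

        printf("%llu ns\n", task_runtime(&t, 7250000ULL));      /* 5.25 ms */
        return 0;
}
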
3269/* 3122/*
3270 * We place interactive tasks back into the active array, if possible.
3271 *
3272 * To guarantee that this does not starve expired tasks we ignore the
3273 * interactivity of a task if the first expired task had to wait more
3274 * than a 'reasonable' amount of time. This deadline timeout is
3275 * load-dependent, as the frequency of array switched decreases with
3276 * increasing number of running tasks. We also ignore the interactivity
3277 * if a better static_prio task has expired:
3278 */
3279static inline int expired_starving(struct rq *rq)
3280{
3281 if (rq->curr->static_prio > rq->best_expired_prio)
3282 return 1;
3283 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3284 return 0;
3285 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3286 return 1;
3287 return 0;
3288}
3289
3290/*
3291 * Account user cpu time to a process. 3123 * Account user cpu time to a process.
3292 * @p: the process that the cpu time gets accounted to 3124 * @p: the process that the cpu time gets accounted to
3293 * @hardirq_offset: the offset to subtract from hardirq_count() 3125 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3360,81 +3192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3360 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3361} 3193}
3362 3194
3363static void task_running_tick(struct rq *rq, struct task_struct *p)
3364{
3365 if (p->array != rq->active) {
3366 /* Task has expired but was not scheduled yet */
3367 set_tsk_need_resched(p);
3368 return;
3369 }
3370 spin_lock(&rq->lock);
3371 /*
3372 * The task was running during this tick - update the
3373 * time slice counter. Note: we do not update a thread's
3374 * priority until it either goes to sleep or uses up its
3375 * timeslice. This makes it possible for interactive tasks
3376 * to use up their timeslices at their highest priority levels.
3377 */
3378 if (rt_task(p)) {
3379 /*
3380 * RR tasks need a special form of timeslice management.
3381 * FIFO tasks have no timeslices.
3382 */
3383 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3384 p->time_slice = task_timeslice(p);
3385 p->first_time_slice = 0;
3386 set_tsk_need_resched(p);
3387
3388 /* put it at the end of the queue: */
3389 requeue_task(p, rq->active);
3390 }
3391 goto out_unlock;
3392 }
3393 if (!--p->time_slice) {
3394 dequeue_task(p, rq->active);
3395 set_tsk_need_resched(p);
3396 p->prio = effective_prio(p);
3397 p->time_slice = task_timeslice(p);
3398 p->first_time_slice = 0;
3399
3400 if (!rq->expired_timestamp)
3401 rq->expired_timestamp = jiffies;
3402 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3403 enqueue_task(p, rq->expired);
3404 if (p->static_prio < rq->best_expired_prio)
3405 rq->best_expired_prio = p->static_prio;
3406 } else
3407 enqueue_task(p, rq->active);
3408 } else {
3409 /*
3410 * Prevent a too long timeslice allowing a task to monopolize
3411 * the CPU. We do this by splitting up the timeslice into
3412 * smaller pieces.
3413 *
3414 * Note: this does not mean the task's timeslices expire or
3415 * get lost in any way, they just might be preempted by
3416 * another task of equal priority. (one with higher
3417 * priority would have preempted this task already.) We
3418 * requeue this task to the end of the list on this priority
3419 * level, which is in essence a round-robin of tasks with
3420 * equal priority.
3421 *
3422 * This only applies to tasks in the interactive
3423 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3424 */
3425 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3426 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3427 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3428 (p->array == rq->active)) {
3429
3430 requeue_task(p, rq->active);
3431 set_tsk_need_resched(p);
3432 }
3433 }
3434out_unlock:
3435 spin_unlock(&rq->lock);
3436}
3437
3438/* 3195/*
3439 * This function gets called by the timer code, with HZ frequency. 3196 * This function gets called by the timer code, with HZ frequency.
3440 * We call it with interrupts disabled. 3197 * We call it with interrupts disabled.
@@ -3444,20 +3201,19 @@ out_unlock:
3444 */ 3201 */
3445void scheduler_tick(void) 3202void scheduler_tick(void)
3446{ 3203{
3447 unsigned long long now = sched_clock();
3448 struct task_struct *p = current;
3449 int cpu = smp_processor_id(); 3204 int cpu = smp_processor_id();
3450 int idle_at_tick = idle_cpu(cpu);
3451 struct rq *rq = cpu_rq(cpu); 3205 struct rq *rq = cpu_rq(cpu);
3206 struct task_struct *curr = rq->curr;
3452 3207
3453 update_cpu_clock(p, rq, now); 3208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3454 3213
3455 if (!idle_at_tick)
3456 task_running_tick(rq, p);
3457#ifdef CONFIG_SMP 3214#ifdef CONFIG_SMP
3458 update_load(rq); 3215 rq->idle_at_tick = idle_cpu(cpu);
3459 rq->idle_at_tick = idle_at_tick; 3216 trigger_load_balance(rq, cpu);
3460 trigger_load_balance(cpu);
3461#endif 3217#endif
3462} 3218}
3463 3219
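A stub-typed sketch (none of these names are the kernel's) of how thin the tick has become: the core only calls the running task's class hook and refreshes the load statistics, while timeslice bookkeeping now lives inside the classes.

#include <stdio.h>

struct rq_m;
struct sched_class_m { void (*task_tick)(struct rq_m *rq); };
struct rq_m { const struct sched_class_m *curr_class; int load_updates; };

static void fair_task_tick(struct rq_m *rq)
{
        (void)rq;       /* a class hook would update runtime stats here */
        printf("fair_sched_class.task_tick()\n");
}

static const struct sched_class_m fair_m = { fair_task_tick };

static void scheduler_tick_model(struct rq_m *rq)
{
        rq->curr_class->task_tick(rq);  /* per-class accounting */
        rq->load_updates++;             /* stands in for update_cpu_load() */
}

int main(void)
{
        struct rq_m rq = { &fair_m, 0 };

        scheduler_tick_model(&rq);
        printf("load updates: %d\n", rq.load_updates);
        return 0;
}
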
@@ -3499,170 +3255,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3499 3255
3500#endif 3256#endif
3501 3257
3502static inline int interactive_sleep(enum sleep_type sleep_type) 3258/*
3259 * Print scheduling while atomic bug:
3260 */
3261static noinline void __schedule_bug(struct task_struct *prev)
3503{ 3262{
3504 return (sleep_type == SLEEP_INTERACTIVE || 3263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3505 sleep_type == SLEEP_INTERRUPTED); 3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3506} 3269}
3507 3270
3508/* 3271/*
3509 * schedule() is the main scheduler function. 3272 * Various schedule()-time debugging checks and statistics:
3510 */ 3273 */
3511asmlinkage void __sched schedule(void) 3274static inline void schedule_debug(struct task_struct *prev)
3512{ 3275{
3513 struct task_struct *prev, *next;
3514 struct prio_array *array;
3515 struct list_head *queue;
3516 unsigned long long now;
3517 unsigned long run_time;
3518 int cpu, idx, new_prio;
3519 long *switch_count;
3520 struct rq *rq;
3521
3522 /* 3276 /*
3523 * Test if we are atomic. Since do_exit() needs to call into 3277 * Test if we are atomic. Since do_exit() needs to call into
3524 * schedule() atomically, we ignore that path for now. 3278 * schedule() atomically, we ignore that path for now.
3525 * Otherwise, whine if we are scheduling when we should not be. 3279 * Otherwise, whine if we are scheduling when we should not be.
3526 */ 3280 */
3527 if (unlikely(in_atomic() && !current->exit_state)) { 3281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3528 printk(KERN_ERR "BUG: scheduling while atomic: " 3282 __schedule_bug(prev);
3529 "%s/0x%08x/%d\n",
3530 current->comm, preempt_count(), current->pid);
3531 debug_show_held_locks(current);
3532 if (irqs_disabled())
3533 print_irqtrace_events(current);
3534 dump_stack();
3535 }
3536 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3537 3283
3538need_resched: 3284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3539 preempt_disable();
3540 prev = current;
3541 release_kernel_lock(prev);
3542need_resched_nonpreemptible:
3543 rq = this_rq();
3544 3285
3545 /* 3286 schedstat_inc(this_rq(), sched_cnt);
3546 * The idle thread is not allowed to schedule! 3287}
3547 * Remove this check after it has been exercised a bit.
3548 */
3549 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3550 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3551 dump_stack();
3552 }
3553 3288
3554 schedstat_inc(rq, sched_cnt); 3289/*
3555 now = sched_clock(); 3290 * Pick up the highest-prio task:
3556 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3291 */
3557 run_time = now - prev->timestamp; 3292static inline struct task_struct *
3558 if (unlikely((long long)(now - prev->timestamp) < 0)) 3293pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3559 run_time = 0; 3294{
3560 } else 3295 struct sched_class *class;
3561 run_time = NS_MAX_SLEEP_AVG; 3296 struct task_struct *p;
3562 3297
3563 /* 3298 /*
3564 * Tasks charged proportionately less run_time at high sleep_avg to 3299 * Optimization: we know that if all tasks are in
3565 * delay them losing their interactive status 3300 * the fair class we can call that function directly:
3566 */ 3301 */
3567 run_time /= (CURRENT_BONUS(prev) ? : 1); 3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3568 3303 p = fair_sched_class.pick_next_task(rq, now);
3569 spin_lock_irq(&rq->lock); 3304 if (likely(p))
3570 3305 return p;
3571 switch_count = &prev->nivcsw;
3572 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3573 switch_count = &prev->nvcsw;
3574 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3575 unlikely(signal_pending(prev))))
3576 prev->state = TASK_RUNNING;
3577 else {
3578 if (prev->state == TASK_UNINTERRUPTIBLE)
3579 rq->nr_uninterruptible++;
3580 deactivate_task(prev, rq);
3581 }
3582 } 3306 }
3583 3307
3584 cpu = smp_processor_id(); 3308 class = sched_class_highest;
3585 if (unlikely(!rq->nr_running)) { 3309 for ( ; ; ) {
3586 idle_balance(cpu, rq); 3310 p = class->pick_next_task(rq, now);
3587 if (!rq->nr_running) { 3311 if (p)
3588 next = rq->idle; 3312 return p;
3589 rq->expired_timestamp = 0;
3590 goto switch_tasks;
3591 }
3592 }
3593
3594 array = rq->active;
3595 if (unlikely(!array->nr_active)) {
3596 /* 3313 /*
3597 * Switch the active and expired arrays. 3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3598 */ 3316 */
3599 schedstat_inc(rq, sched_switch); 3317 class = class->next;
3600 rq->active = rq->expired;
3601 rq->expired = array;
3602 array = rq->active;
3603 rq->expired_timestamp = 0;
3604 rq->best_expired_prio = MAX_PRIO;
3605 } 3318 }
3319}
3320
3321/*
3322 * schedule() is the main scheduler function.
3323 */
3324asmlinkage void __sched schedule(void)
3325{
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3606 3331
3607 idx = sched_find_first_bit(array->bitmap); 3332need_resched:
3608 queue = array->queue + idx; 3333 preempt_disable();
3609 next = list_entry(queue->next, struct task_struct, run_list); 3334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3610 3339
3611 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3340 release_kernel_lock(prev);
3612 unsigned long long delta = now - next->timestamp; 3341need_resched_nonpreemptible:
3613 if (unlikely((long long)(now - next->timestamp) < 0))
3614 delta = 0;
3615 3342
3616 if (next->sleep_type == SLEEP_INTERACTIVE) 3343 schedule_debug(prev);
3617 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3618 3344
3619 array = next->array; 3345 spin_lock_irq(&rq->lock);
3620 new_prio = recalc_task_prio(next, next->timestamp + delta); 3346 clear_tsk_need_resched(prev);
3621 3347
3622 if (unlikely(next->prio != new_prio)) { 3348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3623 dequeue_task(next, array); 3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3624 next->prio = new_prio; 3350 unlikely(signal_pending(prev)))) {
3625 enqueue_task(next, array); 3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3626 } 3354 }
3355 switch_count = &prev->nvcsw;
3627 } 3356 }
3628 next->sleep_type = SLEEP_NORMAL;
3629switch_tasks:
3630 if (next == rq->idle)
3631 schedstat_inc(rq, sched_goidle);
3632 prefetch(next);
3633 prefetch_stack(next);
3634 clear_tsk_need_resched(prev);
3635 rcu_qsctr_inc(task_cpu(prev));
3636 3357
3637 update_cpu_clock(prev, rq, now); 3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3638 3360
3639 prev->sleep_avg -= run_time; 3361 now = __rq_clock(rq);
3640 if ((long)prev->sleep_avg <= 0) 3362 prev->sched_class->put_prev_task(rq, prev, now);
3641 prev->sleep_avg = 0; 3363 next = pick_next_task(rq, prev, now);
3642 prev->timestamp = prev->last_ran = now;
3643 3364
3644 sched_info_switch(prev, next); 3365 sched_info_switch(prev, next);
3366
3645 if (likely(prev != next)) { 3367 if (likely(prev != next)) {
3646 next->timestamp = next->last_ran = now;
3647 rq->nr_switches++; 3368 rq->nr_switches++;
3648 rq->curr = next; 3369 rq->curr = next;
3649 ++*switch_count; 3370 ++*switch_count;
3650 3371
3651 prepare_task_switch(rq, next); 3372 context_switch(rq, prev, next); /* unlocks the rq */
3652 prev = context_switch(rq, prev, next);
3653 barrier();
3654 /*
3655 * this_rq must be evaluated again because prev may have moved
3656 * CPUs since it called schedule(), thus the 'rq' on its stack
3657 * frame will be invalid.
3658 */
3659 finish_task_switch(this_rq(), prev);
3660 } else 3373 } else
3661 spin_unlock_irq(&rq->lock); 3374 spin_unlock_irq(&rq->lock);
3662 3375
3663 prev = current; 3376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3664 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
3665 goto need_resched_nonpreemptible; 3379 goto need_resched_nonpreemptible;
3380 }
3666 preempt_enable_no_resched(); 3381 preempt_enable_no_resched();
3667 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3668 goto need_resched; 3383 goto need_resched;
@@ -3990,74 +3705,85 @@ out:
3990} 3705}
3991EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3706EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3992 3707
3993 3708static inline void
3994#define SLEEP_ON_VAR \ 3709sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3995 unsigned long flags; \ 3710{
3996 wait_queue_t wait; \ 3711 spin_lock_irqsave(&q->lock, *flags);
3997 init_waitqueue_entry(&wait, current); 3712 __add_wait_queue(q, wait);
3998
3999#define SLEEP_ON_HEAD \
4000 spin_lock_irqsave(&q->lock,flags); \
4001 __add_wait_queue(q, &wait); \
4002 spin_unlock(&q->lock); 3713 spin_unlock(&q->lock);
3714}
4003 3715
4004#define SLEEP_ON_TAIL \ 3716static inline void
4005 spin_lock_irq(&q->lock); \ 3717sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4006 __remove_wait_queue(q, &wait); \ 3718{
4007 spin_unlock_irqrestore(&q->lock, flags); 3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3722}
4008 3723
4009void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3724void __sched interruptible_sleep_on(wait_queue_head_t *q)
4010{ 3725{
4011 SLEEP_ON_VAR 3726 unsigned long flags;
3727 wait_queue_t wait;
3728
3729 init_waitqueue_entry(&wait, current);
4012 3730
4013 current->state = TASK_INTERRUPTIBLE; 3731 current->state = TASK_INTERRUPTIBLE;
4014 3732
4015 SLEEP_ON_HEAD 3733 sleep_on_head(q, &wait, &flags);
4016 schedule(); 3734 schedule();
4017 SLEEP_ON_TAIL 3735 sleep_on_tail(q, &wait, &flags);
4018} 3736}
4019EXPORT_SYMBOL(interruptible_sleep_on); 3737EXPORT_SYMBOL(interruptible_sleep_on);
4020 3738
4021long fastcall __sched 3739long __sched
4022interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3740interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4023{ 3741{
4024 SLEEP_ON_VAR 3742 unsigned long flags;
3743 wait_queue_t wait;
3744
3745 init_waitqueue_entry(&wait, current);
4025 3746
4026 current->state = TASK_INTERRUPTIBLE; 3747 current->state = TASK_INTERRUPTIBLE;
4027 3748
4028 SLEEP_ON_HEAD 3749 sleep_on_head(q, &wait, &flags);
4029 timeout = schedule_timeout(timeout); 3750 timeout = schedule_timeout(timeout);
4030 SLEEP_ON_TAIL 3751 sleep_on_tail(q, &wait, &flags);
4031 3752
4032 return timeout; 3753 return timeout;
4033} 3754}
4034EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3755EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4035 3756
4036void fastcall __sched sleep_on(wait_queue_head_t *q) 3757void __sched sleep_on(wait_queue_head_t *q)
4037{ 3758{
4038 SLEEP_ON_VAR 3759 unsigned long flags;
3760 wait_queue_t wait;
3761
3762 init_waitqueue_entry(&wait, current);
4039 3763
4040 current->state = TASK_UNINTERRUPTIBLE; 3764 current->state = TASK_UNINTERRUPTIBLE;
4041 3765
4042 SLEEP_ON_HEAD 3766 sleep_on_head(q, &wait, &flags);
4043 schedule(); 3767 schedule();
4044 SLEEP_ON_TAIL 3768 sleep_on_tail(q, &wait, &flags);
4045} 3769}
4046EXPORT_SYMBOL(sleep_on); 3770EXPORT_SYMBOL(sleep_on);
4047 3771
4048long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3772long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4049{ 3773{
4050 SLEEP_ON_VAR 3774 unsigned long flags;
3775 wait_queue_t wait;
3776
3777 init_waitqueue_entry(&wait, current);
4051 3778
4052 current->state = TASK_UNINTERRUPTIBLE; 3779 current->state = TASK_UNINTERRUPTIBLE;
4053 3780
4054 SLEEP_ON_HEAD 3781 sleep_on_head(q, &wait, &flags);
4055 timeout = schedule_timeout(timeout); 3782 timeout = schedule_timeout(timeout);
4056 SLEEP_ON_TAIL 3783 sleep_on_tail(q, &wait, &flags);
4057 3784
4058 return timeout; 3785 return timeout;
4059} 3786}
4060
4061EXPORT_SYMBOL(sleep_on_timeout); 3787EXPORT_SYMBOL(sleep_on_timeout);
4062 3788
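For readers who have not met these (long-deprecated) helpers: they park the caller on a wait queue until someone calls wake_up(). A minimal usage sketch follows; the queue and flag names are hypothetical, and the open-coded condition check is inherently racy, which is why wait_event() is normally preferred.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_event_waitq);		/* hypothetical example names */
static int my_event_pending;

static void my_consumer(void)
{
	while (!my_event_pending)			/* racy window between check and sleep */
		interruptible_sleep_on(&my_event_waitq);
	my_event_pending = 0;
}

static void my_producer(void)
{
	my_event_pending = 1;
	wake_up(&my_event_waitq);			/* wakes sleepers added by sleep_on_head() */
}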
4063#ifdef CONFIG_RT_MUTEXES 3789#ifdef CONFIG_RT_MUTEXES
@@ -4074,29 +3800,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4074 */ 3800 */
4075void rt_mutex_setprio(struct task_struct *p, int prio) 3801void rt_mutex_setprio(struct task_struct *p, int prio)
4076{ 3802{
4077 struct prio_array *array;
4078 unsigned long flags; 3803 unsigned long flags;
3804 int oldprio, on_rq;
4079 struct rq *rq; 3805 struct rq *rq;
4080 int oldprio; 3806 u64 now;
4081 3807
4082 BUG_ON(prio < 0 || prio > MAX_PRIO); 3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
4083 3809
4084 rq = task_rq_lock(p, &flags); 3810 rq = task_rq_lock(p, &flags);
3811 now = rq_clock(rq);
4085 3812
4086 oldprio = p->prio; 3813 oldprio = p->prio;
4087 array = p->array; 3814 on_rq = p->se.on_rq;
4088 if (array) 3815 if (on_rq)
4089 dequeue_task(p, array); 3816 dequeue_task(rq, p, 0, now);
3817
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3822
4090 p->prio = prio; 3823 p->prio = prio;
4091 3824
4092 if (array) { 3825 if (on_rq) {
4093 /* 3826 enqueue_task(rq, p, 0, now);
4094 * If changing to an RT priority then queue it
4095 * in the active array!
4096 */
4097 if (rt_task(p))
4098 array = rq->active;
4099 enqueue_task(p, array);
4100 /* 3827 /*
4101 * Reschedule if we are currently running on this runqueue and 3828 * Reschedule if we are currently running on this runqueue and
4102 * our priority decreased, or if we are not currently running on 3829 * our priority decreased, or if we are not currently running on
@@ -4105,8 +3832,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4105 if (task_running(rq, p)) { 3832 if (task_running(rq, p)) {
4106 if (p->prio > oldprio) 3833 if (p->prio > oldprio)
4107 resched_task(rq->curr); 3834 resched_task(rq->curr);
4108 } else if (TASK_PREEMPTS_CURR(p, rq)) 3835 } else {
4109 resched_task(rq->curr); 3836 check_preempt_curr(rq, p);
3837 }
4110 } 3838 }
4111 task_rq_unlock(rq, &flags); 3839 task_rq_unlock(rq, &flags);
4112} 3840}
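The body above is the first instance of a shape that repeats in set_user_nice(), sched_setscheduler() and __migrate_task() below. As a rough template (not literal patch code; the running-task case still uses resched_task() as shown above):

	/* Template for attribute changes after the CFS conversion. */
	on_rq = p->se.on_rq;
	if (on_rq)
		dequeue_task(rq, p, 0, now);	/* take p out of its class's runqueue */

	/* ... change p->prio, p->sched_class, policy or load weight ... */

	if (on_rq) {
		enqueue_task(rq, p, 0, now);	/* reinsert under the new parameters */
		check_preempt_curr(rq, p);	/* let the class decide whether rq->curr should go */
	}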
@@ -4115,10 +3843,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4115 3843
4116void set_user_nice(struct task_struct *p, long nice) 3844void set_user_nice(struct task_struct *p, long nice)
4117{ 3845{
4118 struct prio_array *array; 3846 int old_prio, delta, on_rq;
4119 int old_prio, delta;
4120 unsigned long flags; 3847 unsigned long flags;
4121 struct rq *rq; 3848 struct rq *rq;
3849 u64 now;
4122 3850
4123 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4124 return; 3852 return;
@@ -4127,20 +3855,21 @@ void set_user_nice(struct task_struct *p, long nice)
4127 * the task might be in the middle of scheduling on another CPU. 3855 * the task might be in the middle of scheduling on another CPU.
4128 */ 3856 */
4129 rq = task_rq_lock(p, &flags); 3857 rq = task_rq_lock(p, &flags);
3858 now = rq_clock(rq);
4130 /* 3859 /*
4131 * The RT priorities are set via sched_setscheduler(), but we still 3860 * The RT priorities are set via sched_setscheduler(), but we still
4132 * allow the 'normal' nice value to be set - but as expected 3861 * allow the 'normal' nice value to be set - but as expected
4133 * it won't have any effect on scheduling until the task is 3862 * it won't have any effect on scheduling until the task is
4134 * not SCHED_NORMAL/SCHED_BATCH: 3863 * SCHED_FIFO/SCHED_RR:
4135 */ 3864 */
4136 if (has_rt_policy(p)) { 3865 if (task_has_rt_policy(p)) {
4137 p->static_prio = NICE_TO_PRIO(nice); 3866 p->static_prio = NICE_TO_PRIO(nice);
4138 goto out_unlock; 3867 goto out_unlock;
4139 } 3868 }
4140 array = p->array; 3869 on_rq = p->se.on_rq;
4141 if (array) { 3870 if (on_rq) {
4142 dequeue_task(p, array); 3871 dequeue_task(rq, p, 0, now);
4143 dec_raw_weighted_load(rq, p); 3872 dec_load(rq, p, now);
4144 } 3873 }
4145 3874
4146 p->static_prio = NICE_TO_PRIO(nice); 3875 p->static_prio = NICE_TO_PRIO(nice);
@@ -4149,9 +3878,9 @@ void set_user_nice(struct task_struct *p, long nice)
4149 p->prio = effective_prio(p); 3878 p->prio = effective_prio(p);
4150 delta = p->prio - old_prio; 3879 delta = p->prio - old_prio;
4151 3880
4152 if (array) { 3881 if (on_rq) {
4153 enqueue_task(p, array); 3882 enqueue_task(rq, p, 0, now);
4154 inc_raw_weighted_load(rq, p); 3883 inc_load(rq, p, now);
4155 /* 3884 /*
4156 * If the task increased its priority or is running and 3885 * If the task increased its priority or is running and
4157 * lowered its priority, then reschedule its CPU: 3886 * lowered its priority, then reschedule its CPU:
@@ -4271,20 +4000,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4271} 4000}
4272 4001
4273/* Actually do priority change: must hold rq lock. */ 4002/* Actually do priority change: must hold rq lock. */
4274static void __setscheduler(struct task_struct *p, int policy, int prio) 4003static void
4004__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4275{ 4005{
4276 BUG_ON(p->array); 4006 BUG_ON(p->se.on_rq);
4277 4007
4278 p->policy = policy; 4008 p->policy = policy;
4009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4019 }
4020
4279 p->rt_priority = prio; 4021 p->rt_priority = prio;
4280 p->normal_prio = normal_prio(p); 4022 p->normal_prio = normal_prio(p);
4281 /* we are holding p->pi_lock already */ 4023 /* we are holding p->pi_lock already */
4282 p->prio = rt_mutex_getprio(p); 4024 p->prio = rt_mutex_getprio(p);
4283 /*
4284 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4285 */
4286 if (policy == SCHED_BATCH)
4287 p->sleep_avg = 0;
4288 set_load_weight(p); 4025 set_load_weight(p);
4289} 4026}
4290 4027
@@ -4299,8 +4036,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4299int sched_setscheduler(struct task_struct *p, int policy, 4036int sched_setscheduler(struct task_struct *p, int policy,
4300 struct sched_param *param) 4037 struct sched_param *param)
4301{ 4038{
4302 int retval, oldprio, oldpolicy = -1; 4039 int retval, oldprio, oldpolicy = -1, on_rq;
4303 struct prio_array *array;
4304 unsigned long flags; 4040 unsigned long flags;
4305 struct rq *rq; 4041 struct rq *rq;
4306 4042
@@ -4311,27 +4047,27 @@ recheck:
4311 if (policy < 0) 4047 if (policy < 0)
4312 policy = oldpolicy = p->policy; 4048 policy = oldpolicy = p->policy;
4313 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4314 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
4315 return -EINVAL; 4052 return -EINVAL;
4316 /* 4053 /*
4317 * Valid priorities for SCHED_FIFO and SCHED_RR are 4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
4318 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4319 * SCHED_BATCH is 0. 4056 * SCHED_BATCH and SCHED_IDLE is 0.
4320 */ 4057 */
4321 if (param->sched_priority < 0 || 4058 if (param->sched_priority < 0 ||
4322 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4323 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4324 return -EINVAL; 4061 return -EINVAL;
4325 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4062 if (rt_policy(policy) != (param->sched_priority != 0))
4326 return -EINVAL; 4063 return -EINVAL;
4327 4064
4328 /* 4065 /*
4329 * Allow unprivileged RT tasks to decrease priority: 4066 * Allow unprivileged RT tasks to decrease priority:
4330 */ 4067 */
4331 if (!capable(CAP_SYS_NICE)) { 4068 if (!capable(CAP_SYS_NICE)) {
4332 if (is_rt_policy(policy)) { 4069 if (rt_policy(policy)) {
4333 unsigned long rlim_rtprio; 4070 unsigned long rlim_rtprio;
4334 unsigned long flags;
4335 4071
4336 if (!lock_task_sighand(p, &flags)) 4072 if (!lock_task_sighand(p, &flags))
4337 return -ESRCH; 4073 return -ESRCH;
@@ -4347,6 +4083,12 @@ recheck:
4347 param->sched_priority > rlim_rtprio) 4083 param->sched_priority > rlim_rtprio)
4348 return -EPERM; 4084 return -EPERM;
4349 } 4085 }
4086 /*
4087 * Like positive nice levels, don't allow tasks to
4088 * move out of SCHED_IDLE either:
4089 */
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
4350 4092
4351 /* can't change other user's priorities */ 4093 /* can't change other user's priorities */
4352 if ((current->euid != p->euid) && 4094 if ((current->euid != p->euid) &&
@@ -4374,13 +4116,13 @@ recheck:
4374 spin_unlock_irqrestore(&p->pi_lock, flags); 4116 spin_unlock_irqrestore(&p->pi_lock, flags);
4375 goto recheck; 4117 goto recheck;
4376 } 4118 }
4377 array = p->array; 4119 on_rq = p->se.on_rq;
4378 if (array) 4120 if (on_rq)
4379 deactivate_task(p, rq); 4121 deactivate_task(rq, p, 0);
4380 oldprio = p->prio; 4122 oldprio = p->prio;
4381 __setscheduler(p, policy, param->sched_priority); 4123 __setscheduler(rq, p, policy, param->sched_priority);
4382 if (array) { 4124 if (on_rq) {
4383 __activate_task(p, rq); 4125 activate_task(rq, p, 0);
4384 /* 4126 /*
4385 * Reschedule if we are currently running on this runqueue and 4127 * Reschedule if we are currently running on this runqueue and
4386 * our priority decreased, or if we are not currently running on 4128 * our priority decreased, or if we are not currently running on
@@ -4389,8 +4131,9 @@ recheck:
4389 if (task_running(rq, p)) { 4131 if (task_running(rq, p)) {
4390 if (p->prio > oldprio) 4132 if (p->prio > oldprio)
4391 resched_task(rq->curr); 4133 resched_task(rq->curr);
4392 } else if (TASK_PREEMPTS_CURR(p, rq)) 4134 } else {
4393 resched_task(rq->curr); 4135 check_preempt_curr(rq, p);
4136 }
4394 } 4137 }
4395 __task_rq_unlock(rq); 4138 __task_rq_unlock(rq);
4396 spin_unlock_irqrestore(&p->pi_lock, flags); 4139 spin_unlock_irqrestore(&p->pi_lock, flags);
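Seen from userspace, the validation earlier in this function means an RT policy must come with a priority in 1..MAX_USER_RT_PRIO-1, while SCHED_NORMAL, SCHED_BATCH and the new SCHED_IDLE must come with 0. A hedged illustration using the standard syscall wrapper (run with CAP_SYS_NICE for the RT case):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };

	/* RT policies need a non-zero priority (1..99 for user tasks). */
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("SCHED_FIFO");

	/* Non-RT policies must pass sched_priority == 0 or get -EINVAL. */
	sp.sched_priority = 0;
	if (sched_setscheduler(0, SCHED_OTHER, &sp))
		perror("SCHED_OTHER");

	return 0;
}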
@@ -4662,41 +4405,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4662/** 4405/**
4663 * sys_sched_yield - yield the current processor to other threads. 4406 * sys_sched_yield - yield the current processor to other threads.
4664 * 4407 *
4665 * This function yields the current CPU by moving the calling thread 4408 * This function yields the current CPU to other tasks. If there are no
4666 * to the expired array. If there are no other threads running on this 4409 * other threads running on this CPU then this function will return.
4667 * CPU then this function will return.
4668 */ 4410 */
4669asmlinkage long sys_sched_yield(void) 4411asmlinkage long sys_sched_yield(void)
4670{ 4412{
4671 struct rq *rq = this_rq_lock(); 4413 struct rq *rq = this_rq_lock();
4672 struct prio_array *array = current->array, *target = rq->expired;
4673 4414
4674 schedstat_inc(rq, yld_cnt); 4415 schedstat_inc(rq, yld_cnt);
4675 /* 4416 if (unlikely(rq->nr_running == 1))
4676 * We implement yielding by moving the task into the expired
4677 * queue.
4678 *
4679 * (special rule: RT tasks will just roundrobin in the active
4680 * array.)
4681 */
4682 if (rt_task(current))
4683 target = rq->active;
4684
4685 if (array->nr_active == 1) {
4686 schedstat_inc(rq, yld_act_empty); 4417 schedstat_inc(rq, yld_act_empty);
4687 if (!rq->expired->nr_active) 4418 else
4688 schedstat_inc(rq, yld_both_empty); 4419 current->sched_class->yield_task(rq, current);
4689 } else if (!rq->expired->nr_active)
4690 schedstat_inc(rq, yld_exp_empty);
4691
4692 if (array != target) {
4693 dequeue_task(current, array);
4694 enqueue_task(current, target);
4695 } else
4696 /*
4697 * requeue_task is cheaper so perform that if possible.
4698 */
4699 requeue_task(current, array);
4700 4420
4701 /* 4421 /*
4702 * Since we are going to call schedule() anyway, there's 4422 * Since we are going to call schedule() anyway, there's
@@ -4847,6 +4567,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4847 break; 4567 break;
4848 case SCHED_NORMAL: 4568 case SCHED_NORMAL:
4849 case SCHED_BATCH: 4569 case SCHED_BATCH:
4570 case SCHED_IDLE:
4850 ret = 0; 4571 ret = 0;
4851 break; 4572 break;
4852 } 4573 }
@@ -4871,6 +4592,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4871 break; 4592 break;
4872 case SCHED_NORMAL: 4593 case SCHED_NORMAL:
4873 case SCHED_BATCH: 4594 case SCHED_BATCH:
4595 case SCHED_IDLE:
4874 ret = 0; 4596 ret = 0;
4875 } 4597 }
4876 return ret; 4598 return ret;
@@ -4905,7 +4627,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4905 goto out_unlock; 4627 goto out_unlock;
4906 4628
4907 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4908 0 : task_timeslice(p), &t); 4630 0 : static_prio_timeslice(p->static_prio), &t);
4909 read_unlock(&tasklist_lock); 4631 read_unlock(&tasklist_lock);
4910 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4911out_nounlock: 4633out_nounlock:
@@ -4925,14 +4647,14 @@ static void show_task(struct task_struct *p)
4925 state = p->state ? __ffs(p->state) + 1 : 0; 4647 state = p->state ? __ffs(p->state) + 1 : 0;
4926 printk("%-13.13s %c", p->comm, 4648 printk("%-13.13s %c", p->comm,
4927 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4649 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4928#if (BITS_PER_LONG == 32) 4650#if BITS_PER_LONG == 32
4929 if (state == TASK_RUNNING) 4651 if (state == TASK_RUNNING)
4930 printk(" running "); 4652 printk(" running ");
4931 else 4653 else
4932 printk(" %08lX ", thread_saved_pc(p)); 4654 printk(" %08lx ", thread_saved_pc(p));
4933#else 4655#else
4934 if (state == TASK_RUNNING) 4656 if (state == TASK_RUNNING)
4935 printk(" running task "); 4657 printk(" running task ");
4936 else 4658 else
4937 printk(" %016lx ", thread_saved_pc(p)); 4659 printk(" %016lx ", thread_saved_pc(p));
4938#endif 4660#endif
@@ -4944,11 +4666,7 @@ static void show_task(struct task_struct *p)
4944 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4666 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4945 } 4667 }
4946#endif 4668#endif
4947 printk("%5lu %5d %6d", free, p->pid, p->parent->pid); 4669 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4948 if (!p->mm)
4949 printk(" (L-TLB)\n");
4950 else
4951 printk(" (NOTLB)\n");
4952 4670
4953 if (state != TASK_RUNNING) 4671 if (state != TASK_RUNNING)
4954 show_stack(p, NULL); 4672 show_stack(p, NULL);
@@ -4958,14 +4676,12 @@ void show_state_filter(unsigned long state_filter)
4958{ 4676{
4959 struct task_struct *g, *p; 4677 struct task_struct *g, *p;
4960 4678
4961#if (BITS_PER_LONG == 32) 4679#if BITS_PER_LONG == 32
4962 printk("\n" 4680 printk(KERN_INFO
4963 " free sibling\n"); 4681 " task PC stack pid father\n");
4964 printk(" task PC stack pid father child younger older\n");
4965#else 4682#else
4966 printk("\n" 4683 printk(KERN_INFO
4967 " free sibling\n"); 4684 " task PC stack pid father\n");
4968 printk(" task PC stack pid father child younger older\n");
4969#endif 4685#endif
4970 read_lock(&tasklist_lock); 4686 read_lock(&tasklist_lock);
4971 do_each_thread(g, p) { 4687 do_each_thread(g, p) {
@@ -4980,6 +4696,9 @@ void show_state_filter(unsigned long state_filter)
4980 4696
4981 touch_all_softlockup_watchdogs(); 4697 touch_all_softlockup_watchdogs();
4982 4698
4699#ifdef CONFIG_SCHED_DEBUG
4700 sysrq_sched_debug_show();
4701#endif
4983 read_unlock(&tasklist_lock); 4702 read_unlock(&tasklist_lock);
4984 /* 4703 /*
4985 * Only show locks if all tasks are dumped: 4704 * Only show locks if all tasks are dumped:
@@ -4988,6 +4707,11 @@ void show_state_filter(unsigned long state_filter)
4988 debug_show_all_locks(); 4707 debug_show_all_locks();
4989} 4708}
4990 4709
4710void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4711{
4712 idle->sched_class = &idle_sched_class;
4713}
4714
4991/** 4715/**
4992 * init_idle - set up an idle thread for a given CPU 4716 * init_idle - set up an idle thread for a given CPU
4993 * @idle: task in question 4717 * @idle: task in question
@@ -5001,13 +4725,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5001 struct rq *rq = cpu_rq(cpu); 4725 struct rq *rq = cpu_rq(cpu);
5002 unsigned long flags; 4726 unsigned long flags;
5003 4727
5004 idle->timestamp = sched_clock(); 4728 __sched_fork(idle);
5005 idle->sleep_avg = 0; 4729 idle->se.exec_start = sched_clock();
5006 idle->array = NULL; 4730
5007 idle->prio = idle->normal_prio = MAX_PRIO; 4731 idle->prio = idle->normal_prio = MAX_PRIO;
5008 idle->state = TASK_RUNNING;
5009 idle->cpus_allowed = cpumask_of_cpu(cpu); 4732 idle->cpus_allowed = cpumask_of_cpu(cpu);
5010 set_task_cpu(idle, cpu); 4733 __set_task_cpu(idle, cpu);
5011 4734
5012 spin_lock_irqsave(&rq->lock, flags); 4735 spin_lock_irqsave(&rq->lock, flags);
5013 rq->curr = rq->idle = idle; 4736 rq->curr = rq->idle = idle;
@@ -5022,6 +4745,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5022#else 4745#else
5023 task_thread_info(idle)->preempt_count = 0; 4746 task_thread_info(idle)->preempt_count = 0;
5024#endif 4747#endif
4748 /*
4749 * The idle tasks have their own, simple scheduling class:
4750 */
4751 idle->sched_class = &idle_sched_class;
5025} 4752}
5026 4753
5027/* 4754/*
@@ -5033,6 +4760,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5033 */ 4760 */
5034cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4761cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5035 4762
4763/*
4764 * Increase the granularity value when there are more CPUs,
4765 * because with more CPUs the 'effective latency' as visible
4766 * to users decreases. But the relationship is not linear,
4767 * so pick a second-best guess by going with the log2 of the
4768 * number of CPUs.
4769 *
4770 * This idea comes from the SD scheduler of Con Kolivas:
4771 */
4772static inline void sched_init_granularity(void)
4773{
4774 unsigned int factor = 1 + ilog2(num_online_cpus());
4775 const unsigned long gran_limit = 100000000;
4776
4777 sysctl_sched_granularity *= factor;
4778 if (sysctl_sched_granularity > gran_limit)
4779 sysctl_sched_granularity = gran_limit;
4780
4781 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4782 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4783}
4784
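To make the scaling concrete, a worked example; the compiled-in default granularity is written as g here rather than asserted:

/*
 *   1 CPU  : factor = 1 + ilog2(1)  = 1  ->  granularity = 1 * g
 *   2 CPUs : factor = 1 + ilog2(2)  = 2  ->  granularity = 2 * g
 *   16 CPUs: factor = 1 + ilog2(16) = 5  ->  granularity = 5 * g
 *
 * always capped at gran_limit (100000000 ns = 100 ms), after which
 * sysctl_sched_runtime_limit becomes 4x the scaled value and
 * sysctl_sched_wakeup_granularity half of it.
 */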
5036#ifdef CONFIG_SMP 4785#ifdef CONFIG_SMP
5037/* 4786/*
5038 * This is how migration works: 4787 * This is how migration works:
@@ -5106,7 +4855,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5106static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4855static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5107{ 4856{
5108 struct rq *rq_dest, *rq_src; 4857 struct rq *rq_dest, *rq_src;
5109 int ret = 0; 4858 int ret = 0, on_rq;
5110 4859
5111 if (unlikely(cpu_is_offline(dest_cpu))) 4860 if (unlikely(cpu_is_offline(dest_cpu)))
5112 return ret; 4861 return ret;
@@ -5122,20 +4871,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5122 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4871 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5123 goto out; 4872 goto out;
5124 4873
4874 on_rq = p->se.on_rq;
4875 if (on_rq)
4876 deactivate_task(rq_src, p, 0);
5125 set_task_cpu(p, dest_cpu); 4877 set_task_cpu(p, dest_cpu);
5126 if (p->array) { 4878 if (on_rq) {
5127 /* 4879 activate_task(rq_dest, p, 0);
5128 * Sync timestamp with rq_dest's before activating. 4880 check_preempt_curr(rq_dest, p);
5129 * The same thing could be achieved by doing this step
5130 * afterwards, and pretending it was a local activate.
5131 * This way is cleaner and logically correct.
5132 */
5133 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5134 + rq_dest->most_recent_timestamp;
5135 deactivate_task(p, rq_src);
5136 __activate_task(p, rq_dest);
5137 if (TASK_PREEMPTS_CURR(p, rq_dest))
5138 resched_task(rq_dest->curr);
5139 } 4881 }
5140 ret = 1; 4882 ret = 1;
5141out: 4883out:
@@ -5287,7 +5029,8 @@ static void migrate_live_tasks(int src_cpu)
5287 write_unlock_irq(&tasklist_lock); 5029 write_unlock_irq(&tasklist_lock);
5288} 5030}
5289 5031
5290/* Schedules idle task to be the next runnable task on current CPU. 5032/*
5033 * Schedules idle task to be the next runnable task on current CPU.
5291 * It does so by boosting its priority to highest possible and adding it to 5034 * It does so by boosting its priority to highest possible and adding it to
5292 * the _front_ of the runqueue. Used by CPU offline code. 5035 * the _front_ of the runqueue. Used by CPU offline code.
5293 */ 5036 */
@@ -5307,10 +5050,10 @@ void sched_idle_next(void)
5307 */ 5050 */
5308 spin_lock_irqsave(&rq->lock, flags); 5051 spin_lock_irqsave(&rq->lock, flags);
5309 5052
5310 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5053 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5311 5054
5312 /* Add idle task to the _front_ of its priority queue: */ 5055 /* Add idle task to the _front_ of its priority queue: */
5313 __activate_idle_task(p, rq); 5056 activate_idle_task(p, rq);
5314 5057
5315 spin_unlock_irqrestore(&rq->lock, flags); 5058 spin_unlock_irqrestore(&rq->lock, flags);
5316} 5059}
@@ -5360,16 +5103,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5360static void migrate_dead_tasks(unsigned int dead_cpu) 5103static void migrate_dead_tasks(unsigned int dead_cpu)
5361{ 5104{
5362 struct rq *rq = cpu_rq(dead_cpu); 5105 struct rq *rq = cpu_rq(dead_cpu);
5363 unsigned int arr, i; 5106 struct task_struct *next;
5364 5107
5365 for (arr = 0; arr < 2; arr++) { 5108 for ( ; ; ) {
5366 for (i = 0; i < MAX_PRIO; i++) { 5109 if (!rq->nr_running)
5367 struct list_head *list = &rq->arrays[arr].queue[i]; 5110 break;
5368 5111 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5369 while (!list_empty(list)) 5112 if (!next)
5370 migrate_dead(dead_cpu, list_entry(list->next, 5113 break;
5371 struct task_struct, run_list)); 5114 migrate_dead(dead_cpu, next);
5372 }
5373 } 5115 }
5374} 5116}
5375#endif /* CONFIG_HOTPLUG_CPU */ 5117#endif /* CONFIG_HOTPLUG_CPU */
@@ -5393,14 +5135,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5393 5135
5394 case CPU_UP_PREPARE: 5136 case CPU_UP_PREPARE:
5395 case CPU_UP_PREPARE_FROZEN: 5137 case CPU_UP_PREPARE_FROZEN:
5396 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5138 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5397 if (IS_ERR(p)) 5139 if (IS_ERR(p))
5398 return NOTIFY_BAD; 5140 return NOTIFY_BAD;
5399 p->flags |= PF_NOFREEZE; 5141 p->flags |= PF_NOFREEZE;
5400 kthread_bind(p, cpu); 5142 kthread_bind(p, cpu);
5401 /* Must be high prio: stop_machine expects to yield to it. */ 5143 /* Must be high prio: stop_machine expects to yield to it. */
5402 rq = task_rq_lock(p, &flags); 5144 rq = task_rq_lock(p, &flags);
5403 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5145 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5404 task_rq_unlock(rq, &flags); 5146 task_rq_unlock(rq, &flags);
5405 cpu_rq(cpu)->migration_thread = p; 5147 cpu_rq(cpu)->migration_thread = p;
5406 break; 5148 break;
@@ -5431,9 +5173,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5431 rq->migration_thread = NULL; 5173 rq->migration_thread = NULL;
5432 /* Idle task back to normal (off runqueue, low prio) */ 5174 /* Idle task back to normal (off runqueue, low prio) */
5433 rq = task_rq_lock(rq->idle, &flags); 5175 rq = task_rq_lock(rq->idle, &flags);
5434 deactivate_task(rq->idle, rq); 5176 deactivate_task(rq, rq->idle, 0);
5435 rq->idle->static_prio = MAX_PRIO; 5177 rq->idle->static_prio = MAX_PRIO;
5436 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5178 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5179 rq->idle->sched_class = &idle_sched_class;
5437 migrate_dead_tasks(cpu); 5180 migrate_dead_tasks(cpu);
5438 task_rq_unlock(rq, &flags); 5181 task_rq_unlock(rq, &flags);
5439 migrate_nr_uninterruptible(rq); 5182 migrate_nr_uninterruptible(rq);
@@ -5742,483 +5485,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5742 5485
5743#define SD_NODES_PER_DOMAIN 16 5486#define SD_NODES_PER_DOMAIN 16
5744 5487
5745/*
5746 * Self-tuning task migration cost measurement between source and target CPUs.
5747 *
5748 * This is done by measuring the cost of manipulating buffers of varying
5749 * sizes. For a given buffer-size here are the steps that are taken:
5750 *
5751 * 1) the source CPU reads+dirties a shared buffer
5752 * 2) the target CPU reads+dirties the same shared buffer
5753 *
5754 * We measure how long they take, in the following 4 scenarios:
5755 *
5756 * - source: CPU1, target: CPU2 | cost1
5757 * - source: CPU2, target: CPU1 | cost2
5758 * - source: CPU1, target: CPU1 | cost3
5759 * - source: CPU2, target: CPU2 | cost4
5760 *
5761 * We then calculate the cost3+cost4-cost1-cost2 difference - this is
5762 * the cost of migration.
5763 *
5764 * We then start off from a small buffer-size and iterate up to larger
5765 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5766 * doing a maximum search for the cost. (The maximum cost for a migration
5767 * normally occurs when the working set size is around the effective cache
5768 * size.)
5769 */
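A worked example with made-up timings may help; the real numbers are measured at boot and differ per machine:

/*
 * For one buffer size, suppose the four timed passes come out as:
 *
 *	CPU1 -> CPU2: 900 us	CPU2 -> CPU1: 950 us	(cross-CPU, cache cold)
 *	CPU1 -> CPU1: 400 us	CPU2 -> CPU2: 420 us	(same CPU, cache hot)
 *
 * measure_cost() below sums the cross-CPU and the same-CPU passes
 * separately and returns their difference:
 *	(900 + 950) - (400 + 420) = 1030 us
 * of estimated migration cost at that working-set size.  The maximum
 * over all sizes, doubled and scaled by migration_factor, is what
 * measure_migration_cost() finally hands to sd->cache_hot_time.
 */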
5770#define SEARCH_SCOPE 2
5771#define MIN_CACHE_SIZE (64*1024U)
5772#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5773#define ITERATIONS 1
5774#define SIZE_THRESH 130
5775#define COST_THRESH 130
5776
5777/*
5778 * The migration cost is a function of 'domain distance'. Domain
5779 * distance is the number of steps a CPU has to iterate down its
5780 * domain tree to share a domain with the other CPU. The farther
5781 * two CPUs are from each other, the larger the distance gets.
5782 *
5783 * Note that we use the distance only to cache measurement results,
5784 * the distance value is not used numerically otherwise. When two
5785 * CPUs have the same distance it is assumed that the migration
5786 * cost is the same. (this is a simplification but quite practical)
5787 */
5788#define MAX_DOMAIN_DISTANCE 32
5789
5790static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5791 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5792/*
5793 * Architectures may override the migration cost and thus avoid
5794 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5795 * virtualized hardware:
5796 */
5797#ifdef CONFIG_DEFAULT_MIGRATION_COST
5798 CONFIG_DEFAULT_MIGRATION_COST
5799#else
5800 -1LL
5801#endif
5802};
5803
5804/*
5805 * Allow override of migration cost - in units of microseconds.
5806 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5807 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5808 */
5809static int __init migration_cost_setup(char *str)
5810{
5811 int ints[MAX_DOMAIN_DISTANCE+1], i;
5812
5813 str = get_options(str, ARRAY_SIZE(ints), ints);
5814
5815 printk("#ints: %d\n", ints[0]);
5816 for (i = 1; i <= ints[0]; i++) {
5817 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5818 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5819 }
5820 return 1;
5821}
5822
5823__setup ("migration_cost=", migration_cost_setup);
5824
5825/*
5826 * Global multiplier (divisor) for migration-cutoff values,
5827 * in percentiles. E.g. use a value of 150 to get 1.5 times
5828 * longer cache-hot cutoff times.
5829 *
5830 * (We scale it from 100 to 128 to make long long handling easier.)
5831 */
5832
5833#define MIGRATION_FACTOR_SCALE 128
5834
5835static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5836
5837static int __init setup_migration_factor(char *str)
5838{
5839 get_option(&str, &migration_factor);
5840 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5841 return 1;
5842}
5843
5844__setup("migration_factor=", setup_migration_factor);
5845
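A quick arithmetic check of the percentile scaling described above:

/*
 * Booting with migration_factor=150 stores 150 * 128 / 100 = 192, so
 * the cutoff returned by measure_migration_cost() becomes
 * 2 * max_cost * 192 / 128 = 3 * max_cost, i.e. 1.5 times the default
 * 2 * max_cost cache-hot threshold.
 */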
5846/*
5847 * Estimated distance of two CPUs, measured via the number of domains
5848 * we have to pass for the two CPUs to be in the same span:
5849 */
5850static unsigned long domain_distance(int cpu1, int cpu2)
5851{
5852 unsigned long distance = 0;
5853 struct sched_domain *sd;
5854
5855 for_each_domain(cpu1, sd) {
5856 WARN_ON(!cpu_isset(cpu1, sd->span));
5857 if (cpu_isset(cpu2, sd->span))
5858 return distance;
5859 distance++;
5860 }
5861 if (distance >= MAX_DOMAIN_DISTANCE) {
5862 WARN_ON(1);
5863 distance = MAX_DOMAIN_DISTANCE-1;
5864 }
5865
5866 return distance;
5867}
5868
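As an illustration of the metric just defined, assuming a typical SMT + multi-core + NUMA domain stack (hypothetical topology, not measured):

/*
 * domain_distance(cpu1, cpu2) returns 0 when cpu2 is an SMT sibling of
 * cpu1 (already inside the lowest domain's span), 1 when it is another
 * core in the same package, and 2 or more when it lives on another
 * node: every level walked before cpu2 appears in sd->span adds one.
 */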
5869static unsigned int migration_debug;
5870
5871static int __init setup_migration_debug(char *str)
5872{
5873 get_option(&str, &migration_debug);
5874 return 1;
5875}
5876
5877__setup("migration_debug=", setup_migration_debug);
5878
5879/*
5880 * Maximum cache-size that the scheduler should try to measure.
5881 * Architectures with larger caches should tune this up during
5882 * bootup. Gets used in the domain-setup code (i.e. during SMP
5883 * bootup).
5884 */
5885unsigned int max_cache_size;
5886
5887static int __init setup_max_cache_size(char *str)
5888{
5889 get_option(&str, &max_cache_size);
5890 return 1;
5891}
5892
5893__setup("max_cache_size=", setup_max_cache_size);
5894
5895/*
5896 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5897 * is the operation that is timed, so we try to generate unpredictable
5898 * cachemisses that still end up filling the L2 cache:
5899 */
5900static void touch_cache(void *__cache, unsigned long __size)
5901{
5902 unsigned long size = __size / sizeof(long);
5903 unsigned long chunk1 = size / 3;
5904 unsigned long chunk2 = 2 * size / 3;
5905 unsigned long *cache = __cache;
5906 int i;
5907
5908 for (i = 0; i < size/6; i += 8) {
5909 switch (i % 6) {
5910 case 0: cache[i]++;
5911 case 1: cache[size-1-i]++;
5912 case 2: cache[chunk1-i]++;
5913 case 3: cache[chunk1+i]++;
5914 case 4: cache[chunk2-i]++;
5915 case 5: cache[chunk2+i]++;
5916 }
5917 }
5918}
5919
5920/*
5921 * Measure the cache-cost of one task migration. Returns in units of nsec.
5922 */
5923static unsigned long long
5924measure_one(void *cache, unsigned long size, int source, int target)
5925{
5926 cpumask_t mask, saved_mask;
5927 unsigned long long t0, t1, t2, t3, cost;
5928
5929 saved_mask = current->cpus_allowed;
5930
5931 /*
5932 * Flush source caches to RAM and invalidate them:
5933 */
5934 sched_cacheflush();
5935
5936 /*
5937 * Migrate to the source CPU:
5938 */
5939 mask = cpumask_of_cpu(source);
5940 set_cpus_allowed(current, mask);
5941 WARN_ON(smp_processor_id() != source);
5942
5943 /*
5944 * Dirty the working set:
5945 */
5946 t0 = sched_clock();
5947 touch_cache(cache, size);
5948 t1 = sched_clock();
5949
5950 /*
5951 * Migrate to the target CPU, dirty the L2 cache and access
5952 * the shared buffer. (which represents the working set
5953 * of a migrated task.)
5954 */
5955 mask = cpumask_of_cpu(target);
5956 set_cpus_allowed(current, mask);
5957 WARN_ON(smp_processor_id() != target);
5958
5959 t2 = sched_clock();
5960 touch_cache(cache, size);
5961 t3 = sched_clock();
5962
5963 cost = t1-t0 + t3-t2;
5964
5965 if (migration_debug >= 2)
5966 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5967 source, target, t1-t0, t1-t0, t3-t2, cost);
5968 /*
5969 * Flush target caches to RAM and invalidate them:
5970 */
5971 sched_cacheflush();
5972
5973 set_cpus_allowed(current, saved_mask);
5974
5975 return cost;
5976}
5977
5978/*
5979 * Measure a series of task migrations and return the average
5980 * result. Since this code runs early during bootup the system
5981 * is 'undisturbed' and the average latency makes sense.
5982 *
5983 * The algorithm in essence auto-detects the relevant cache-size,
5984 * so it will properly detect different cachesizes for different
5985 * cache-hierarchies, depending on how the CPUs are connected.
5986 *
5987 * Architectures can prime the upper limit of the search range via
5988 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5989 */
5990static unsigned long long
5991measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5992{
5993 unsigned long long cost1, cost2;
5994 int i;
5995
5996 /*
5997 * Measure the migration cost of 'size' bytes, over an
5998 * average of 10 runs:
5999 *
6000 * (We perturb the cache size by a small (0..4k)
6001 * value to compensate size/alignment related artifacts.
6002 * We also subtract the cost of the operation done on
6003 * the same CPU.)
6004 */
6005 cost1 = 0;
6006
6007 /*
6008 * dry run, to make sure we start off cache-cold on cpu1,
6009 * and to get any vmalloc pagefaults in advance:
6010 */
6011 measure_one(cache, size, cpu1, cpu2);
6012 for (i = 0; i < ITERATIONS; i++)
6013 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6014
6015 measure_one(cache, size, cpu2, cpu1);
6016 for (i = 0; i < ITERATIONS; i++)
6017 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6018
6019 /*
6020 * (We measure the non-migrating [cached] cost on both
6021 * cpu1 and cpu2, to handle CPUs with different speeds)
6022 */
6023 cost2 = 0;
6024
6025 measure_one(cache, size, cpu1, cpu1);
6026 for (i = 0; i < ITERATIONS; i++)
6027 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6028
6029 measure_one(cache, size, cpu2, cpu2);
6030 for (i = 0; i < ITERATIONS; i++)
6031 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6032
6033 /*
6034 * Get the per-iteration migration cost:
6035 */
6036 do_div(cost1, 2 * ITERATIONS);
6037 do_div(cost2, 2 * ITERATIONS);
6038
6039 return cost1 - cost2;
6040}
6041
6042static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6043{
6044 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6045 unsigned int max_size, size, size_found = 0;
6046 long long cost = 0, prev_cost;
6047 void *cache;
6048
6049 /*
6050 * Search from max_cache_size*5 down to 64K - the real relevant
6051 * cachesize has to lie somewhere in between.
6052 */
6053 if (max_cache_size) {
6054 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6055 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6056 } else {
6057 /*
6058 * Since we have no estimation about the relevant
6059 * search range
6060 */
6061 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6062 size = MIN_CACHE_SIZE;
6063 }
6064
6065 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6066 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6067 return 0;
6068 }
6069
6070 /*
6071 * Allocate the working set:
6072 */
6073 cache = vmalloc(max_size);
6074 if (!cache) {
6075 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6076 return 1000000; /* return 1 msec on very small boxen */
6077 }
6078
6079 while (size <= max_size) {
6080 prev_cost = cost;
6081 cost = measure_cost(cpu1, cpu2, cache, size);
6082
6083 /*
6084 * Update the max:
6085 */
6086 if (cost > 0) {
6087 if (max_cost < cost) {
6088 max_cost = cost;
6089 size_found = size;
6090 }
6091 }
6092 /*
6093 * Calculate average fluctuation, we use this to prevent
6094 * noise from triggering an early break out of the loop:
6095 */
6096 fluct = abs(cost - prev_cost);
6097 avg_fluct = (avg_fluct + fluct)/2;
6098
6099 if (migration_debug)
6100 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6101 "(%8Ld %8Ld)\n",
6102 cpu1, cpu2, size,
6103 (long)cost / 1000000,
6104 ((long)cost / 100000) % 10,
6105 (long)max_cost / 1000000,
6106 ((long)max_cost / 100000) % 10,
6107 domain_distance(cpu1, cpu2),
6108 cost, avg_fluct);
6109
6110 /*
6111 * If we iterated at least 20% past the previous maximum,
6112 * and the cost has dropped by more than 20% already,
6113 * (taking fluctuations into account) then we assume to
6114 * have found the maximum and break out of the loop early:
6115 */
6116 if (size_found && (size*100 > size_found*SIZE_THRESH))
6117 if (cost+avg_fluct <= 0 ||
6118 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6119
6120 if (migration_debug)
6121 printk("-> found max.\n");
6122 break;
6123 }
6124 /*
6125 * Increase the cachesize in 10% steps:
6126 */
6127 size = size * 10 / 9;
6128 }
6129
6130 if (migration_debug)
6131 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6132 cpu1, cpu2, size_found, max_cost);
6133
6134 vfree(cache);
6135
6136 /*
6137 * A task is considered 'cache cold' if at least 2 times
6138 * the worst-case cost of migration has passed.
6139 *
6140 * (this limit is only listened to if the load-balancing
6141 * situation is 'nice' - if there is a large imbalance we
6142 * ignore it for the sake of CPU utilization and
6143 * processing fairness.)
6144 */
6145 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6146}
6147
6148static void calibrate_migration_costs(const cpumask_t *cpu_map)
6149{
6150 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6151 unsigned long j0, j1, distance, max_distance = 0;
6152 struct sched_domain *sd;
6153
6154 j0 = jiffies;
6155
6156 /*
6157 * First pass - calculate the cacheflush times:
6158 */
6159 for_each_cpu_mask(cpu1, *cpu_map) {
6160 for_each_cpu_mask(cpu2, *cpu_map) {
6161 if (cpu1 == cpu2)
6162 continue;
6163 distance = domain_distance(cpu1, cpu2);
6164 max_distance = max(max_distance, distance);
6165 /*
6166 * No result cached yet?
6167 */
6168 if (migration_cost[distance] == -1LL)
6169 migration_cost[distance] =
6170 measure_migration_cost(cpu1, cpu2);
6171 }
6172 }
6173 /*
6174 * Second pass - update the sched domain hierarchy with
6175 * the new cache-hot-time estimations:
6176 */
6177 for_each_cpu_mask(cpu, *cpu_map) {
6178 distance = 0;
6179 for_each_domain(cpu, sd) {
6180 sd->cache_hot_time = migration_cost[distance];
6181 distance++;
6182 }
6183 }
6184 /*
6185 * Print the matrix:
6186 */
6187 if (migration_debug)
6188 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6189 max_cache_size,
6190#ifdef CONFIG_X86
6191 cpu_khz/1000
6192#else
6193 -1
6194#endif
6195 );
6196 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6197 printk("migration_cost=");
6198 for (distance = 0; distance <= max_distance; distance++) {
6199 if (distance)
6200 printk(",");
6201 printk("%ld", (long)migration_cost[distance] / 1000);
6202 }
6203 printk("\n");
6204 }
6205 j1 = jiffies;
6206 if (migration_debug)
6207 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6208
6209 /*
6210 * Move back to the original CPU. NUMA-Q gets confused
6211 * if we migrate to another quad during bootup.
6212 */
6213 if (raw_smp_processor_id() != orig_cpu) {
6214 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6215 saved_mask = current->cpus_allowed;
6216
6217 set_cpus_allowed(current, mask);
6218 set_cpus_allowed(current, saved_mask);
6219 }
6220}
6221
6222#ifdef CONFIG_NUMA 5488#ifdef CONFIG_NUMA
6223 5489
6224/** 5490/**
@@ -6519,7 +5785,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6519static int build_sched_domains(const cpumask_t *cpu_map) 5785static int build_sched_domains(const cpumask_t *cpu_map)
6520{ 5786{
6521 int i; 5787 int i;
6522 struct sched_domain *sd;
6523#ifdef CONFIG_NUMA 5788#ifdef CONFIG_NUMA
6524 struct sched_group **sched_group_nodes = NULL; 5789 struct sched_group **sched_group_nodes = NULL;
6525 int sd_allnodes = 0; 5790 int sd_allnodes = 0;
@@ -6527,7 +5792,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6527 /* 5792 /*
6528 * Allocate the per-node list of sched groups 5793 * Allocate the per-node list of sched groups
6529 */ 5794 */
6530 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5795 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6531 GFP_KERNEL); 5796 GFP_KERNEL);
6532 if (!sched_group_nodes) { 5797 if (!sched_group_nodes) {
6533 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5798 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6546,8 +5811,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6546 cpus_and(nodemask, nodemask, *cpu_map); 5811 cpus_and(nodemask, nodemask, *cpu_map);
6547 5812
6548#ifdef CONFIG_NUMA 5813#ifdef CONFIG_NUMA
6549 if (cpus_weight(*cpu_map) 5814 if (cpus_weight(*cpu_map) >
6550 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5815 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6551 sd = &per_cpu(allnodes_domains, i); 5816 sd = &per_cpu(allnodes_domains, i);
6552 *sd = SD_ALLNODES_INIT; 5817 *sd = SD_ALLNODES_INIT;
6553 sd->span = *cpu_map; 5818 sd->span = *cpu_map;
@@ -6606,7 +5871,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6606 if (i != first_cpu(this_sibling_map)) 5871 if (i != first_cpu(this_sibling_map))
6607 continue; 5872 continue;
6608 5873
6609 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5874 init_sched_build_groups(this_sibling_map, cpu_map,
5875 &cpu_to_cpu_group);
6610 } 5876 }
6611#endif 5877#endif
6612 5878
@@ -6617,11 +5883,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6617 cpus_and(this_core_map, this_core_map, *cpu_map); 5883 cpus_and(this_core_map, this_core_map, *cpu_map);
6618 if (i != first_cpu(this_core_map)) 5884 if (i != first_cpu(this_core_map))
6619 continue; 5885 continue;
6620 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5886 init_sched_build_groups(this_core_map, cpu_map,
5887 &cpu_to_core_group);
6621 } 5888 }
6622#endif 5889#endif
6623 5890
6624
6625 /* Set up physical groups */ 5891 /* Set up physical groups */
6626 for (i = 0; i < MAX_NUMNODES; i++) { 5892 for (i = 0; i < MAX_NUMNODES; i++) {
6627 cpumask_t nodemask = node_to_cpumask(i); 5893 cpumask_t nodemask = node_to_cpumask(i);
@@ -6636,7 +5902,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6636#ifdef CONFIG_NUMA 5902#ifdef CONFIG_NUMA
6637 /* Set up node groups */ 5903 /* Set up node groups */
6638 if (sd_allnodes) 5904 if (sd_allnodes)
6639 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5905 init_sched_build_groups(*cpu_map, cpu_map,
5906 &cpu_to_allnodes_group);
6640 5907
6641 for (i = 0; i < MAX_NUMNODES; i++) { 5908 for (i = 0; i < MAX_NUMNODES; i++) {
6642 /* Set up node groups */ 5909 /* Set up node groups */
@@ -6664,6 +5931,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6664 sched_group_nodes[i] = sg; 5931 sched_group_nodes[i] = sg;
6665 for_each_cpu_mask(j, nodemask) { 5932 for_each_cpu_mask(j, nodemask) {
6666 struct sched_domain *sd; 5933 struct sched_domain *sd;
5934
6667 sd = &per_cpu(node_domains, j); 5935 sd = &per_cpu(node_domains, j);
6668 sd->groups = sg; 5936 sd->groups = sg;
6669 } 5937 }
@@ -6708,19 +5976,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6708 /* Calculate CPU power for physical packages and nodes */ 5976 /* Calculate CPU power for physical packages and nodes */
6709#ifdef CONFIG_SCHED_SMT 5977#ifdef CONFIG_SCHED_SMT
6710 for_each_cpu_mask(i, *cpu_map) { 5978 for_each_cpu_mask(i, *cpu_map) {
6711 sd = &per_cpu(cpu_domains, i); 5979 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5980
6712 init_sched_groups_power(i, sd); 5981 init_sched_groups_power(i, sd);
6713 } 5982 }
6714#endif 5983#endif
6715#ifdef CONFIG_SCHED_MC 5984#ifdef CONFIG_SCHED_MC
6716 for_each_cpu_mask(i, *cpu_map) { 5985 for_each_cpu_mask(i, *cpu_map) {
6717 sd = &per_cpu(core_domains, i); 5986 struct sched_domain *sd = &per_cpu(core_domains, i);
5987
6718 init_sched_groups_power(i, sd); 5988 init_sched_groups_power(i, sd);
6719 } 5989 }
6720#endif 5990#endif
6721 5991
6722 for_each_cpu_mask(i, *cpu_map) { 5992 for_each_cpu_mask(i, *cpu_map) {
6723 sd = &per_cpu(phys_domains, i); 5993 struct sched_domain *sd = &per_cpu(phys_domains, i);
5994
6724 init_sched_groups_power(i, sd); 5995 init_sched_groups_power(i, sd);
6725 } 5996 }
6726 5997
@@ -6748,10 +6019,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6748#endif 6019#endif
6749 cpu_attach_domain(sd, i); 6020 cpu_attach_domain(sd, i);
6750 } 6021 }
6751 /*
6752 * Tune cache-hot values:
6753 */
6754 calibrate_migration_costs(cpu_map);
6755 6022
6756 return 0; 6023 return 0;
6757 6024
@@ -6958,10 +6225,12 @@ void __init sched_init_smp(void)
6958 /* Move init over to a non-isolated CPU */ 6225 /* Move init over to a non-isolated CPU */
6959 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6226 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6960 BUG(); 6227 BUG();
6228 sched_init_granularity();
6961} 6229}
6962#else 6230#else
6963void __init sched_init_smp(void) 6231void __init sched_init_smp(void)
6964{ 6232{
6233 sched_init_granularity();
6965} 6234}
6966#endif /* CONFIG_SMP */ 6235#endif /* CONFIG_SMP */
6967 6236
@@ -6975,28 +6244,51 @@ int in_sched_functions(unsigned long addr)
6975 && addr < (unsigned long)__sched_text_end); 6244 && addr < (unsigned long)__sched_text_end);
6976} 6245}
6977 6246
6247static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6248{
6249 cfs_rq->tasks_timeline = RB_ROOT;
6250 cfs_rq->fair_clock = 1;
6251#ifdef CONFIG_FAIR_GROUP_SCHED
6252 cfs_rq->rq = rq;
6253#endif
6254}
6255
6978void __init sched_init(void) 6256void __init sched_init(void)
6979{ 6257{
6980 int i, j, k; 6258 u64 now = sched_clock();
6981 int highest_cpu = 0; 6259 int highest_cpu = 0;
6260 int i, j;
6261
6262 /*
6263 * Link up the scheduling class hierarchy:
6264 */
6265 rt_sched_class.next = &fair_sched_class;
6266 fair_sched_class.next = &idle_sched_class;
6267 idle_sched_class.next = NULL;
6982 6268
6983 for_each_possible_cpu(i) { 6269 for_each_possible_cpu(i) {
6984 struct prio_array *array; 6270 struct rt_prio_array *array;
6985 struct rq *rq; 6271 struct rq *rq;
6986 6272
6987 rq = cpu_rq(i); 6273 rq = cpu_rq(i);
6988 spin_lock_init(&rq->lock); 6274 spin_lock_init(&rq->lock);
6989 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6275 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6990 rq->nr_running = 0; 6276 rq->nr_running = 0;
6991 rq->active = rq->arrays; 6277 rq->clock = 1;
6992 rq->expired = rq->arrays + 1; 6278 init_cfs_rq(&rq->cfs, rq);
6993 rq->best_expired_prio = MAX_PRIO; 6279#ifdef CONFIG_FAIR_GROUP_SCHED
6280 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6281 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6282#endif
6283 rq->ls.load_update_last = now;
6284 rq->ls.load_update_start = now;
6994 6285
6286 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6287 rq->cpu_load[j] = 0;
6995#ifdef CONFIG_SMP 6288#ifdef CONFIG_SMP
6996 rq->sd = NULL; 6289 rq->sd = NULL;
6997 for (j = 1; j < 3; j++)
6998 rq->cpu_load[j] = 0;
6999 rq->active_balance = 0; 6290 rq->active_balance = 0;
6291 rq->next_balance = jiffies;
7000 rq->push_cpu = 0; 6292 rq->push_cpu = 0;
7001 rq->cpu = i; 6293 rq->cpu = i;
7002 rq->migration_thread = NULL; 6294 rq->migration_thread = NULL;
@@ -7004,16 +6296,14 @@ void __init sched_init(void)
7004#endif 6296#endif
7005 atomic_set(&rq->nr_iowait, 0); 6297 atomic_set(&rq->nr_iowait, 0);
7006 6298
7007 for (j = 0; j < 2; j++) { 6299 array = &rq->rt.active;
7008 array = rq->arrays + j; 6300 for (j = 0; j < MAX_RT_PRIO; j++) {
7009 for (k = 0; k < MAX_PRIO; k++) { 6301 INIT_LIST_HEAD(array->queue + j);
7010 INIT_LIST_HEAD(array->queue + k); 6302 __clear_bit(j, array->bitmap);
7011 __clear_bit(k, array->bitmap);
7012 }
7013 // delimiter for bitsearch
7014 __set_bit(MAX_PRIO, array->bitmap);
7015 } 6303 }
7016 highest_cpu = i; 6304 highest_cpu = i;
6305 /* delimiter for bitsearch: */
6306 __set_bit(MAX_RT_PRIO, array->bitmap);
7017 } 6307 }
7018 6308
7019 set_load_weight(&init_task); 6309 set_load_weight(&init_task);
@@ -7040,6 +6330,10 @@ void __init sched_init(void)
7040 * when this runqueue becomes "idle". 6330 * when this runqueue becomes "idle".
7041 */ 6331 */
7042 init_idle(current, smp_processor_id()); 6332 init_idle(current, smp_processor_id());
6333 /*
6334 * During early bootup we pretend to be a normal task:
6335 */
6336 current->sched_class = &fair_sched_class;
7043} 6337}
7044 6338
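The rt -> fair -> idle linking near the top of sched_init() above is what lets the core scheduler consult the classes in strict priority order. Conceptually (a sketch only, with simplified hook signatures; the literal pick_next_task() of this patch also short-cuts the everything-is-CFS case):

	const struct sched_class *class;
	struct task_struct *p;

	for (class = &rt_sched_class; class; class = class->next) {
		p = class->pick_next_task(rq, now);	/* first class with runnable work wins */
		if (p)
			return p;
	}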
7045#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6339#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7070,31 +6364,59 @@ EXPORT_SYMBOL(__might_sleep);
7070#ifdef CONFIG_MAGIC_SYSRQ 6364#ifdef CONFIG_MAGIC_SYSRQ
7071void normalize_rt_tasks(void) 6365void normalize_rt_tasks(void)
7072{ 6366{
7073 struct prio_array *array; 6367 struct task_struct *g, *p;
7074 struct task_struct *p;
7075 unsigned long flags; 6368 unsigned long flags;
7076 struct rq *rq; 6369 struct rq *rq;
6370 int on_rq;
7077 6371
7078 read_lock_irq(&tasklist_lock); 6372 read_lock_irq(&tasklist_lock);
7079 for_each_process(p) { 6373 do_each_thread(g, p) {
7080 if (!rt_task(p)) 6374 p->se.fair_key = 0;
6375 p->se.wait_runtime = 0;
6376 p->se.wait_start_fair = 0;
6377 p->se.wait_start = 0;
6378 p->se.exec_start = 0;
6379 p->se.sleep_start = 0;
6380 p->se.sleep_start_fair = 0;
6381 p->se.block_start = 0;
6382 task_rq(p)->cfs.fair_clock = 0;
6383 task_rq(p)->clock = 0;
6384
6385 if (!rt_task(p)) {
6386 /*
6387 * Renice negative nice level userspace
6388 * tasks back to 0:
6389 */
6390 if (TASK_NICE(p) < 0 && p->mm)
6391 set_user_nice(p, 0);
7081 continue; 6392 continue;
6393 }
7082 6394
7083 spin_lock_irqsave(&p->pi_lock, flags); 6395 spin_lock_irqsave(&p->pi_lock, flags);
7084 rq = __task_rq_lock(p); 6396 rq = __task_rq_lock(p);
6397#ifdef CONFIG_SMP
6398 /*
6399 * Do not touch the migration thread:
6400 */
6401 if (p == rq->migration_thread)
6402 goto out_unlock;
6403#endif
7085 6404
7086 array = p->array; 6405 on_rq = p->se.on_rq;
7087 if (array) 6406 if (on_rq)
7088 deactivate_task(p, task_rq(p)); 6407 deactivate_task(task_rq(p), p, 0);
7089 __setscheduler(p, SCHED_NORMAL, 0); 6408 __setscheduler(rq, p, SCHED_NORMAL, 0);
7090 if (array) { 6409 if (on_rq) {
7091 __activate_task(p, task_rq(p)); 6410 activate_task(task_rq(p), p, 0);
7092 resched_task(rq->curr); 6411 resched_task(rq->curr);
7093 } 6412 }
7094 6413#ifdef CONFIG_SMP
6414 out_unlock:
6415#endif
7095 __task_rq_unlock(rq); 6416 __task_rq_unlock(rq);
7096 spin_unlock_irqrestore(&p->pi_lock, flags); 6417 spin_unlock_irqrestore(&p->pi_lock, flags);
7097 } 6418 } while_each_thread(g, p);
6419
7098 read_unlock_irq(&tasklist_lock); 6420 read_unlock_irq(&tasklist_lock);
7099} 6421}
7100 6422