path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  3021
1 files changed, 1146 insertions, 1875 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 50e1a3122699..9fbced64bfee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,13 +16,19 @@
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
19 */ 25 */
20 26
21#include <linux/mm.h> 27#include <linux/mm.h>
22#include <linux/module.h> 28#include <linux/module.h>
23#include <linux/nmi.h> 29#include <linux/nmi.h>
24#include <linux/init.h> 30#include <linux/init.h>
25#include <asm/uaccess.h> 31#include <linux/uaccess.h>
26#include <linux/highmem.h> 32#include <linux/highmem.h>
27#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
28#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
@@ -53,9 +59,9 @@
53#include <linux/kprobes.h> 59#include <linux/kprobes.h>
54#include <linux/delayacct.h> 60#include <linux/delayacct.h>
55#include <linux/reciprocal_div.h> 61#include <linux/reciprocal_div.h>
62#include <linux/unistd.h>
56 63
57#include <asm/tlb.h> 64#include <asm/tlb.h>
58#include <asm/unistd.h>
59 65
60/* 66/*
61 * Scheduler clock - returns current time in nanosec units. 67 * Scheduler clock - returns current time in nanosec units.
@@ -91,6 +97,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
91#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 97#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 98#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
93 99
100#define NICE_0_LOAD SCHED_LOAD_SCALE
101#define NICE_0_SHIFT SCHED_LOAD_SHIFT
102
94/* 103/*
95 * These are the 'tuning knobs' of the scheduler: 104 * These are the 'tuning knobs' of the scheduler:
96 * 105 *
@@ -100,87 +109,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
100 */ 109 */
101#define MIN_TIMESLICE max(5 * HZ / 1000, 1) 110#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
102#define DEF_TIMESLICE (100 * HZ / 1000) 111#define DEF_TIMESLICE (100 * HZ / 1000)
103#define ON_RUNQUEUE_WEIGHT 30
104#define CHILD_PENALTY 95
105#define PARENT_PENALTY 100
106#define EXIT_WEIGHT 3
107#define PRIO_BONUS_RATIO 25
108#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
109#define INTERACTIVE_DELTA 2
110#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
111#define STARVATION_LIMIT (MAX_SLEEP_AVG)
112#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
113
114/*
115 * If a task is 'interactive' then we reinsert it in the active
116 * array after it has expired its current timeslice. (it will not
117 * continue to run immediately, it will still roundrobin with
118 * other interactive tasks.)
119 *
120 * This part scales the interactivity limit depending on niceness.
121 *
122 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
123 * Here are a few examples of different nice levels:
124 *
125 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
126 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
127 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
129 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
130 *
131 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
132 * priority range a task can explore, a value of '1' means the
133 * task is rated interactive.)
134 *
135 * Ie. nice +19 tasks can never get 'interactive' enough to be
136 * reinserted into the active array. And only heavily CPU-hog nice -20
137 * tasks will be expired. Default nice 0 tasks are somewhere between,
138 * it takes some effort for them to get interactive, but it's not
139 * too hard.
140 */
141
142#define CURRENT_BONUS(p) \
143 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
144 MAX_SLEEP_AVG)
145
146#define GRANULARITY (10 * HZ / 1000 ? : 1)
147
148#ifdef CONFIG_SMP
149#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
150 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
151 num_online_cpus())
152#else
153#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
154 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
155#endif
156
157#define SCALE(v1,v1_max,v2_max) \
158 (v1) * (v2_max) / (v1_max)
159
160#define DELTA(p) \
161 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
162 INTERACTIVE_DELTA)
163
164#define TASK_INTERACTIVE(p) \
165 ((p)->prio <= (p)->static_prio - DELTA(p))
166
167#define INTERACTIVE_SLEEP(p) \
168 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
169 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
170
171#define TASK_PREEMPTS_CURR(p, rq) \
172 ((p)->prio < (rq)->curr->prio)
173
174#define SCALE_PRIO(x, prio) \
175 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
176
177static unsigned int static_prio_timeslice(int static_prio)
178{
179 if (static_prio < NICE_TO_PRIO(0))
180 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
181 else
182 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
183}
184 112
185#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
186/* 114/*
@@ -203,28 +131,87 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
203} 131}
204#endif 132#endif
205 133
134#define SCALE_PRIO(x, prio) \
135 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
136
206/* 137/*
207 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 138 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
208 * to time slice values: [800ms ... 100ms ... 5ms] 139 * to time slice values: [800ms ... 100ms ... 5ms]
209 *
210 * The higher a thread's priority, the bigger timeslices
211 * it gets during one round of execution. But even the lowest
212 * priority thread gets MIN_TIMESLICE worth of execution time.
213 */ 140 */
141static unsigned int static_prio_timeslice(int static_prio)
142{
143 if (static_prio == NICE_TO_PRIO(19))
144 return 1;
145
146 if (static_prio < NICE_TO_PRIO(0))
147 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
148 else
149 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
150}
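
The comment above gives the nice -> timeslice mapping in milliseconds; a standalone sketch of the same arithmetic (not part of the patch, assuming HZ=1000 and the usual MAX_RT_PRIO=100 / MAX_PRIO=140 priority layout) reproduces the 800ms/100ms endpoints and the new one-jiffy case for nice +19:

#include <stdio.h>

#define HZ              1000    /* assumed for the sketch */
#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define MAX_USER_PRIO   40
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)
#define MIN_TIMESLICE   (5 * HZ / 1000 > 1 ? 5 * HZ / 1000 : 1)
#define DEF_TIMESLICE   (100 * HZ / 1000)
#define SCALE_PRIO(x, prio) \
        ((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
         (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
        if (static_prio == NICE_TO_PRIO(19))
                return 1;                       /* nice +19: a single jiffy */
        if (static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
        return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
        int nices[] = { -20, -10, 0, 10, 19 };

        for (unsigned int i = 0; i < sizeof(nices) / sizeof(nices[0]); i++)
                printf("nice %3d -> %u jiffies\n", nices[i],
                       static_prio_timeslice(NICE_TO_PRIO(nices[i])));
        return 0;       /* prints 800, 600, 100, 50, 1 at HZ=1000 */
}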
151
152static inline int rt_policy(int policy)
153{
154 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
155 return 1;
156 return 0;
157}
214 158
215static inline unsigned int task_timeslice(struct task_struct *p) 159static inline int task_has_rt_policy(struct task_struct *p)
216{ 160{
217 return static_prio_timeslice(p->static_prio); 161 return rt_policy(p->policy);
218} 162}
219 163
220/* 164/*
221 * These are the runqueue data structures: 165 * This is the priority-queue data structure of the RT scheduling class:
222 */ 166 */
167struct rt_prio_array {
168 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
169 struct list_head queue[MAX_RT_PRIO];
170};
171
172struct load_stat {
173 struct load_weight load;
174 u64 load_update_start, load_update_last;
175 unsigned long delta_fair, delta_exec, delta_stat;
176};
177
178/* CFS-related fields in a runqueue */
179struct cfs_rq {
180 struct load_weight load;
181 unsigned long nr_running;
182
183 s64 fair_clock;
184 u64 exec_clock;
185 s64 wait_runtime;
186 u64 sleeper_bonus;
187 unsigned long wait_runtime_overruns, wait_runtime_underruns;
188
189 struct rb_root tasks_timeline;
190 struct rb_node *rb_leftmost;
191 struct rb_node *rb_load_balance_curr;
192#ifdef CONFIG_FAIR_GROUP_SCHED
193 /* 'curr' points to currently running entity on this cfs_rq.
194 * It is set to NULL otherwise (i.e when none are currently running).
195 */
196 struct sched_entity *curr;
197 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
223 198
224struct prio_array { 199 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
225 unsigned int nr_active; 200 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
226 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 201 * (like users, containers etc.)
227 struct list_head queue[MAX_PRIO]; 202 *
203 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
204 * list is used during load balance.
205 */
206 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
207#endif
208};
209
210/* Real-Time classes' related field in a runqueue: */
211struct rt_rq {
212 struct rt_prio_array active;
213 int rt_load_balance_idx;
214 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
228}; 215};
229 216
230/* 217/*
@@ -235,22 +222,28 @@ struct prio_array {
235 * acquire operations must be ordered by ascending &runqueue. 222 * acquire operations must be ordered by ascending &runqueue.
236 */ 223 */
237struct rq { 224struct rq {
238 spinlock_t lock; 225 spinlock_t lock; /* runqueue lock */
239 226
240 /* 227 /*
241 * nr_running and cpu_load should be in the same cacheline because 228 * nr_running and cpu_load should be in the same cacheline because
242 * remote CPUs use both these fields when doing load calculation. 229 * remote CPUs use both these fields when doing load calculation.
243 */ 230 */
244 unsigned long nr_running; 231 unsigned long nr_running;
245 unsigned long raw_weighted_load; 232 #define CPU_LOAD_IDX_MAX 5
246#ifdef CONFIG_SMP 233 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
247 unsigned long cpu_load[3];
248 unsigned char idle_at_tick; 234 unsigned char idle_at_tick;
249#ifdef CONFIG_NO_HZ 235#ifdef CONFIG_NO_HZ
250 unsigned char in_nohz_recently; 236 unsigned char in_nohz_recently;
251#endif 237#endif
238 struct load_stat ls; /* capture load from *all* tasks on this cpu */
239 unsigned long nr_load_updates;
240 u64 nr_switches;
241
242 struct cfs_rq cfs;
243#ifdef CONFIG_FAIR_GROUP_SCHED
244 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
252#endif 245#endif
253 unsigned long long nr_switches; 246 struct rt_rq rt;
254 247
255 /* 248 /*
256 * This is part of a global counter where only the total sum 249 * This is part of a global counter where only the total sum
@@ -260,14 +253,18 @@ struct rq {
260 */ 253 */
261 unsigned long nr_uninterruptible; 254 unsigned long nr_uninterruptible;
262 255
263 unsigned long expired_timestamp;
264 /* Cached timestamp set by update_cpu_clock() */
265 unsigned long long most_recent_timestamp;
266 struct task_struct *curr, *idle; 256 struct task_struct *curr, *idle;
267 unsigned long next_balance; 257 unsigned long next_balance;
268 struct mm_struct *prev_mm; 258 struct mm_struct *prev_mm;
269 struct prio_array *active, *expired, arrays[2]; 259
270 int best_expired_prio; 260 u64 clock, prev_clock_raw;
261 s64 clock_max_delta;
262
263 unsigned int clock_warps, clock_overflows;
264 unsigned int clock_unstable_events;
265
266 struct sched_class *load_balance_class;
267
271 atomic_t nr_iowait; 268 atomic_t nr_iowait;
272 269
273#ifdef CONFIG_SMP 270#ifdef CONFIG_SMP
@@ -307,6 +304,11 @@ struct rq {
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex); 305static DEFINE_MUTEX(sched_hotcpu_mutex);
309 306
307static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
308{
309 rq->curr->sched_class->check_preempt_curr(rq, p);
310}
311
310static inline int cpu_of(struct rq *rq) 312static inline int cpu_of(struct rq *rq)
311{ 313{
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
@@ -317,6 +319,52 @@ static inline int cpu_of(struct rq *rq)
317} 319}
318 320
319/* 321/*
322 * Per-runqueue clock, as finegrained as the platform can give us:
323 */
324static unsigned long long __rq_clock(struct rq *rq)
325{
326 u64 prev_raw = rq->prev_clock_raw;
327 u64 now = sched_clock();
328 s64 delta = now - prev_raw;
329 u64 clock = rq->clock;
330
331 /*
332 * Protect against sched_clock() occasionally going backwards:
333 */
334 if (unlikely(delta < 0)) {
335 clock++;
336 rq->clock_warps++;
337 } else {
338 /*
339 * Catch too large forward jumps too:
340 */
341 if (unlikely(delta > 2*TICK_NSEC)) {
342 clock++;
343 rq->clock_overflows++;
344 } else {
345 if (unlikely(delta > rq->clock_max_delta))
346 rq->clock_max_delta = delta;
347 clock += delta;
348 }
349 }
350
351 rq->prev_clock_raw = now;
352 rq->clock = clock;
353
354 return clock;
355}
356
357static inline unsigned long long rq_clock(struct rq *rq)
358{
359 int this_cpu = smp_processor_id();
360
361 if (this_cpu == cpu_of(rq))
362 return __rq_clock(rq);
363
364 return rq->clock;
365}
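
__rq_clock() above turns a raw sched_clock() that can occasionally step backwards or leap forward into a monotonic per-runqueue clock. A user-space sketch of just that guard, with hypothetical names and the max-delta bookkeeping dropped, behaves like this:

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL            /* assumed: 1ms tick, HZ=1000 */

struct fake_rq {
        uint64_t clock, prev_clock_raw;
        unsigned int clock_warps, clock_overflows;
};

/* Same guard as __rq_clock(): absorb backward and oversized forward jumps. */
static uint64_t fake_rq_clock(struct fake_rq *rq, uint64_t raw_now)
{
        int64_t delta = (int64_t)(raw_now - rq->prev_clock_raw);

        if (delta < 0) {                        /* raw clock went backwards */
                rq->clock++;
                rq->clock_warps++;
        } else if (delta > (int64_t)(2 * TICK_NSEC)) {  /* too large a jump */
                rq->clock++;
                rq->clock_overflows++;
        } else {
                rq->clock += delta;
        }
        rq->prev_clock_raw = raw_now;
        return rq->clock;
}

int main(void)
{
        struct fake_rq rq = { 0, 0, 0, 0 };
        uint64_t raw[] = { 100, 200, 150, 50000000 };   /* raw ns samples */

        for (unsigned int i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
                printf("raw=%llu -> clock=%llu\n",
                       (unsigned long long)raw[i],
                       (unsigned long long)fake_rq_clock(&rq, raw[i]));
        /* clock ends at 202: the backward step and the 50ms leap add 1ns each */
        return 0;
}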
366
367/*
320 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 368 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
321 * See detach_destroy_domains: synchronize_sched for details. 369 * See detach_destroy_domains: synchronize_sched for details.
322 * 370 *
@@ -331,6 +379,18 @@ static inline int cpu_of(struct rq *rq)
331#define task_rq(p) cpu_rq(task_cpu(p)) 379#define task_rq(p) cpu_rq(task_cpu(p))
332#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 380#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
333 381
382#ifdef CONFIG_FAIR_GROUP_SCHED
383/* Change a task's ->cfs_rq if it moves across CPUs */
384static inline void set_task_cfs_rq(struct task_struct *p)
385{
386 p->se.cfs_rq = &task_rq(p)->cfs;
387}
388#else
389static inline void set_task_cfs_rq(struct task_struct *p)
390{
391}
392#endif
393
334#ifndef prepare_arch_switch 394#ifndef prepare_arch_switch
335# define prepare_arch_switch(next) do { } while (0) 395# define prepare_arch_switch(next) do { } while (0)
336#endif 396#endif
@@ -460,134 +520,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
460 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
461} 521}
462 522
463#ifdef CONFIG_SCHEDSTATS
464/*
465 * bump this up when changing the output format or the meaning of an existing
466 * format, so that tools can adapt (or abort)
467 */
468#define SCHEDSTAT_VERSION 14
469
470static int show_schedstat(struct seq_file *seq, void *v)
471{
472 int cpu;
473
474 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
475 seq_printf(seq, "timestamp %lu\n", jiffies);
476 for_each_online_cpu(cpu) {
477 struct rq *rq = cpu_rq(cpu);
478#ifdef CONFIG_SMP
479 struct sched_domain *sd;
480 int dcnt = 0;
481#endif
482
483 /* runqueue-specific stats */
484 seq_printf(seq,
485 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
486 cpu, rq->yld_both_empty,
487 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
488 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
489 rq->ttwu_cnt, rq->ttwu_local,
490 rq->rq_sched_info.cpu_time,
491 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
492
493 seq_printf(seq, "\n");
494
495#ifdef CONFIG_SMP
496 /* domain-specific stats */
497 preempt_disable();
498 for_each_domain(cpu, sd) {
499 enum idle_type itype;
500 char mask_str[NR_CPUS];
501
502 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
503 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
504 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
505 itype++) {
506 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
507 "%lu",
508 sd->lb_cnt[itype],
509 sd->lb_balanced[itype],
510 sd->lb_failed[itype],
511 sd->lb_imbalance[itype],
512 sd->lb_gained[itype],
513 sd->lb_hot_gained[itype],
514 sd->lb_nobusyq[itype],
515 sd->lb_nobusyg[itype]);
516 }
517 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
518 " %lu %lu %lu\n",
519 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
520 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
521 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
522 sd->ttwu_wake_remote, sd->ttwu_move_affine,
523 sd->ttwu_move_balance);
524 }
525 preempt_enable();
526#endif
527 }
528 return 0;
529}
530
531static int schedstat_open(struct inode *inode, struct file *file)
532{
533 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
534 char *buf = kmalloc(size, GFP_KERNEL);
535 struct seq_file *m;
536 int res;
537
538 if (!buf)
539 return -ENOMEM;
540 res = single_open(file, show_schedstat, NULL);
541 if (!res) {
542 m = file->private_data;
543 m->buf = buf;
544 m->size = size;
545 } else
546 kfree(buf);
547 return res;
548}
549
550const struct file_operations proc_schedstat_operations = {
551 .open = schedstat_open,
552 .read = seq_read,
553 .llseek = seq_lseek,
554 .release = single_release,
555};
556
557/*
558 * Expects runqueue lock to be held for atomicity of update
559 */
560static inline void
561rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
562{
563 if (rq) {
564 rq->rq_sched_info.run_delay += delta_jiffies;
565 rq->rq_sched_info.pcnt++;
566 }
567}
568
569/*
570 * Expects runqueue lock to be held for atomicity of update
571 */
572static inline void
573rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
574{
575 if (rq)
576 rq->rq_sched_info.cpu_time += delta_jiffies;
577}
578# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
579# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
580#else /* !CONFIG_SCHEDSTATS */
581static inline void
582rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
583{}
584static inline void
585rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
586{}
587# define schedstat_inc(rq, field) do { } while (0)
588# define schedstat_add(rq, field, amt) do { } while (0)
589#endif
590
591/* 523/*
592 * this_rq_lock - lock this runqueue and disable interrupts. 524 * this_rq_lock - lock this runqueue and disable interrupts.
593 */ 525 */
@@ -603,177 +535,172 @@ static inline struct rq *this_rq_lock(void)
603 return rq; 535 return rq;
604} 536}
605 537
606#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
607/* 538/*
608 * Called when a process is dequeued from the active array and given 539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
609 * the cpu. We should note that with the exception of interactive
610 * tasks, the expired queue will become the active queue after the active
611 * queue is empty, without explicitly dequeuing and requeuing tasks in the
612 * expired queue. (Interactive tasks may be requeued directly to the
613 * active queue, thus delaying tasks in the expired queue from running;
614 * see scheduler_tick()).
615 *
616 * This function is only called from sched_info_arrive(), rather than
617 * dequeue_task(). Even though a task may be queued and dequeued multiple
618 * times as it is shuffled about, we're really interested in knowing how
619 * long it was from the *first* time it was queued to the time that it
620 * finally hit a cpu.
621 */ 540 */
622static inline void sched_info_dequeued(struct task_struct *t) 541void sched_clock_unstable_event(void)
623{ 542{
624 t->sched_info.last_queued = 0; 543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
625} 550}
626 551
627/* 552/*
628 * Called when a task finally hits the cpu. We can now calculate how 553 * resched_task - mark a task 'to be rescheduled now'.
629 * long it was waiting to run. We also note when it began so that we 554 *
630 * can keep stats on how long its timeslice is. 555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
631 */ 558 */
632static void sched_info_arrive(struct task_struct *t) 559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
633{ 566{
634 unsigned long now = jiffies, delta_jiffies = 0; 567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
635 573
636 if (t->sched_info.last_queued) 574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
637 delta_jiffies = now - t->sched_info.last_queued; 575
638 sched_info_dequeued(t); 576 cpu = task_cpu(p);
639 t->sched_info.run_delay += delta_jiffies; 577 if (cpu == smp_processor_id())
640 t->sched_info.last_arrival = now; 578 return;
641 t->sched_info.pcnt++;
642 579
643 rq_sched_info_arrive(task_rq(t), delta_jiffies); 580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
644} 584}
645 585
646/* 586static void resched_cpu(int cpu)
647 * Called when a process is queued into either the active or expired
648 * array. The time is noted and later used to determine how long we
649 * had to wait for us to reach the cpu. Since the expired queue will
650 * become the active queue after active queue is empty, without dequeuing
651 * and requeuing any tasks, we are interested in queuing to either. It
652 * is unusual but not impossible for tasks to be dequeued and immediately
653 * requeued in the same or another array: this can happen in sched_yield(),
654 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
655 * to runqueue.
656 *
657 * This function is only called from enqueue_task(), but also only updates
658 * the timestamp if it is already not set. It's assumed that
659 * sched_info_dequeued() will clear that stamp when appropriate.
660 */
661static inline void sched_info_queued(struct task_struct *t)
662{ 587{
663 if (unlikely(sched_info_on())) 588 struct rq *rq = cpu_rq(cpu);
664 if (!t->sched_info.last_queued) 589 unsigned long flags;
665 t->sched_info.last_queued = jiffies; 590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
666} 595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601}
602#endif
667 603
668/* 604static u64 div64_likely32(u64 divident, unsigned long divisor)
669 * Called when a process ceases being the active-running process, either
670 * voluntarily or involuntarily. Now we can calculate how long we ran.
671 */
672static inline void sched_info_depart(struct task_struct *t)
673{ 605{
674 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 606#if BITS_PER_LONG == 32
607 if (likely(divident <= 0xffffffffULL))
608 return (u32)divident / divisor;
609 do_div(divident, divisor);
675 610
676 t->sched_info.cpu_time += delta_jiffies; 611 return divident;
677 rq_sched_info_depart(task_rq(t), delta_jiffies); 612#else
613 return divident / divisor;
614#endif
678} 615}
679 616
680/* 617#if BITS_PER_LONG == 32
681 * Called when tasks are switched involuntarily due, typically, to expiring 618# define WMULT_CONST (~0UL)
682 * their time slice. (This may also be called when switching to or from 619#else
683 * the idle task.) We are only called when prev != next. 620# define WMULT_CONST (1UL << 32)
684 */ 621#endif
685static inline void 622
686__sched_info_switch(struct task_struct *prev, struct task_struct *next) 623#define WMULT_SHIFT 32
624
625static inline unsigned long
626calc_delta_mine(unsigned long delta_exec, unsigned long weight,
627 struct load_weight *lw)
687{ 628{
688 struct rq *rq = task_rq(prev); 629 u64 tmp;
689 630
631 if (unlikely(!lw->inv_weight))
632 lw->inv_weight = WMULT_CONST / lw->weight;
633
634 tmp = (u64)delta_exec * weight;
690 /* 635 /*
691 * prev now departs the cpu. It's not interesting to record 636 * Check whether we'd overflow the 64-bit multiplication:
692 * stats about how efficient we were at scheduling the idle
693 * process, however.
694 */ 637 */
695 if (prev != rq->idle) 638 if (unlikely(tmp > WMULT_CONST)) {
696 sched_info_depart(prev); 639 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
640 >> (WMULT_SHIFT/2);
641 } else {
642 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
643 }
697 644
698 if (next != rq->idle) 645 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
699 sched_info_arrive(next);
700}
701static inline void
702sched_info_switch(struct task_struct *prev, struct task_struct *next)
703{
704 if (unlikely(sched_info_on()))
705 __sched_info_switch(prev, next);
706} 646}
707#else
708#define sched_info_queued(t) do { } while (0)
709#define sched_info_switch(t, next) do { } while (0)
710#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
711 647
712/* 648static inline unsigned long
713 * Adding/removing a task to/from a priority array: 649calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
714 */
715static void dequeue_task(struct task_struct *p, struct prio_array *array)
716{ 650{
717 array->nr_active--; 651 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
718 list_del(&p->run_list);
719 if (list_empty(array->queue + p->prio))
720 __clear_bit(p->prio, array->bitmap);
721} 652}
722 653
723static void enqueue_task(struct task_struct *p, struct prio_array *array) 654static void update_load_add(struct load_weight *lw, unsigned long inc)
724{ 655{
725 sched_info_queued(p); 656 lw->weight += inc;
726 list_add_tail(&p->run_list, array->queue + p->prio); 657 lw->inv_weight = 0;
727 __set_bit(p->prio, array->bitmap);
728 array->nr_active++;
729 p->array = array;
730} 658}
731 659
732/* 660static void update_load_sub(struct load_weight *lw, unsigned long dec)
733 * Put task to the end of the run list without the overhead of dequeue
734 * followed by enqueue.
735 */
736static void requeue_task(struct task_struct *p, struct prio_array *array)
737{ 661{
738 list_move_tail(&p->run_list, array->queue + p->prio); 662 lw->weight -= dec;
663 lw->inv_weight = 0;
739} 664}
740 665
741static inline void 666static void __update_curr_load(struct rq *rq, struct load_stat *ls)
742enqueue_task_head(struct task_struct *p, struct prio_array *array)
743{ 667{
744 list_add(&p->run_list, array->queue + p->prio); 668 if (rq->curr != rq->idle && ls->load.weight) {
745 __set_bit(p->prio, array->bitmap); 669 ls->delta_exec += ls->delta_stat;
746 array->nr_active++; 670 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
747 p->array = array; 671 ls->delta_stat = 0;
672 }
748} 673}
749 674
750/* 675/*
751 * __normal_prio - return the priority that is based on the static 676 * Update delta_exec, delta_fair fields for rq.
752 * priority but is modified by bonuses/penalties.
753 * 677 *
754 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 678 * delta_fair clock advances at a rate inversely proportional to
755 * into the -5 ... 0 ... +5 bonus/penalty range. 679 * total load (rq->ls.load.weight) on the runqueue, while
680 * delta_exec advances at the same rate as wall-clock (provided
681 * cpu is not idle).
756 * 682 *
757 * We use 25% of the full 0...39 priority range so that: 683 * delta_exec / delta_fair is a measure of the (smoothened) load on this
684 * runqueue over any given interval. This (smoothened) load is used
685 * during load balance.
758 * 686 *
759 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 687 * This function is called /before/ updating rq->ls.load
760 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 688 * and when switching tasks.
761 *
762 * Both properties are important to certain workloads.
763 */ 689 */
764 690static void update_curr_load(struct rq *rq, u64 now)
765static inline int __normal_prio(struct task_struct *p)
766{ 691{
767 int bonus, prio; 692 struct load_stat *ls = &rq->ls;
768 693 u64 start;
769 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
770 694
771 prio = p->static_prio - bonus; 695 start = ls->load_update_start;
772 if (prio < MAX_RT_PRIO) 696 ls->load_update_start = now;
773 prio = MAX_RT_PRIO; 697 ls->delta_stat += now - start;
774 if (prio > MAX_PRIO-1) 698 /*
775 prio = MAX_PRIO-1; 699 * Stagger updates to ls->delta_fair. Very frequent updates
776 return prio; 700 * can be expensive.
701 */
702 if (ls->delta_stat >= sysctl_sched_stat_granularity)
703 __update_curr_load(rq, ls);
777} 704}
778 705
779/* 706/*
@@ -791,53 +718,146 @@ static inline int __normal_prio(struct task_struct *p)
791 * this code will need modification 718 * this code will need modification
792 */ 719 */
793#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 720#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794#define LOAD_WEIGHT(lp) \ 721#define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 722 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796#define PRIO_TO_LOAD_WEIGHT(prio) \ 723#define PRIO_TO_LOAD_WEIGHT(prio) \
797 LOAD_WEIGHT(static_prio_timeslice(prio)) 724 load_weight(static_prio_timeslice(prio))
798#define RTPRIO_TO_LOAD_WEIGHT(rp) \ 725#define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 726 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
800 727
801static void set_load_weight(struct task_struct *p) 728#define WEIGHT_IDLEPRIO 2
802{ 729#define WMULT_IDLEPRIO (1 << 31)
803 if (has_rt_policy(p)) { 730
804#ifdef CONFIG_SMP 731/*
805 if (p == task_rq(p)->migration_thread) 732 * Nice levels are multiplicative, with a gentle 10% change for every
806 /* 733 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * The migration thread does the actual balancing. 734 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * Giving its load any weight will skew balancing 735 * that remained on nice 0.
809 * adversely. 736 *
810 */ 737 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 p->load_weight = 0; 738 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 else 739 * it's +10% CPU usage.
813#endif 740 */
814 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 741static const int prio_to_weight[40] = {
815 } else 742/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 743/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817} 744/* 0 */ NICE_0_LOAD /* 1024 */,
745/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
746/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
747};
748
749static const u32 prio_to_wmult[40] = {
750 48356, 60446, 75558, 94446, 118058, 147573,
751 184467, 230589, 288233, 360285, 450347,
752 562979, 703746, 879575, 1099582, 1374389,
 753 1717986, 2147483, 2684354, 3355443, 4194304,
 754 5244160, 6557201, 8196502, 10250518, 12782640,
755 16025997, 19976592, 24970740, 31350126, 39045157,
756 49367440, 61356675, 76695844, 95443717, 119304647,
757 148102320, 186737708, 238609294, 286331153,
758};
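
The prio_to_weight[] and prio_to_wmult[] tables encode the multiplicative "10% per nice level" rule described above: each nice step multiplies or divides the nice-0 weight of 1024 by about 1.25, and wmult caches 2^32/weight for calc_delta_mine(). A sketch that regenerates approximately these values follows; it is not part of the patch, and a few committed entries differ slightly because they were rounded by hand:

#include <stdint.h>
#include <stdio.h>
#include <math.h>

int main(void)
{
        for (int nice = -20; nice <= 19; nice++) {
                /* ~10% CPU share per nice level ~= a weight ratio of 1.25 */
                unsigned long weight =
                        (unsigned long)llround(1024.0 / pow(1.25, nice));
                uint32_t wmult = (uint32_t)((1ULL << 32) / weight);

                printf("nice %3d: weight %6lu  wmult %10u\n",
                       nice, weight, wmult);
        }
        return 0;       /* build with -lm */
}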
818 759
819static inline void 760static inline void
820inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 761inc_load(struct rq *rq, const struct task_struct *p, u64 now)
821{ 762{
822 rq->raw_weighted_load += p->load_weight; 763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
823} 765}
824 766
825static inline void 767static inline void
826dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 768dec_load(struct rq *rq, const struct task_struct *p, u64 now)
827{ 769{
828 rq->raw_weighted_load -= p->load_weight; 770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
829} 772}
830 773
831static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 774static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
832{ 775{
833 rq->nr_running++; 776 rq->nr_running++;
834 inc_raw_weighted_load(rq, p); 777 inc_load(rq, p, now);
835} 778}
836 779
837static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 780static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
838{ 781{
839 rq->nr_running--; 782 rq->nr_running--;
840 dec_raw_weighted_load(rq, p); 783 dec_load(rq, p, now);
784}
785
786static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
787
788/*
789 * runqueue iterator, to support SMP load-balancing between different
790 * scheduling classes, without having to expose their internal data
791 * structures to the load-balancing proper:
792 */
793struct rq_iterator {
794 void *arg;
795 struct task_struct *(*start)(void *);
796 struct task_struct *(*next)(void *);
797};
798
799static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
805
806#include "sched_stats.h"
807#include "sched_rt.c"
808#include "sched_fair.c"
809#include "sched_idletask.c"
810#ifdef CONFIG_SCHED_DEBUG
811# include "sched_debug.c"
812#endif
813
814#define sched_class_highest (&rt_sched_class)
815
816static void set_load_weight(struct task_struct *p)
817{
818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
820
821 if (task_has_rt_policy(p)) {
822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
825 }
826
827 /*
828 * SCHED_IDLE tasks get minimal weight:
829 */
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
834 }
835
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
838}
839
840static void
841enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
842{
843 sched_info_queued(p);
844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
846}
847
848static void
849dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
850{
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
853}
854
855/*
856 * __normal_prio - return the priority that is based on the static prio
857 */
858static inline int __normal_prio(struct task_struct *p)
859{
860 return p->static_prio;
841} 861}
842 862
843/* 863/*
@@ -851,7 +871,7 @@ static inline int normal_prio(struct task_struct *p)
851{ 871{
852 int prio; 872 int prio;
853 873
854 if (has_rt_policy(p)) 874 if (task_has_rt_policy(p))
855 prio = MAX_RT_PRIO-1 - p->rt_priority; 875 prio = MAX_RT_PRIO-1 - p->rt_priority;
856 else 876 else
857 prio = __normal_prio(p); 877 prio = __normal_prio(p);
@@ -879,222 +899,47 @@ static int effective_prio(struct task_struct *p)
879} 899}
880 900
881/* 901/*
882 * __activate_task - move a task to the runqueue. 902 * activate_task - move a task to the runqueue.
883 */
884static void __activate_task(struct task_struct *p, struct rq *rq)
885{
886 struct prio_array *target = rq->active;
887
888 if (batch_task(p))
889 target = rq->expired;
890 enqueue_task(p, target);
891 inc_nr_running(p, rq);
892}
893
894/*
895 * __activate_idle_task - move idle task to the _front_ of runqueue.
896 */
897static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
898{
899 enqueue_task_head(p, rq->active);
900 inc_nr_running(p, rq);
901}
902
903/*
904 * Recalculate p->normal_prio and p->prio after having slept,
905 * updating the sleep-average too:
906 */ 903 */
907static int recalc_task_prio(struct task_struct *p, unsigned long long now) 904static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
908{ 905{
909 /* Caller must always ensure 'now >= p->timestamp' */ 906 u64 now = rq_clock(rq);
910 unsigned long sleep_time = now - p->timestamp;
911
912 if (batch_task(p))
913 sleep_time = 0;
914
915 if (likely(sleep_time > 0)) {
916 /*
917 * This ceiling is set to the lowest priority that would allow
918 * a task to be reinserted into the active array on timeslice
919 * completion.
920 */
921 unsigned long ceiling = INTERACTIVE_SLEEP(p);
922
923 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
924 /*
925 * Prevents user tasks from achieving best priority
926 * with one single large enough sleep.
927 */
928 p->sleep_avg = ceiling;
929 /*
930 * Using INTERACTIVE_SLEEP() as a ceiling places a
931 * nice(0) task 1ms sleep away from promotion, and
932 * gives it 700ms to round-robin with no chance of
933 * being demoted. This is more than generous, so
934 * mark this sleep as non-interactive to prevent the
935 * on-runqueue bonus logic from intervening should
936 * this task not receive cpu immediately.
937 */
938 p->sleep_type = SLEEP_NONINTERACTIVE;
939 } else {
940 /*
941 * Tasks waking from uninterruptible sleep are
942 * limited in their sleep_avg rise as they
943 * are likely to be waiting on I/O
944 */
945 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
946 if (p->sleep_avg >= ceiling)
947 sleep_time = 0;
948 else if (p->sleep_avg + sleep_time >=
949 ceiling) {
950 p->sleep_avg = ceiling;
951 sleep_time = 0;
952 }
953 }
954 907
955 /* 908 if (p->state == TASK_UNINTERRUPTIBLE)
956 * This code gives a bonus to interactive tasks. 909 rq->nr_uninterruptible--;
957 *
958 * The boost works by updating the 'average sleep time'
959 * value here, based on ->timestamp. The more time a
960 * task spends sleeping, the higher the average gets -
961 * and the higher the priority boost gets as well.
962 */
963 p->sleep_avg += sleep_time;
964
965 }
966 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
967 p->sleep_avg = NS_MAX_SLEEP_AVG;
968 }
969 910
970 return effective_prio(p); 911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
971} 913}
972 914
973/* 915/*
974 * activate_task - move a task to the runqueue and do priority recalculation 916 * activate_idle_task - move idle task to the _front_ of runqueue.
975 *
976 * Update all the scheduling statistics stuff. (sleep average
977 * calculation, priority modifiers, etc.)
978 */ 917 */
979static void activate_task(struct task_struct *p, struct rq *rq, int local) 918static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
980{ 919{
981 unsigned long long now; 920 u64 now = rq_clock(rq);
982 921
983 if (rt_task(p)) 922 if (p->state == TASK_UNINTERRUPTIBLE)
984 goto out; 923 rq->nr_uninterruptible--;
985
986 now = sched_clock();
987#ifdef CONFIG_SMP
988 if (!local) {
989 /* Compensate for drifting sched_clock */
990 struct rq *this_rq = this_rq();
991 now = (now - this_rq->most_recent_timestamp)
992 + rq->most_recent_timestamp;
993 }
994#endif
995
996 /*
997 * Sleep time is in units of nanosecs, so shift by 20 to get a
998 * milliseconds-range estimation of the amount of time that the task
999 * spent sleeping:
1000 */
1001 if (unlikely(prof_on == SLEEP_PROFILING)) {
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
1004 (now - p->timestamp) >> 20);
1005 }
1006
1007 p->prio = recalc_task_prio(p, now);
1008 924
1009 /* 925 enqueue_task(rq, p, 0, now);
1010 * This checks to make sure it's not an uninterruptible task 926 inc_nr_running(p, rq, now);
1011 * that is now waking up.
1012 */
1013 if (p->sleep_type == SLEEP_NORMAL) {
1014 /*
1015 * Tasks which were woken up by interrupts (ie. hw events)
1016 * are most likely of interactive nature. So we give them
1017 * the credit of extending their sleep time to the period
1018 * of time they spend on the runqueue, waiting for execution
1019 * on a CPU, first time around:
1020 */
1021 if (in_interrupt())
1022 p->sleep_type = SLEEP_INTERRUPTED;
1023 else {
1024 /*
1025 * Normal first-time wakeups get a credit too for
1026 * on-runqueue time, but it will be weighted down:
1027 */
1028 p->sleep_type = SLEEP_INTERACTIVE;
1029 }
1030 }
1031 p->timestamp = now;
1032out:
1033 __activate_task(p, rq);
1034} 927}
1035 928
1036/* 929/*
1037 * deactivate_task - remove a task from the runqueue. 930 * deactivate_task - remove a task from the runqueue.
1038 */ 931 */
1039static void deactivate_task(struct task_struct *p, struct rq *rq) 932static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1040{
1041 dec_nr_running(p, rq);
1042 dequeue_task(p, p->array);
1043 p->array = NULL;
1044}
1045
1046/*
1047 * resched_task - mark a task 'to be rescheduled now'.
1048 *
1049 * On UP this means the setting of the need_resched flag, on SMP it
1050 * might also involve a cross-CPU call to trigger the scheduler on
1051 * the target CPU.
1052 */
1053#ifdef CONFIG_SMP
1054
1055#ifndef tsk_is_polling
1056#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057#endif
1058
1059static void resched_task(struct task_struct *p)
1060{ 933{
1061 int cpu; 934 u64 now = rq_clock(rq);
1062 935
1063 assert_spin_locked(&task_rq(p)->lock); 936 if (p->state == TASK_UNINTERRUPTIBLE)
937 rq->nr_uninterruptible++;
1064 938
1065 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 939 dequeue_task(rq, p, sleep, now);
1066 return; 940 dec_nr_running(p, rq, now);
1067
1068 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1069
1070 cpu = task_cpu(p);
1071 if (cpu == smp_processor_id())
1072 return;
1073
1074 /* NEED_RESCHED must be visible before we test polling */
1075 smp_mb();
1076 if (!tsk_is_polling(p))
1077 smp_send_reschedule(cpu);
1078} 941}
1079 942
1080static void resched_cpu(int cpu)
1081{
1082 struct rq *rq = cpu_rq(cpu);
1083 unsigned long flags;
1084
1085 if (!spin_trylock_irqsave(&rq->lock, flags))
1086 return;
1087 resched_task(cpu_curr(cpu));
1088 spin_unlock_irqrestore(&rq->lock, flags);
1089}
1090#else
1091static inline void resched_task(struct task_struct *p)
1092{
1093 assert_spin_locked(&task_rq(p)->lock);
1094 set_tsk_need_resched(p);
1095}
1096#endif
1097
1098/** 943/**
1099 * task_curr - is this task currently executing on a CPU? 944 * task_curr - is this task currently executing on a CPU?
1100 * @p: the task in question. 945 * @p: the task in question.
@@ -1107,10 +952,42 @@ inline int task_curr(const struct task_struct *p)
1107/* Used instead of source_load when we know the type == 0 */ 952/* Used instead of source_load when we know the type == 0 */
1108unsigned long weighted_cpuload(const int cpu) 953unsigned long weighted_cpuload(const int cpu)
1109{ 954{
1110 return cpu_rq(cpu)->raw_weighted_load; 955 return cpu_rq(cpu)->ls.load.weight;
956}
957
958static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
959{
960#ifdef CONFIG_SMP
961 task_thread_info(p)->cpu = cpu;
962 set_task_cfs_rq(p);
963#endif
1111} 964}
1112 965
1113#ifdef CONFIG_SMP 966#ifdef CONFIG_SMP
967
968void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
969{
970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
973
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
987
988 __set_task_cpu(p, new_cpu);
989}
990
1114struct migration_req { 991struct migration_req {
1115 struct list_head list; 992 struct list_head list;
1116 993
@@ -1133,7 +1010,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1133 * If the task is not on a runqueue (and not running), then 1010 * If the task is not on a runqueue (and not running), then
1134 * it is sufficient to simply update the task's cpu field. 1011 * it is sufficient to simply update the task's cpu field.
1135 */ 1012 */
1136 if (!p->array && !task_running(rq, p)) { 1013 if (!p->se.on_rq && !task_running(rq, p)) {
1137 set_task_cpu(p, dest_cpu); 1014 set_task_cpu(p, dest_cpu);
1138 return 0; 1015 return 0;
1139 } 1016 }
@@ -1158,9 +1035,8 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1158void wait_task_inactive(struct task_struct *p) 1035void wait_task_inactive(struct task_struct *p)
1159{ 1036{
1160 unsigned long flags; 1037 unsigned long flags;
1038 int running, on_rq;
1161 struct rq *rq; 1039 struct rq *rq;
1162 struct prio_array *array;
1163 int running;
1164 1040
1165repeat: 1041repeat:
1166 /* 1042 /*
@@ -1192,7 +1068,7 @@ repeat:
1192 */ 1068 */
1193 rq = task_rq_lock(p, &flags); 1069 rq = task_rq_lock(p, &flags);
1194 running = task_running(rq, p); 1070 running = task_running(rq, p);
1195 array = p->array; 1071 on_rq = p->se.on_rq;
1196 task_rq_unlock(rq, &flags); 1072 task_rq_unlock(rq, &flags);
1197 1073
1198 /* 1074 /*
@@ -1215,7 +1091,7 @@ repeat:
1215 * running right now), it's preempted, and we should 1091 * running right now), it's preempted, and we should
1216 * yield - it could be a while. 1092 * yield - it could be a while.
1217 */ 1093 */
1218 if (unlikely(array)) { 1094 if (unlikely(on_rq)) {
1219 yield(); 1095 yield();
1220 goto repeat; 1096 goto repeat;
1221 } 1097 }
@@ -1261,11 +1137,12 @@ void kick_process(struct task_struct *p)
1261static inline unsigned long source_load(int cpu, int type) 1137static inline unsigned long source_load(int cpu, int type)
1262{ 1138{
1263 struct rq *rq = cpu_rq(cpu); 1139 struct rq *rq = cpu_rq(cpu);
1140 unsigned long total = weighted_cpuload(cpu);
1264 1141
1265 if (type == 0) 1142 if (type == 0)
1266 return rq->raw_weighted_load; 1143 return total;
1267 1144
1268 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1145 return min(rq->cpu_load[type-1], total);
1269} 1146}
1270 1147
1271/* 1148/*
@@ -1275,11 +1152,12 @@ static inline unsigned long source_load(int cpu, int type)
1275static inline unsigned long target_load(int cpu, int type) 1152static inline unsigned long target_load(int cpu, int type)
1276{ 1153{
1277 struct rq *rq = cpu_rq(cpu); 1154 struct rq *rq = cpu_rq(cpu);
1155 unsigned long total = weighted_cpuload(cpu);
1278 1156
1279 if (type == 0) 1157 if (type == 0)
1280 return rq->raw_weighted_load; 1158 return total;
1281 1159
1282 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1160 return max(rq->cpu_load[type-1], total);
1283} 1161}
1284 1162
1285/* 1163/*
@@ -1288,9 +1166,10 @@ static inline unsigned long target_load(int cpu, int type)
1288static inline unsigned long cpu_avg_load_per_task(int cpu) 1166static inline unsigned long cpu_avg_load_per_task(int cpu)
1289{ 1167{
1290 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu);
1291 unsigned long n = rq->nr_running; 1170 unsigned long n = rq->nr_running;
1292 1171
1293 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1172 return n ? total / n : SCHED_LOAD_SCALE;
1294} 1173}
1295 1174
1296/* 1175/*
@@ -1392,9 +1271,9 @@ static int sched_balance_self(int cpu, int flag)
1392 struct sched_domain *tmp, *sd = NULL; 1271 struct sched_domain *tmp, *sd = NULL;
1393 1272
1394 for_each_domain(cpu, tmp) { 1273 for_each_domain(cpu, tmp) {
1395 /* 1274 /*
1396 * If power savings logic is enabled for a domain, stop there. 1275 * If power savings logic is enabled for a domain, stop there.
1397 */ 1276 */
1398 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1399 break; 1278 break;
1400 if (tmp->flags & flag) 1279 if (tmp->flags & flag)
@@ -1477,9 +1356,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1477 if (idle_cpu(i)) 1356 if (idle_cpu(i))
1478 return i; 1357 return i;
1479 } 1358 }
1480 } 1359 } else {
1481 else
1482 break; 1360 break;
1361 }
1483 } 1362 }
1484 return cpu; 1363 return cpu;
1485} 1364}
@@ -1521,7 +1400,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1521 if (!(old_state & state)) 1400 if (!(old_state & state))
1522 goto out; 1401 goto out;
1523 1402
1524 if (p->array) 1403 if (p->se.on_rq)
1525 goto out_running; 1404 goto out_running;
1526 1405
1527 cpu = task_cpu(p); 1406 cpu = task_cpu(p);
@@ -1576,11 +1455,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1576 * of the current CPU: 1455 * of the current CPU:
1577 */ 1456 */
1578 if (sync) 1457 if (sync)
1579 tl -= current->load_weight; 1458 tl -= current->se.load.weight;
1580 1459
1581 if ((tl <= load && 1460 if ((tl <= load &&
1582 tl + target_load(cpu, idx) <= tl_per_task) || 1461 tl + target_load(cpu, idx) <= tl_per_task) ||
1583 100*(tl + p->load_weight) <= imbalance*load) { 1462 100*(tl + p->se.load.weight) <= imbalance*load) {
1584 /* 1463 /*
1585 * This domain has SD_WAKE_AFFINE and 1464 * This domain has SD_WAKE_AFFINE and
1586 * p is cache cold in this domain, and 1465 * p is cache cold in this domain, and
@@ -1614,7 +1493,7 @@ out_set_cpu:
1614 old_state = p->state; 1493 old_state = p->state;
1615 if (!(old_state & state)) 1494 if (!(old_state & state))
1616 goto out; 1495 goto out;
1617 if (p->array) 1496 if (p->se.on_rq)
1618 goto out_running; 1497 goto out_running;
1619 1498
1620 this_cpu = smp_processor_id(); 1499 this_cpu = smp_processor_id();
@@ -1623,25 +1502,7 @@ out_set_cpu:
1623 1502
1624out_activate: 1503out_activate:
1625#endif /* CONFIG_SMP */ 1504#endif /* CONFIG_SMP */
1626 if (old_state == TASK_UNINTERRUPTIBLE) { 1505 activate_task(rq, p, 1);
1627 rq->nr_uninterruptible--;
1628 /*
1629 * Tasks on involuntary sleep don't earn
1630 * sleep_avg beyond just interactive state.
1631 */
1632 p->sleep_type = SLEEP_NONINTERACTIVE;
1633 } else
1634
1635 /*
1636 * Tasks that have marked their sleep as noninteractive get
1637 * woken up with their sleep average not weighted in an
1638 * interactive way.
1639 */
1640 if (old_state & TASK_NONINTERACTIVE)
1641 p->sleep_type = SLEEP_NONINTERACTIVE;
1642
1643
1644 activate_task(p, rq, cpu == this_cpu);
1645 /* 1506 /*
1646 * Sync wakeups (i.e. those types of wakeups where the waker 1507 * Sync wakeups (i.e. those types of wakeups where the waker
1647 * has indicated that it will leave the CPU in short order) 1508 * has indicated that it will leave the CPU in short order)
@@ -1650,10 +1511,8 @@ out_activate:
1650 * the waker guarantees that the freshly woken up task is going 1511 * the waker guarantees that the freshly woken up task is going
1651 * to be considered on this CPU.) 1512 * to be considered on this CPU.)
1652 */ 1513 */
1653 if (!sync || cpu != this_cpu) { 1514 if (!sync || cpu != this_cpu)
1654 if (TASK_PREEMPTS_CURR(p, rq)) 1515 check_preempt_curr(rq, p);
1655 resched_task(rq->curr);
1656 }
1657 success = 1; 1516 success = 1;
1658 1517
1659out_running: 1518out_running:
@@ -1676,19 +1535,36 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1676 return try_to_wake_up(p, state, 0); 1535 return try_to_wake_up(p, state, 0);
1677} 1536}
1678 1537
1679static void task_running_tick(struct rq *rq, struct task_struct *p);
1680/* 1538/*
1681 * Perform scheduler related setup for a newly forked process p. 1539 * Perform scheduler related setup for a newly forked process p.
1682 * p is forked by current. 1540 * p is forked by current.
1683 */ 1541 *
1684void fastcall sched_fork(struct task_struct *p, int clone_flags) 1542 * __sched_fork() is basic setup used by init_idle() too:
1685{ 1543 */
1686 int cpu = get_cpu(); 1544static void __sched_fork(struct task_struct *p)
1545{
1546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
1687 1565
1688#ifdef CONFIG_SMP 1566 INIT_LIST_HEAD(&p->run_list);
1689 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1567 p->se.on_rq = 0;
1690#endif
1691 set_task_cpu(p, cpu);
1692 1568
1693 /* 1569 /*
1694 * We mark the process as running here, but have not actually 1570 * We mark the process as running here, but have not actually
@@ -1697,16 +1573,29 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1697 * event cannot wake it up and insert it on the runqueue either. 1573 * event cannot wake it up and insert it on the runqueue either.
1698 */ 1574 */
1699 p->state = TASK_RUNNING; 1575 p->state = TASK_RUNNING;
1576}
1577
1578/*
1579 * fork()/clone()-time setup:
1580 */
1581void sched_fork(struct task_struct *p, int clone_flags)
1582{
1583 int cpu = get_cpu();
1584
1585 __sched_fork(p);
1586
1587#ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589#endif
1590 __set_task_cpu(p, cpu);
1700 1591
1701 /* 1592 /*
1702 * Make sure we do not leak PI boosting priority to the child: 1593 * Make sure we do not leak PI boosting priority to the child:
1703 */ 1594 */
1704 p->prio = current->normal_prio; 1595 p->prio = current->normal_prio;
1705 1596
1706 INIT_LIST_HEAD(&p->run_list);
1707 p->array = NULL;
1708#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1709 if (unlikely(sched_info_on())) 1598 if (likely(sched_info_on()))
1710 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1599 memset(&p->sched_info, 0, sizeof(p->sched_info));
1711#endif 1600#endif
1712#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1601#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -1716,34 +1605,16 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1716 /* Want to start with kernel preemption disabled. */ 1605 /* Want to start with kernel preemption disabled. */
1717 task_thread_info(p)->preempt_count = 1; 1606 task_thread_info(p)->preempt_count = 1;
1718#endif 1607#endif
1719 /*
1720 * Share the timeslice between parent and child, thus the
1721 * total amount of pending timeslices in the system doesn't change,
1722 * resulting in more scheduling fairness.
1723 */
1724 local_irq_disable();
1725 p->time_slice = (current->time_slice + 1) >> 1;
1726 /*
1727 * The remainder of the first timeslice might be recovered by
1728 * the parent if the child exits early enough.
1729 */
1730 p->first_time_slice = 1;
1731 current->time_slice >>= 1;
1732 p->timestamp = sched_clock();
1733 if (unlikely(!current->time_slice)) {
1734 /*
1735 * This case is rare, it happens when the parent has only
1736 * a single jiffy left from its timeslice. Taking the
1737 * runqueue lock is not a problem.
1738 */
1739 current->time_slice = 1;
1740 task_running_tick(cpu_rq(cpu), current);
1741 }
1742 local_irq_enable();
1743 put_cpu(); 1608 put_cpu();
1744} 1609}
1745 1610
1746/* 1611/*
1612 * After fork, child runs first. (default) If set to 0 then
1613 * parent will (try to) run first.
1614 */
1615unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617/*
1747 * wake_up_new_task - wake up a newly created task for the first time. 1618 * wake_up_new_task - wake up a newly created task for the first time.
1748 * 1619 *
1749 * This function will do some initial scheduler statistics housekeeping 1620 * This function will do some initial scheduler statistics housekeeping
@@ -1752,107 +1623,27 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1752 */ 1623 */
1753void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1624void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1754{ 1625{
1755 struct rq *rq, *this_rq;
1756 unsigned long flags; 1626 unsigned long flags;
1757 int this_cpu, cpu; 1627 struct rq *rq;
1628 int this_cpu;
1758 1629
1759 rq = task_rq_lock(p, &flags); 1630 rq = task_rq_lock(p, &flags);
1760 BUG_ON(p->state != TASK_RUNNING); 1631 BUG_ON(p->state != TASK_RUNNING);
1761 this_cpu = smp_processor_id(); 1632 this_cpu = smp_processor_id(); /* parent's CPU */
1762 cpu = task_cpu(p);
1763
1764 /*
1765 * We decrease the sleep average of forking parents
1766 * and children as well, to keep max-interactive tasks
1767 * from forking tasks that are max-interactive. The parent
1768 * (current) is done further down, under its lock.
1769 */
1770 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1771 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1772 1633
1773 p->prio = effective_prio(p); 1634 p->prio = effective_prio(p);
1774 1635
1775 if (likely(cpu == this_cpu)) { 1636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1776 if (!(clone_flags & CLONE_VM)) { 1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1777 /* 1638 activate_task(rq, p, 0);
1778 * The VM isn't cloned, so we're in a good position to
1779 * do child-runs-first in anticipation of an exec. This
1780 * usually avoids a lot of COW overhead.
1781 */
1782 if (unlikely(!current->array))
1783 __activate_task(p, rq);
1784 else {
1785 p->prio = current->prio;
1786 p->normal_prio = current->normal_prio;
1787 list_add_tail(&p->run_list, &current->run_list);
1788 p->array = current->array;
1789 p->array->nr_active++;
1790 inc_nr_running(p, rq);
1791 }
1792 set_need_resched();
1793 } else
1794 /* Run child last */
1795 __activate_task(p, rq);
1796 /*
1797 * We skip the following code due to cpu == this_cpu
1798 *
1799 * task_rq_unlock(rq, &flags);
1800 * this_rq = task_rq_lock(current, &flags);
1801 */
1802 this_rq = rq;
1803 } else { 1639 } else {
1804 this_rq = cpu_rq(this_cpu);
1805
1806 /* 1640 /*
1807 * Not the local CPU - must adjust timestamp. This should 1641 * Let the scheduling class do new task startup
1808 * get optimised away in the !CONFIG_SMP case. 1642 * management (if any):
1809 */ 1643 */
1810 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1644 p->sched_class->task_new(rq, p);
1811 + rq->most_recent_timestamp;
1812 __activate_task(p, rq);
1813 if (TASK_PREEMPTS_CURR(p, rq))
1814 resched_task(rq->curr);
1815
1816 /*
1817 * Parent and child are on different CPUs, now get the
1818 * parent runqueue to update the parent's ->sleep_avg:
1819 */
1820 task_rq_unlock(rq, &flags);
1821 this_rq = task_rq_lock(current, &flags);
1822 }
1823 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1824 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1825 task_rq_unlock(this_rq, &flags);
1826}
1827
1828/*
1829 * Potentially available exiting-child timeslices are
1830 * retrieved here - this way the parent does not get
1831 * penalized for creating too many threads.
1832 *
1833 * (this cannot be used to 'generate' timeslices
1834 * artificially, because any timeslice recovered here
1835 * was given away by the parent in the first place.)
1836 */
1837void fastcall sched_exit(struct task_struct *p)
1838{
1839 unsigned long flags;
1840 struct rq *rq;
1841
1842 /*
1843 * If the child was a (relative-) CPU hog then decrease
1844 * the sleep_avg of the parent as well.
1845 */
1846 rq = task_rq_lock(p->parent, &flags);
1847 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1848 p->parent->time_slice += p->time_slice;
1849 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1850 p->parent->time_slice = task_timeslice(p);
1851 } 1645 }
1852 if (p->sleep_avg < p->parent->sleep_avg) 1646 check_preempt_curr(rq, p);
1853 p->parent->sleep_avg = p->parent->sleep_avg /
1854 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1855 (EXIT_WEIGHT + 1);
1856 task_rq_unlock(rq, &flags); 1647 task_rq_unlock(rq, &flags);
1857} 1648}
1858 1649
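The rewritten wake_up_new_task() above reduces to one decision: either enqueue the child like any other task with activate_task(), or hand it to the class's task_new() hook (which, for CFS, can place the child ahead of the parent). A minimal sketch of that predicate follows; it is illustrative only, and the parameter names are invented, not kernel identifiers.

	/*
	 * Hedged sketch, not kernel code: returns non-zero when the child
	 * should be enqueued via plain activate_task() rather than handed
	 * to the scheduling class's task_new() hook.
	 */
	static int plain_activate_child(int child_runs_first, int shares_vm,
					int same_cpu, int parent_on_rq)
	{
		return !child_runs_first || shares_vm ||
		       !same_cpu || !parent_on_rq;
	}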
@@ -1917,7 +1708,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1917 /* 1708 /*
1918 * Remove function-return probe instances associated with this 1709 * Remove function-return probe instances associated with this
1919 * task and put them back on the free list. 1710 * task and put them back on the free list.
1920 */ 1711 */
1921 kprobe_flush_task(prev); 1712 kprobe_flush_task(prev);
1922 put_task_struct(prev); 1713 put_task_struct(prev);
1923 } 1714 }
@@ -1945,13 +1736,15 @@ asmlinkage void schedule_tail(struct task_struct *prev)
1945 * context_switch - switch to the new MM and the new 1736 * context_switch - switch to the new MM and the new
1946 * thread's register state. 1737 * thread's register state.
1947 */ 1738 */
1948static inline struct task_struct * 1739static inline void
1949context_switch(struct rq *rq, struct task_struct *prev, 1740context_switch(struct rq *rq, struct task_struct *prev,
1950 struct task_struct *next) 1741 struct task_struct *next)
1951{ 1742{
1952 struct mm_struct *mm = next->mm; 1743 struct mm_struct *mm, *oldmm;
1953 struct mm_struct *oldmm = prev->active_mm;
1954 1744
1745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
1955 /* 1748 /*
1956 * For paravirt, this is coupled with an exit in switch_to to 1749 * For paravirt, this is coupled with an exit in switch_to to
1957 * combine the page table reload and the switch backend into 1750 * combine the page table reload and the switch backend into
@@ -1959,16 +1752,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1959 */ 1752 */
1960 arch_enter_lazy_cpu_mode(); 1753 arch_enter_lazy_cpu_mode();
1961 1754
1962 if (!mm) { 1755 if (unlikely(!mm)) {
1963 next->active_mm = oldmm; 1756 next->active_mm = oldmm;
1964 atomic_inc(&oldmm->mm_count); 1757 atomic_inc(&oldmm->mm_count);
1965 enter_lazy_tlb(oldmm, next); 1758 enter_lazy_tlb(oldmm, next);
1966 } else 1759 } else
1967 switch_mm(oldmm, mm, next); 1760 switch_mm(oldmm, mm, next);
1968 1761
1969 if (!prev->mm) { 1762 if (unlikely(!prev->mm)) {
1970 prev->active_mm = NULL; 1763 prev->active_mm = NULL;
1971 WARN_ON(rq->prev_mm);
1972 rq->prev_mm = oldmm; 1764 rq->prev_mm = oldmm;
1973 } 1765 }
1974 /* 1766 /*
@@ -1984,7 +1776,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
1984 /* Here we just switch the register state and the stack. */ 1776 /* Here we just switch the register state and the stack. */
1985 switch_to(prev, next, prev); 1777 switch_to(prev, next, prev);
1986 1778
1987 return prev; 1779 barrier();
1780 /*
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1784 */
1785 finish_task_switch(this_rq(), prev);
1988} 1786}
1989 1787
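The context_switch() hunk above keeps the long-standing "lazy mm" trick: a kernel thread (mm == NULL) borrows the previous task's address space by referencing active_mm instead of switching page tables, and the borrowed mm is dropped later in finish_task_switch(). The sketch below shows only that borrow/return pattern, with made-up types standing in for struct mm_struct and struct task_struct; it is an assumption-laden illustration, not the kernel's code.

	#include <stddef.h>

	struct demo_mm   { int mm_count; };
	struct demo_task { struct demo_mm *mm, *active_mm; };

	/* Sketch of the lazy-mm borrow performed at context switch time. */
	static void demo_switch_mm_context(struct demo_task *prev,
					   struct demo_task *next)
	{
		struct demo_mm *oldmm = prev->active_mm;

		if (!next->mm) {		/* kernel thread: borrow */
			next->active_mm = oldmm;
			oldmm->mm_count++;	/* stands in for atomic_inc() */
		} else {
			next->active_mm = next->mm;
			/* a real switch_mm(oldmm, next->mm, next) goes here */
		}

		if (!prev->mm)			/* prev was a borrower */
			prev->active_mm = NULL;	/* actual drop is deferred */
	}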
1990/* 1788/*
@@ -2057,17 +1855,65 @@ unsigned long nr_active(void)
2057 return running + uninterruptible; 1855 return running + uninterruptible;
2058} 1856}
2059 1857
2060#ifdef CONFIG_SMP
2061
2062/* 1858/*
2063 * Is this task likely cache-hot: 1859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
2064 */ 1861 */
2065static inline int 1862static void update_cpu_load(struct rq *this_rq)
2066task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2067{ 1863{
2068 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1870
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1874
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1877
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1880
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1883
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1886
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1889
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1892
1893 idle_delta64 = sample_interval64 - exec_delta64;
1894
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898 this_load = (unsigned long)tmp64;
1899
1900do_avg:
1901
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1905
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1910
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 }
2069} 1913}
2070 1914
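The loop at the end of update_cpu_load() above maintains CPU_LOAD_IDX_MAX exponentially decaying averages of the runqueue load: index i blends the new sample with weight 1/2^i, so higher indices track progressively longer-term load. The user-space sketch below only demonstrates that decay; the array size of 5 and the constant 1024 sample are assumed values chosen for illustration, not taken from this file.

	#include <stdio.h>

	#define DEMO_LOAD_IDX_MAX 5	/* assumed value, for illustration */

	/* Same update rule as the loop above: index i blends the new
	 * sample in with weight 1/2^i, so cpu_load[0] is the raw sample
	 * and higher indices react more slowly. */
	static void decay_cpu_load(unsigned long cpu_load[], unsigned long sample)
	{
		unsigned long scale;
		int i;

		for (i = 0, scale = 1; i < DEMO_LOAD_IDX_MAX; i++, scale += scale)
			cpu_load[i] = (cpu_load[i] * (scale - 1) + sample) >> i;
	}

	int main(void)
	{
		unsigned long load[DEMO_LOAD_IDX_MAX] = { 0 };
		int tick;

		for (tick = 0; tick < 4; tick++) {
			decay_cpu_load(load, 1024);	/* constant load sample */
			printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
			       load[0], load[1], load[2], load[3], load[4]);
		}
		return 0;
	}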
1915#ifdef CONFIG_SMP
1916
2071/* 1917/*
2072 * double_rq_lock - safely lock two runqueues 1918 * double_rq_lock - safely lock two runqueues
2073 * 1919 *
@@ -2184,23 +2030,17 @@ void sched_exec(void)
2184 * pull_task - move a task from a remote runqueue to the local runqueue. 2030 * pull_task - move a task from a remote runqueue to the local runqueue.
2185 * Both runqueues must be locked. 2031 * Both runqueues must be locked.
2186 */ 2032 */
2187static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2033static void pull_task(struct rq *src_rq, struct task_struct *p,
2188 struct task_struct *p, struct rq *this_rq, 2034 struct rq *this_rq, int this_cpu)
2189 struct prio_array *this_array, int this_cpu)
2190{ 2035{
2191 dequeue_task(p, src_array); 2036 deactivate_task(src_rq, p, 0);
2192 dec_nr_running(p, src_rq);
2193 set_task_cpu(p, this_cpu); 2037 set_task_cpu(p, this_cpu);
2194 inc_nr_running(p, this_rq); 2038 activate_task(this_rq, p, 0);
2195 enqueue_task(p, this_array);
2196 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2197 + this_rq->most_recent_timestamp;
2198 /* 2039 /*
2199 * Note that idle threads have a prio of MAX_PRIO, for this test 2040 * Note that idle threads have a prio of MAX_PRIO, for this test
2200 * to be always true for them. 2041 * to be always true for them.
2201 */ 2042 */
2202 if (TASK_PREEMPTS_CURR(p, this_rq)) 2043 check_preempt_curr(this_rq, p);
2203 resched_task(this_rq->curr);
2204} 2044}
2205 2045
2206/* 2046/*
@@ -2208,7 +2048,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2208 */ 2048 */
2209static 2049static
2210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2050int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2211 struct sched_domain *sd, enum idle_type idle, 2051 struct sched_domain *sd, enum cpu_idle_type idle,
2212 int *all_pinned) 2052 int *all_pinned)
2213{ 2053{
2214 /* 2054 /*
@@ -2225,132 +2065,67 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2225 return 0; 2065 return 0;
2226 2066
2227 /* 2067 /*
2228 * Aggressive migration if: 2068 * Aggressive migration if too many balance attempts have failed:
2229 * 1) task is cache cold, or
2230 * 2) too many balance attempts have failed.
2231 */ 2069 */
2232 2070 if (sd->nr_balance_failed > sd->cache_nice_tries)
2233 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2234#ifdef CONFIG_SCHEDSTATS
2235 if (task_hot(p, rq->most_recent_timestamp, sd))
2236 schedstat_inc(sd, lb_hot_gained[idle]);
2237#endif
2238 return 1; 2071 return 1;
2239 }
2240 2072
2241 if (task_hot(p, rq->most_recent_timestamp, sd))
2242 return 0;
2243 return 1; 2073 return 1;
2244} 2074}
2245 2075
2246#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2247
2248/*
2249 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2250 * load from busiest to this_rq, as part of a balancing operation within
2251 * "domain". Returns the number of tasks moved.
2252 *
2253 * Called with both runqueues locked.
2254 */
2255static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2256 unsigned long max_nr_move, unsigned long max_load_move, 2077 unsigned long max_nr_move, unsigned long max_load_move,
2257 struct sched_domain *sd, enum idle_type idle, 2078 struct sched_domain *sd, enum cpu_idle_type idle,
2258 int *all_pinned) 2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2259{ 2082{
2260 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2083 int pulled = 0, pinned = 0, skip_for_load;
2261 best_prio_seen, skip_for_load; 2084 struct task_struct *p;
2262 struct prio_array *array, *dst_array; 2085 long rem_load_move = max_load_move;
2263 struct list_head *head, *curr;
2264 struct task_struct *tmp;
2265 long rem_load_move;
2266 2086
2267 if (max_nr_move == 0 || max_load_move == 0) 2087 if (max_nr_move == 0 || max_load_move == 0)
2268 goto out; 2088 goto out;
2269 2089
2270 rem_load_move = max_load_move;
2271 pinned = 1; 2090 pinned = 1;
2272 this_best_prio = rq_best_prio(this_rq);
2273 best_prio = rq_best_prio(busiest);
2274 /*
2275 * Enable handling of the case where there is more than one task
2276 * with the best priority. If the current running task is one
2277 * of those with prio==best_prio we know it won't be moved
2278 * and therefore it's safe to override the skip (based on load) of
2279 * any task we find with that prio.
2280 */
2281 best_prio_seen = best_prio == busiest->curr->prio;
2282 2091
2283 /* 2092 /*
2284 * We first consider expired tasks. Those will likely not be 2093 * Start the load-balancing iterator:
2285 * executed in the near future, and they are most likely to
2286 * be cache-cold, thus switching CPUs has the least effect
2287 * on them.
2288 */ 2094 */
2289 if (busiest->expired->nr_active) { 2095 p = iterator->start(iterator->arg);
2290 array = busiest->expired; 2096next:
2291 dst_array = this_rq->expired; 2097 if (!p)
2292 } else {
2293 array = busiest->active;
2294 dst_array = this_rq->active;
2295 }
2296
2297new_array:
2298 /* Start searching at priority 0: */
2299 idx = 0;
2300skip_bitmap:
2301 if (!idx)
2302 idx = sched_find_first_bit(array->bitmap);
2303 else
2304 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2305 if (idx >= MAX_PRIO) {
2306 if (array == busiest->expired && busiest->active->nr_active) {
2307 array = busiest->active;
2308 dst_array = this_rq->active;
2309 goto new_array;
2310 }
2311 goto out; 2098 goto out;
2312 }
2313
2314 head = array->queue + idx;
2315 curr = head->prev;
2316skip_queue:
2317 tmp = list_entry(curr, struct task_struct, run_list);
2318
2319 curr = curr->prev;
2320
2321 /* 2099 /*
2322 * To help distribute high priority tasks across CPUs we don't 2100 * To help distribute high priority tasks across CPUs we don't
2323 * skip a task if it will be the highest priority task (i.e. smallest 2101 * skip a task if it will be the highest priority task (i.e. smallest
2324 * prio value) on its new queue regardless of its load weight 2102 * prio value) on its new queue regardless of its load weight
2325 */ 2103 */
2326 skip_for_load = tmp->load_weight > rem_load_move; 2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2327 if (skip_for_load && idx < this_best_prio) 2105 SCHED_LOAD_SCALE_FUZZ;
2328 skip_for_load = !best_prio_seen && idx == best_prio; 2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2329 if (skip_for_load || 2108 if (skip_for_load ||
2330 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2331 2110
2332 best_prio_seen |= idx == best_prio; 2111 best_prio_seen |= p->prio == best_prio;
2333 if (curr != head) 2112 p = iterator->next(iterator->arg);
2334 goto skip_queue; 2113 goto next;
2335 idx++;
2336 goto skip_bitmap;
2337 } 2114 }
2338 2115
2339 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2116 pull_task(busiest, p, this_rq, this_cpu);
2340 pulled++; 2117 pulled++;
2341 rem_load_move -= tmp->load_weight; 2118 rem_load_move -= p->se.load.weight;
2342 2119
2343 /* 2120 /*
2344 * We only want to steal up to the prescribed number of tasks 2121 * We only want to steal up to the prescribed number of tasks
2345 * and the prescribed amount of weighted load. 2122 * and the prescribed amount of weighted load.
2346 */ 2123 */
2347 if (pulled < max_nr_move && rem_load_move > 0) { 2124 if (pulled < max_nr_move && rem_load_move > 0) {
2348 if (idx < this_best_prio) 2125 if (p->prio < this_best_prio)
2349 this_best_prio = idx; 2126 this_best_prio = p->prio;
2350 if (curr != head) 2127 p = iterator->next(iterator->arg);
2351 goto skip_queue; 2128 goto next;
2352 idx++;
2353 goto skip_bitmap;
2354 } 2129 }
2355out: 2130out:
2356 /* 2131 /*
@@ -2362,18 +2137,48 @@ out:
2362 2137
2363 if (all_pinned) 2138 if (all_pinned)
2364 *all_pinned = pinned; 2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2365 return pulled; 2141 return pulled;
2366} 2142}
2367 2143
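balance_tasks() above consumes candidates from a class-provided rq_iterator (start(), then next() until NULL) instead of scanning priority arrays directly. The sketch below shows only that iterator protocol over a plain array; the struct and helpers are invented for illustration and make no claim about the real iterator's fields.

	#include <stddef.h>

	/* Hedged sketch of the start()/next() cursor idea. */
	struct demo_iter {
		int *tasks;
		int nr, pos;
	};

	static int *demo_start(struct demo_iter *it)
	{
		it->pos = 0;
		return it->nr ? &it->tasks[0] : NULL;
	}

	static int *demo_next(struct demo_iter *it)
	{
		if (++it->pos >= it->nr)
			return NULL;
		return &it->tasks[it->pos];
	}

	/* A consumer walks it the way balance_tasks() does:
	 *	for (p = demo_start(it); p; p = demo_next(it))
	 *		...decide whether to migrate p...
	 */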
2368/* 2144/*
2145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
2148 *
2149 * Called with both runqueues locked.
2150 */
2151static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2152 unsigned long max_nr_move, unsigned long max_load_move,
2153 struct sched_domain *sd, enum cpu_idle_type idle,
2154 int *all_pinned)
2155{
2156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
2159
2160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
2169
2170 return total_nr_moved;
2171}
2172
2173/*
2369 * find_busiest_group finds and returns the busiest CPU group within the 2174 * find_busiest_group finds and returns the busiest CPU group within the
2370 * domain. It calculates and returns the amount of weighted load which 2175 * domain. It calculates and returns the amount of weighted load which
2371 * should be moved to restore balance via the imbalance parameter. 2176 * should be moved to restore balance via the imbalance parameter.
2372 */ 2177 */
2373static struct sched_group * 2178static struct sched_group *
2374find_busiest_group(struct sched_domain *sd, int this_cpu, 2179find_busiest_group(struct sched_domain *sd, int this_cpu,
2375 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2180 unsigned long *imbalance, enum cpu_idle_type idle,
2376 cpumask_t *cpus, int *balance) 2181 int *sd_idle, cpumask_t *cpus, int *balance)
2377{ 2182{
2378 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2379 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2391,9 +2196,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2391 max_load = this_load = total_load = total_pwr = 0; 2196 max_load = this_load = total_load = total_pwr = 0;
2392 busiest_load_per_task = busiest_nr_running = 0; 2197 busiest_load_per_task = busiest_nr_running = 0;
2393 this_load_per_task = this_nr_running = 0; 2198 this_load_per_task = this_nr_running = 0;
2394 if (idle == NOT_IDLE) 2199 if (idle == CPU_NOT_IDLE)
2395 load_idx = sd->busy_idx; 2200 load_idx = sd->busy_idx;
2396 else if (idle == NEWLY_IDLE) 2201 else if (idle == CPU_NEWLY_IDLE)
2397 load_idx = sd->newidle_idx; 2202 load_idx = sd->newidle_idx;
2398 else 2203 else
2399 load_idx = sd->idle_idx; 2204 load_idx = sd->idle_idx;
@@ -2437,7 +2242,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2437 2242
2438 avg_load += load; 2243 avg_load += load;
2439 sum_nr_running += rq->nr_running; 2244 sum_nr_running += rq->nr_running;
2440 sum_weighted_load += rq->raw_weighted_load; 2245 sum_weighted_load += weighted_cpuload(i);
2441 } 2246 }
2442 2247
2443 /* 2248 /*
@@ -2477,8 +2282,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2477 * Busy processors will not participate in power savings 2282 * Busy processors will not participate in power savings
2478 * balance. 2283 * balance.
2479 */ 2284 */
2480 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2285 if (idle == CPU_NOT_IDLE ||
2481 goto group_next; 2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
2482 2288
2483 /* 2289 /*
2484 * If the local group is idle or completely loaded 2290 * If the local group is idle or completely loaded
@@ -2488,42 +2294,42 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2488 !this_nr_running)) 2294 !this_nr_running))
2489 power_savings_balance = 0; 2295 power_savings_balance = 0;
2490 2296
2491 /* 2297 /*
2492 * If a group is already running at full capacity or idle, 2298 * If a group is already running at full capacity or idle,
2493 * don't include that group in power savings calculations 2299 * don't include that group in power savings calculations
2494 */ 2300 */
2495 if (!power_savings_balance || sum_nr_running >= group_capacity 2301 if (!power_savings_balance || sum_nr_running >= group_capacity
2496 || !sum_nr_running) 2302 || !sum_nr_running)
2497 goto group_next; 2303 goto group_next;
2498 2304
2499 /* 2305 /*
2500 * Calculate the group which has the least non-idle load. 2306 * Calculate the group which has the least non-idle load.
2501 * This is the group from where we need to pick up the load 2307 * This is the group from where we need to pick up the load
2502 * for saving power 2308 * for saving power
2503 */ 2309 */
2504 if ((sum_nr_running < min_nr_running) || 2310 if ((sum_nr_running < min_nr_running) ||
2505 (sum_nr_running == min_nr_running && 2311 (sum_nr_running == min_nr_running &&
2506 first_cpu(group->cpumask) < 2312 first_cpu(group->cpumask) <
2507 first_cpu(group_min->cpumask))) { 2313 first_cpu(group_min->cpumask))) {
2508 group_min = group; 2314 group_min = group;
2509 min_nr_running = sum_nr_running; 2315 min_nr_running = sum_nr_running;
2510 min_load_per_task = sum_weighted_load / 2316 min_load_per_task = sum_weighted_load /
2511 sum_nr_running; 2317 sum_nr_running;
2512 } 2318 }
2513 2319
2514 /* 2320 /*
2515 * Calculate the group which is almost near its 2321 * Calculate the group which is almost near its
2516 * capacity but still has some space to pick up some load 2322 * capacity but still has some space to pick up some load
2517 * from other group and save more power 2323 * from other group and save more power
2518 */ 2324 */
2519 if (sum_nr_running <= group_capacity - 1) { 2325 if (sum_nr_running <= group_capacity - 1) {
2520 if (sum_nr_running > leader_nr_running || 2326 if (sum_nr_running > leader_nr_running ||
2521 (sum_nr_running == leader_nr_running && 2327 (sum_nr_running == leader_nr_running &&
2522 first_cpu(group->cpumask) > 2328 first_cpu(group->cpumask) >
2523 first_cpu(group_leader->cpumask))) { 2329 first_cpu(group_leader->cpumask))) {
2524 group_leader = group; 2330 group_leader = group;
2525 leader_nr_running = sum_nr_running; 2331 leader_nr_running = sum_nr_running;
2526 } 2332 }
2527 } 2333 }
2528group_next: 2334group_next:
2529#endif 2335#endif
@@ -2578,7 +2384,7 @@ group_next:
2578 * a think about bumping its value to force at least one task to be 2384 * a think about bumping its value to force at least one task to be
2579 * moved 2385 * moved
2580 */ 2386 */
2581 if (*imbalance < busiest_load_per_task) { 2387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2582 unsigned long tmp, pwr_now, pwr_move; 2388 unsigned long tmp, pwr_now, pwr_move;
2583 unsigned int imbn; 2389 unsigned int imbn;
2584 2390
@@ -2592,7 +2398,8 @@ small_imbalance:
2592 } else 2398 } else
2593 this_load_per_task = SCHED_LOAD_SCALE; 2399 this_load_per_task = SCHED_LOAD_SCALE;
2594 2400
2595 if (max_load - this_load >= busiest_load_per_task * imbn) { 2401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
2596 *imbalance = busiest_load_per_task; 2403 *imbalance = busiest_load_per_task;
2597 return busiest; 2404 return busiest;
2598 } 2405 }
@@ -2639,7 +2446,7 @@ small_imbalance:
2639 2446
2640out_balanced: 2447out_balanced:
2641#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2448#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2642 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2643 goto ret; 2450 goto ret;
2644 2451
2645 if (this == group_leader && group_leader != group_min) { 2452 if (this == group_leader && group_leader != group_min) {
@@ -2656,7 +2463,7 @@ ret:
2656 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2657 */ 2464 */
2658static struct rq * 2465static struct rq *
2659find_busiest_queue(struct sched_group *group, enum idle_type idle, 2466find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2660 unsigned long imbalance, cpumask_t *cpus) 2467 unsigned long imbalance, cpumask_t *cpus)
2661{ 2468{
2662 struct rq *busiest = NULL, *rq; 2469 struct rq *busiest = NULL, *rq;
@@ -2664,17 +2471,19 @@ find_busiest_queue(struct sched_group *group, enum idle_type idle,
2664 int i; 2471 int i;
2665 2472
2666 for_each_cpu_mask(i, group->cpumask) { 2473 for_each_cpu_mask(i, group->cpumask) {
2474 unsigned long wl;
2667 2475
2668 if (!cpu_isset(i, *cpus)) 2476 if (!cpu_isset(i, *cpus))
2669 continue; 2477 continue;
2670 2478
2671 rq = cpu_rq(i); 2479 rq = cpu_rq(i);
2480 wl = weighted_cpuload(i);
2672 2481
2673 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2482 if (rq->nr_running == 1 && wl > imbalance)
2674 continue; 2483 continue;
2675 2484
2676 if (rq->raw_weighted_load > max_load) { 2485 if (wl > max_load) {
2677 max_load = rq->raw_weighted_load; 2486 max_load = wl;
2678 busiest = rq; 2487 busiest = rq;
2679 } 2488 }
2680 } 2489 }
@@ -2698,7 +2507,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2698 * tasks if there is an imbalance. 2507 * tasks if there is an imbalance.
2699 */ 2508 */
2700static int load_balance(int this_cpu, struct rq *this_rq, 2509static int load_balance(int this_cpu, struct rq *this_rq,
2701 struct sched_domain *sd, enum idle_type idle, 2510 struct sched_domain *sd, enum cpu_idle_type idle,
2702 int *balance) 2511 int *balance)
2703{ 2512{
2704 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
@@ -2711,10 +2520,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2711 /* 2520 /*
2712 * When power savings policy is enabled for the parent domain, idle 2521 * When power savings policy is enabled for the parent domain, idle
2713 * sibling can pick up load irrespective of busy siblings. In this case, 2522 * sibling can pick up load irrespective of busy siblings. In this case,
2714 * let the state of idle sibling percolate up as IDLE, instead of 2523 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2715 * portraying it as NOT_IDLE. 2524 * portraying it as CPU_NOT_IDLE.
2716 */ 2525 */
2717 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2718 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2719 sd_idle = 1; 2528 sd_idle = 1;
2720 2529
@@ -2848,7 +2657,7 @@ out_one_pinned:
2848 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2849 * tasks if there is an imbalance. 2658 * tasks if there is an imbalance.
2850 * 2659 *
2851 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2852 * this_rq is locked. 2661 * this_rq is locked.
2853 */ 2662 */
2854static int 2663static int
@@ -2865,31 +2674,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2865 * When power savings policy is enabled for the parent domain, idle 2674 * When power savings policy is enabled for the parent domain, idle
2866 * sibling can pick up load irrespective of busy siblings. In this case, 2675 * sibling can pick up load irrespective of busy siblings. In this case,
2867 * let the state of idle sibling percolate up as IDLE, instead of 2676 * let the state of idle sibling percolate up as IDLE, instead of
2868 * portraying it as NOT_IDLE. 2677 * portraying it as CPU_NOT_IDLE.
2869 */ 2678 */
2870 if (sd->flags & SD_SHARE_CPUPOWER && 2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2871 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2872 sd_idle = 1; 2681 sd_idle = 1;
2873 2682
2874 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2875redo: 2684redo:
2876 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2877 &sd_idle, &cpus, NULL); 2686 &sd_idle, &cpus, NULL);
2878 if (!group) { 2687 if (!group) {
2879 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2880 goto out_balanced; 2689 goto out_balanced;
2881 } 2690 }
2882 2691
2883 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2884 &cpus); 2693 &cpus);
2885 if (!busiest) { 2694 if (!busiest) {
2886 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2887 goto out_balanced; 2696 goto out_balanced;
2888 } 2697 }
2889 2698
2890 BUG_ON(busiest == this_rq); 2699 BUG_ON(busiest == this_rq);
2891 2700
2892 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2893 2702
2894 nr_moved = 0; 2703 nr_moved = 0;
2895 if (busiest->nr_running > 1) { 2704 if (busiest->nr_running > 1) {
@@ -2897,7 +2706,7 @@ redo:
2897 double_lock_balance(this_rq, busiest); 2706 double_lock_balance(this_rq, busiest);
2898 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2899 minus_1_or_zero(busiest->nr_running), 2708 minus_1_or_zero(busiest->nr_running),
2900 imbalance, sd, NEWLY_IDLE, NULL); 2709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2901 spin_unlock(&busiest->lock); 2710 spin_unlock(&busiest->lock);
2902 2711
2903 if (!nr_moved) { 2712 if (!nr_moved) {
@@ -2908,7 +2717,7 @@ redo:
2908 } 2717 }
2909 2718
2910 if (!nr_moved) { 2719 if (!nr_moved) {
2911 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2912 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2913 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2914 return -1; 2723 return -1;
@@ -2918,7 +2727,7 @@ redo:
2918 return nr_moved; 2727 return nr_moved;
2919 2728
2920out_balanced: 2729out_balanced:
2921 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2922 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2923 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2924 return -1; 2733 return -1;
@@ -2934,8 +2743,8 @@ out_balanced:
2934static void idle_balance(int this_cpu, struct rq *this_rq) 2743static void idle_balance(int this_cpu, struct rq *this_rq)
2935{ 2744{
2936 struct sched_domain *sd; 2745 struct sched_domain *sd;
2937 int pulled_task = 0; 2746 int pulled_task = -1;
2938 unsigned long next_balance = jiffies + 60 * HZ; 2747 unsigned long next_balance = jiffies + HZ;
2939 2748
2940 for_each_domain(this_cpu, sd) { 2749 for_each_domain(this_cpu, sd) {
2941 unsigned long interval; 2750 unsigned long interval;
@@ -2954,12 +2763,13 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
2954 if (pulled_task) 2763 if (pulled_task)
2955 break; 2764 break;
2956 } 2765 }
2957 if (!pulled_task) 2766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2958 /* 2767 /*
2959 * We are going idle. next_balance may be set based on 2768 * We are going idle. next_balance may be set based on
2960 * a busy processor. So reset next_balance. 2769 * a busy processor. So reset next_balance.
2961 */ 2770 */
2962 this_rq->next_balance = next_balance; 2771 this_rq->next_balance = next_balance;
2772 }
2963} 2773}
2964 2774
2965/* 2775/*
@@ -3003,7 +2813,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3003 schedstat_inc(sd, alb_cnt); 2813 schedstat_inc(sd, alb_cnt);
3004 2814
3005 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
3006 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
3007 NULL)) 2817 NULL))
3008 schedstat_inc(sd, alb_pushed); 2818 schedstat_inc(sd, alb_pushed);
3009 else 2819 else
@@ -3012,32 +2822,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3012 spin_unlock(&target_rq->lock); 2822 spin_unlock(&target_rq->lock);
3013} 2823}
3014 2824
3015static void update_load(struct rq *this_rq)
3016{
3017 unsigned long this_load;
3018 unsigned int i, scale;
3019
3020 this_load = this_rq->raw_weighted_load;
3021
3022 /* Update our load: */
3023 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
3024 unsigned long old_load, new_load;
3025
3026 /* scale is effectively 1 << i now, and >> i divides by scale */
3027
3028 old_load = this_rq->cpu_load[i];
3029 new_load = this_load;
3030 /*
3031 * Round up the averaging division if load is increasing. This
3032 * prevents us from getting stuck on 9 if the load is 10, for
3033 * example.
3034 */
3035 if (new_load > old_load)
3036 new_load += scale-1;
3037 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3038 }
3039}
3040
3041#ifdef CONFIG_NO_HZ 2825#ifdef CONFIG_NO_HZ
3042static struct { 2826static struct {
3043 atomic_t load_balancer; 2827 atomic_t load_balancer;
@@ -3120,7 +2904,7 @@ static DEFINE_SPINLOCK(balancing);
3120 * 2904 *
3121 * Balancing parameters are set up in arch_init_sched_domains. 2905 * Balancing parameters are set up in arch_init_sched_domains.
3122 */ 2906 */
3123static inline void rebalance_domains(int cpu, enum idle_type idle) 2907static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3124{ 2908{
3125 int balance = 1; 2909 int balance = 1;
3126 struct rq *rq = cpu_rq(cpu); 2910 struct rq *rq = cpu_rq(cpu);
@@ -3134,13 +2918,16 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3134 continue; 2918 continue;
3135 2919
3136 interval = sd->balance_interval; 2920 interval = sd->balance_interval;
3137 if (idle != SCHED_IDLE) 2921 if (idle != CPU_IDLE)
3138 interval *= sd->busy_factor; 2922 interval *= sd->busy_factor;
3139 2923
3140 /* scale ms to jiffies */ 2924 /* scale ms to jiffies */
3141 interval = msecs_to_jiffies(interval); 2925 interval = msecs_to_jiffies(interval);
3142 if (unlikely(!interval)) 2926 if (unlikely(!interval))
3143 interval = 1; 2927 interval = 1;
2928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2930
3144 2931
3145 if (sd->flags & SD_SERIALIZE) { 2932 if (sd->flags & SD_SERIALIZE) {
3146 if (!spin_trylock(&balancing)) 2933 if (!spin_trylock(&balancing))
@@ -3154,7 +2941,7 @@ static inline void rebalance_domains(int cpu, enum idle_type idle)
3154 * longer idle, or one of our SMT siblings is 2941 * longer idle, or one of our SMT siblings is
3155 * not idle. 2942 * not idle.
3156 */ 2943 */
3157 idle = NOT_IDLE; 2944 idle = CPU_NOT_IDLE;
3158 } 2945 }
3159 sd->last_balance = jiffies; 2946 sd->last_balance = jiffies;
3160 } 2947 }
@@ -3182,11 +2969,12 @@ out:
3182 */ 2969 */
3183static void run_rebalance_domains(struct softirq_action *h) 2970static void run_rebalance_domains(struct softirq_action *h)
3184{ 2971{
3185 int local_cpu = smp_processor_id(); 2972 int this_cpu = smp_processor_id();
3186 struct rq *local_rq = cpu_rq(local_cpu); 2973 struct rq *this_rq = cpu_rq(this_cpu);
3187 enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; 2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
3188 2976
3189 rebalance_domains(local_cpu, idle); 2977 rebalance_domains(this_cpu, idle);
3190 2978
3191#ifdef CONFIG_NO_HZ 2979#ifdef CONFIG_NO_HZ
3192 /* 2980 /*
@@ -3194,13 +2982,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3194 * balancing on behalf of the other idle cpus whose ticks are 2982 * balancing on behalf of the other idle cpus whose ticks are
3195 * stopped. 2983 * stopped.
3196 */ 2984 */
3197 if (local_rq->idle_at_tick && 2985 if (this_rq->idle_at_tick &&
3198 atomic_read(&nohz.load_balancer) == local_cpu) { 2986 atomic_read(&nohz.load_balancer) == this_cpu) {
3199 cpumask_t cpus = nohz.cpu_mask; 2987 cpumask_t cpus = nohz.cpu_mask;
3200 struct rq *rq; 2988 struct rq *rq;
3201 int balance_cpu; 2989 int balance_cpu;
3202 2990
3203 cpu_clear(local_cpu, cpus); 2991 cpu_clear(this_cpu, cpus);
3204 for_each_cpu_mask(balance_cpu, cpus) { 2992 for_each_cpu_mask(balance_cpu, cpus) {
3205 /* 2993 /*
3206 * If this cpu gets work to do, stop the load balancing 2994 * If this cpu gets work to do, stop the load balancing
@@ -3213,8 +3001,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3213 rebalance_domains(balance_cpu, SCHED_IDLE); 3001 rebalance_domains(balance_cpu, SCHED_IDLE);
3214 3002
3215 rq = cpu_rq(balance_cpu); 3003 rq = cpu_rq(balance_cpu);
3216 if (time_after(local_rq->next_balance, rq->next_balance)) 3004 if (time_after(this_rq->next_balance, rq->next_balance))
3217 local_rq->next_balance = rq->next_balance; 3005 this_rq->next_balance = rq->next_balance;
3218 } 3006 }
3219 } 3007 }
3220#endif 3008#endif
@@ -3227,9 +3015,8 @@ static void run_rebalance_domains(struct softirq_action *h)
3227 * idle load balancing owner or decide to stop the periodic load balancing, 3015 * idle load balancing owner or decide to stop the periodic load balancing,
3228 * if the whole system is idle. 3016 * if the whole system is idle.
3229 */ 3017 */
3230static inline void trigger_load_balance(int cpu) 3018static inline void trigger_load_balance(struct rq *rq, int cpu)
3231{ 3019{
3232 struct rq *rq = cpu_rq(cpu);
3233#ifdef CONFIG_NO_HZ 3020#ifdef CONFIG_NO_HZ
3234 /* 3021 /*
3235 * If we were in the nohz mode recently and busy at the current 3022 * If we were in the nohz mode recently and busy at the current
@@ -3281,13 +3068,29 @@ static inline void trigger_load_balance(int cpu)
3281 if (time_after_eq(jiffies, rq->next_balance)) 3068 if (time_after_eq(jiffies, rq->next_balance))
3282 raise_softirq(SCHED_SOFTIRQ); 3069 raise_softirq(SCHED_SOFTIRQ);
3283} 3070}
3284#else 3071
3072#else /* CONFIG_SMP */
3073
3285/* 3074/*
3286 * on UP we do not need to balance between CPUs: 3075 * on UP we do not need to balance between CPUs:
3287 */ 3076 */
3288static inline void idle_balance(int cpu, struct rq *rq) 3077static inline void idle_balance(int cpu, struct rq *rq)
3289{ 3078{
3290} 3079}
3080
3081/* Avoid "used but not defined" warning on UP */
3082static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3088{
3089 *load_moved = 0;
3090
3091 return 0;
3092}
3093
3291#endif 3094#endif
3292 3095
3293DEFINE_PER_CPU(struct kernel_stat, kstat); 3096DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3295,54 +3098,28 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
3295EXPORT_PER_CPU_SYMBOL(kstat); 3098EXPORT_PER_CPU_SYMBOL(kstat);
3296 3099
3297/* 3100/*
3298 * This is called on clock ticks and on context switches. 3101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3299 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3102 * that have not yet been banked in case the task is currently running.
3300 */
3301static inline void
3302update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3303{
3304 p->sched_time += now - p->last_ran;
3305 p->last_ran = rq->most_recent_timestamp = now;
3306}
3307
3308/*
3309 * Return current->sched_time plus any more ns on the sched_clock
3310 * that have not yet been banked.
3311 */ 3103 */
3312unsigned long long current_sched_time(const struct task_struct *p) 3104unsigned long long task_sched_runtime(struct task_struct *p)
3313{ 3105{
3314 unsigned long long ns;
3315 unsigned long flags; 3106 unsigned long flags;
3107 u64 ns, delta_exec;
3108 struct rq *rq;
3316 3109
3317 local_irq_save(flags); 3110 rq = task_rq_lock(p, &flags);
3318 ns = p->sched_time + sched_clock() - p->last_ran; 3111 ns = p->se.sum_exec_runtime;
3319 local_irq_restore(flags); 3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3116 }
3117 task_rq_unlock(rq, &flags);
3320 3118
3321 return ns; 3119 return ns;
3322} 3120}
3323 3121
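task_sched_runtime() above returns the nanoseconds already banked in se.sum_exec_runtime plus, if the task is currently on the CPU, the slice accumulated since se.exec_start that has not been banked yet. The sketch below restates that arithmetic with illustrative names and types; it is not the kernel's function.

	#include <stdint.h>

	/* Hedged sketch: banked runtime plus the still-unbanked delta. */
	static uint64_t demo_task_runtime(uint64_t sum_exec_runtime,
					  uint64_t exec_start, uint64_t now,
					  int currently_running)
	{
		uint64_t ns = sum_exec_runtime;

		if (currently_running && now > exec_start)
			ns += now - exec_start;	/* current, unbanked slice */

		return ns;
	}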
3324/* 3122/*
3325 * We place interactive tasks back into the active array, if possible.
3326 *
3327 * To guarantee that this does not starve expired tasks we ignore the
3328 * interactivity of a task if the first expired task had to wait more
3329 * than a 'reasonable' amount of time. This deadline timeout is
3330 * load-dependent, as the frequency of array switched decreases with
3331 * increasing number of running tasks. We also ignore the interactivity
3332 * if a better static_prio task has expired:
3333 */
3334static inline int expired_starving(struct rq *rq)
3335{
3336 if (rq->curr->static_prio > rq->best_expired_prio)
3337 return 1;
3338 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3339 return 0;
3340 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3341 return 1;
3342 return 0;
3343}
3344
3345/*
3346 * Account user cpu time to a process. 3123 * Account user cpu time to a process.
3347 * @p: the process that the cpu time gets accounted to 3124 * @p: the process that the cpu time gets accounted to
3348 * @hardirq_offset: the offset to subtract from hardirq_count() 3125 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3415,81 +3192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3415 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3416} 3193}
3417 3194
3418static void task_running_tick(struct rq *rq, struct task_struct *p)
3419{
3420 if (p->array != rq->active) {
3421 /* Task has expired but was not scheduled yet */
3422 set_tsk_need_resched(p);
3423 return;
3424 }
3425 spin_lock(&rq->lock);
3426 /*
3427 * The task was running during this tick - update the
3428 * time slice counter. Note: we do not update a thread's
3429 * priority until it either goes to sleep or uses up its
3430 * timeslice. This makes it possible for interactive tasks
3431 * to use up their timeslices at their highest priority levels.
3432 */
3433 if (rt_task(p)) {
3434 /*
3435 * RR tasks need a special form of timeslice management.
3436 * FIFO tasks have no timeslices.
3437 */
3438 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3439 p->time_slice = task_timeslice(p);
3440 p->first_time_slice = 0;
3441 set_tsk_need_resched(p);
3442
3443 /* put it at the end of the queue: */
3444 requeue_task(p, rq->active);
3445 }
3446 goto out_unlock;
3447 }
3448 if (!--p->time_slice) {
3449 dequeue_task(p, rq->active);
3450 set_tsk_need_resched(p);
3451 p->prio = effective_prio(p);
3452 p->time_slice = task_timeslice(p);
3453 p->first_time_slice = 0;
3454
3455 if (!rq->expired_timestamp)
3456 rq->expired_timestamp = jiffies;
3457 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3458 enqueue_task(p, rq->expired);
3459 if (p->static_prio < rq->best_expired_prio)
3460 rq->best_expired_prio = p->static_prio;
3461 } else
3462 enqueue_task(p, rq->active);
3463 } else {
3464 /*
3465 * Prevent a too long timeslice allowing a task to monopolize
3466 * the CPU. We do this by splitting up the timeslice into
3467 * smaller pieces.
3468 *
3469 * Note: this does not mean the task's timeslices expire or
3470 * get lost in any way, they just might be preempted by
3471 * another task of equal priority. (one with higher
3472 * priority would have preempted this task already.) We
3473 * requeue this task to the end of the list on this priority
3474 * level, which is in essence a round-robin of tasks with
3475 * equal priority.
3476 *
3477 * This only applies to tasks in the interactive
3478 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3479 */
3480 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3481 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3482 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3483 (p->array == rq->active)) {
3484
3485 requeue_task(p, rq->active);
3486 set_tsk_need_resched(p);
3487 }
3488 }
3489out_unlock:
3490 spin_unlock(&rq->lock);
3491}
3492
3493/* 3195/*
3494 * This function gets called by the timer code, with HZ frequency. 3196 * This function gets called by the timer code, with HZ frequency.
3495 * We call it with interrupts disabled. 3197 * We call it with interrupts disabled.
@@ -3499,20 +3201,19 @@ out_unlock:
3499 */ 3201 */
3500void scheduler_tick(void) 3202void scheduler_tick(void)
3501{ 3203{
3502 unsigned long long now = sched_clock();
3503 struct task_struct *p = current;
3504 int cpu = smp_processor_id(); 3204 int cpu = smp_processor_id();
3505 int idle_at_tick = idle_cpu(cpu);
3506 struct rq *rq = cpu_rq(cpu); 3205 struct rq *rq = cpu_rq(cpu);
3206 struct task_struct *curr = rq->curr;
3507 3207
3508 update_cpu_clock(p, rq, now); 3208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3509 3213
3510 if (!idle_at_tick)
3511 task_running_tick(rq, p);
3512#ifdef CONFIG_SMP 3214#ifdef CONFIG_SMP
3513 update_load(rq); 3215 rq->idle_at_tick = idle_cpu(cpu);
3514 rq->idle_at_tick = idle_at_tick; 3216 trigger_load_balance(rq, cpu);
3515 trigger_load_balance(cpu);
3516#endif 3217#endif
3517} 3218}
3518 3219
@@ -3554,170 +3255,129 @@ EXPORT_SYMBOL(sub_preempt_count);
3554 3255
3555#endif 3256#endif
3556 3257
3557static inline int interactive_sleep(enum sleep_type sleep_type) 3258/*
3259 * Print scheduling while atomic bug:
3260 */
3261static noinline void __schedule_bug(struct task_struct *prev)
3558{ 3262{
3559 return (sleep_type == SLEEP_INTERACTIVE || 3263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3560 sleep_type == SLEEP_INTERRUPTED); 3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3561} 3269}
3562 3270
3563/* 3271/*
3564 * schedule() is the main scheduler function. 3272 * Various schedule()-time debugging checks and statistics:
3565 */ 3273 */
3566asmlinkage void __sched schedule(void) 3274static inline void schedule_debug(struct task_struct *prev)
3567{ 3275{
3568 struct task_struct *prev, *next;
3569 struct prio_array *array;
3570 struct list_head *queue;
3571 unsigned long long now;
3572 unsigned long run_time;
3573 int cpu, idx, new_prio;
3574 long *switch_count;
3575 struct rq *rq;
3576
3577 /* 3276 /*
3578 * Test if we are atomic. Since do_exit() needs to call into 3277 * Test if we are atomic. Since do_exit() needs to call into
3579 * schedule() atomically, we ignore that path for now. 3278 * schedule() atomically, we ignore that path for now.
3580 * Otherwise, whine if we are scheduling when we should not be. 3279 * Otherwise, whine if we are scheduling when we should not be.
3581 */ 3280 */
3582 if (unlikely(in_atomic() && !current->exit_state)) { 3281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3583 printk(KERN_ERR "BUG: scheduling while atomic: " 3282 __schedule_bug(prev);
3584 "%s/0x%08x/%d\n",
3585 current->comm, preempt_count(), current->pid);
3586 debug_show_held_locks(current);
3587 if (irqs_disabled())
3588 print_irqtrace_events(current);
3589 dump_stack();
3590 }
3591 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3592 3283
3593need_resched: 3284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3594 preempt_disable();
3595 prev = current;
3596 release_kernel_lock(prev);
3597need_resched_nonpreemptible:
3598 rq = this_rq();
3599 3285
3600 /* 3286 schedstat_inc(this_rq(), sched_cnt);
3601 * The idle thread is not allowed to schedule! 3287}
3602 * Remove this check after it has been exercised a bit.
3603 */
3604 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3605 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3606 dump_stack();
3607 }
3608 3288
3609 schedstat_inc(rq, sched_cnt); 3289/*
3610 now = sched_clock(); 3290 * Pick up the highest-prio task:
3611 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3291 */
3612 run_time = now - prev->timestamp; 3292static inline struct task_struct *
3613 if (unlikely((long long)(now - prev->timestamp) < 0)) 3293pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3614 run_time = 0; 3294{
3615 } else 3295 struct sched_class *class;
3616 run_time = NS_MAX_SLEEP_AVG; 3296 struct task_struct *p;
3617 3297
3618 /* 3298 /*
3619 * Tasks charged proportionately less run_time at high sleep_avg to 3299 * Optimization: we know that if all tasks are in
3620 * delay them losing their interactive status 3300 * the fair class we can call that function directly:
3621 */ 3301 */
3622 run_time /= (CURRENT_BONUS(prev) ? : 1); 3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3623 3303 p = fair_sched_class.pick_next_task(rq, now);
3624 spin_lock_irq(&rq->lock); 3304 if (likely(p))
3625 3305 return p;
3626 switch_count = &prev->nivcsw;
3627 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3628 switch_count = &prev->nvcsw;
3629 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3630 unlikely(signal_pending(prev))))
3631 prev->state = TASK_RUNNING;
3632 else {
3633 if (prev->state == TASK_UNINTERRUPTIBLE)
3634 rq->nr_uninterruptible++;
3635 deactivate_task(prev, rq);
3636 }
3637 }
3638
3639 cpu = smp_processor_id();
3640 if (unlikely(!rq->nr_running)) {
3641 idle_balance(cpu, rq);
3642 if (!rq->nr_running) {
3643 next = rq->idle;
3644 rq->expired_timestamp = 0;
3645 goto switch_tasks;
3646 }
3647 } 3306 }
3648 3307
3649 array = rq->active; 3308 class = sched_class_highest;
3650 if (unlikely(!array->nr_active)) { 3309 for ( ; ; ) {
3310 p = class->pick_next_task(rq, now);
3311 if (p)
3312 return p;
3651 /* 3313 /*
3652 * Switch the active and expired arrays. 3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3653 */ 3316 */
3654 schedstat_inc(rq, sched_switch); 3317 class = class->next;
3655 rq->active = rq->expired;
3656 rq->expired = array;
3657 array = rq->active;
3658 rq->expired_timestamp = 0;
3659 rq->best_expired_prio = MAX_PRIO;
3660 } 3318 }
3319}
3320
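pick_next_task() above takes a fast path straight into the fair class when every runnable task belongs to it, and otherwise walks the class list, relying on the idle class at the tail to always return a task. The sketch below shows only that shape; the types and the pick() callback are invented for illustration.

	#include <stddef.h>

	struct demo_task;

	struct demo_class {
		const struct demo_class *next;
		struct demo_task *(*pick)(void *rq);
	};

	/* Hedged sketch: fast path for the common class, then a walk that
	 * terminates because the last class never returns NULL. */
	static struct demo_task *
	demo_pick_next(void *rq, const struct demo_class *highest,
		       const struct demo_class *common, int all_in_common)
	{
		const struct demo_class *class;
		struct demo_task *p;

		if (all_in_common) {	/* e.g. all runnable tasks are fair */
			p = common->pick(rq);
			if (p)
				return p;
		}

		for (class = highest; ; class = class->next) {
			p = class->pick(rq);
			if (p)
				return p;	/* idle class guarantees this */
		}
	}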
3321/*
3322 * schedule() is the main scheduler function.
3323 */
3324asmlinkage void __sched schedule(void)
3325{
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3661 3331
3662 idx = sched_find_first_bit(array->bitmap); 3332need_resched:
3663 queue = array->queue + idx; 3333 preempt_disable();
3664 next = list_entry(queue->next, struct task_struct, run_list); 3334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3665 3339
3666 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3340 release_kernel_lock(prev);
3667 unsigned long long delta = now - next->timestamp; 3341need_resched_nonpreemptible:
3668 if (unlikely((long long)(now - next->timestamp) < 0))
3669 delta = 0;
3670 3342
3671 if (next->sleep_type == SLEEP_INTERACTIVE) 3343 schedule_debug(prev);
3672 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3673 3344
3674 array = next->array; 3345 spin_lock_irq(&rq->lock);
3675 new_prio = recalc_task_prio(next, next->timestamp + delta); 3346 clear_tsk_need_resched(prev);
3676 3347
3677 if (unlikely(next->prio != new_prio)) { 3348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3678 dequeue_task(next, array); 3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3679 next->prio = new_prio; 3350 unlikely(signal_pending(prev)))) {
3680 enqueue_task(next, array); 3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3681 } 3354 }
3355 switch_count = &prev->nvcsw;
3682 } 3356 }
3683 next->sleep_type = SLEEP_NORMAL;
3684switch_tasks:
3685 if (next == rq->idle)
3686 schedstat_inc(rq, sched_goidle);
3687 prefetch(next);
3688 prefetch_stack(next);
3689 clear_tsk_need_resched(prev);
3690 rcu_qsctr_inc(task_cpu(prev));
3691 3357
3692 update_cpu_clock(prev, rq, now); 3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3693 3360
3694 prev->sleep_avg -= run_time; 3361 now = __rq_clock(rq);
3695 if ((long)prev->sleep_avg <= 0) 3362 prev->sched_class->put_prev_task(rq, prev, now);
3696 prev->sleep_avg = 0; 3363 next = pick_next_task(rq, prev, now);
3697 prev->timestamp = prev->last_ran = now;
3698 3364
3699 sched_info_switch(prev, next); 3365 sched_info_switch(prev, next);
3366
3700 if (likely(prev != next)) { 3367 if (likely(prev != next)) {
3701 next->timestamp = next->last_ran = now;
3702 rq->nr_switches++; 3368 rq->nr_switches++;
3703 rq->curr = next; 3369 rq->curr = next;
3704 ++*switch_count; 3370 ++*switch_count;
3705 3371
3706 prepare_task_switch(rq, next); 3372 context_switch(rq, prev, next); /* unlocks the rq */
3707 prev = context_switch(rq, prev, next);
3708 barrier();
3709 /*
3710 * this_rq must be evaluated again because prev may have moved
3711 * CPUs since it called schedule(), thus the 'rq' on its stack
3712 * frame will be invalid.
3713 */
3714 finish_task_switch(this_rq(), prev);
3715 } else 3373 } else
3716 spin_unlock_irq(&rq->lock); 3374 spin_unlock_irq(&rq->lock);
3717 3375
3718 prev = current; 3376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3719 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
3720 goto need_resched_nonpreemptible; 3379 goto need_resched_nonpreemptible;
3380 }
3721 preempt_enable_no_resched(); 3381 preempt_enable_no_resched();
3722 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3723 goto need_resched; 3383 goto need_resched;
@@ -4045,74 +3705,85 @@ out:
4045} 3705}
4046EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3706EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4047 3707
4048 3708static inline void
4049#define SLEEP_ON_VAR \ 3709sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4050 unsigned long flags; \ 3710{
4051 wait_queue_t wait; \ 3711 spin_lock_irqsave(&q->lock, *flags);
4052 init_waitqueue_entry(&wait, current); 3712 __add_wait_queue(q, wait);
4053
4054#define SLEEP_ON_HEAD \
4055 spin_lock_irqsave(&q->lock,flags); \
4056 __add_wait_queue(q, &wait); \
4057 spin_unlock(&q->lock); 3713 spin_unlock(&q->lock);
3714}
4058 3715
4059#define SLEEP_ON_TAIL \ 3716static inline void
4060 spin_lock_irq(&q->lock); \ 3717sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
4061 __remove_wait_queue(q, &wait); \ 3718{
4062 spin_unlock_irqrestore(&q->lock, flags); 3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3722}
4063 3723
4064void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3724void __sched interruptible_sleep_on(wait_queue_head_t *q)
4065{ 3725{
4066 SLEEP_ON_VAR 3726 unsigned long flags;
3727 wait_queue_t wait;
3728
3729 init_waitqueue_entry(&wait, current);
4067 3730
4068 current->state = TASK_INTERRUPTIBLE; 3731 current->state = TASK_INTERRUPTIBLE;
4069 3732
4070 SLEEP_ON_HEAD 3733 sleep_on_head(q, &wait, &flags);
4071 schedule(); 3734 schedule();
4072 SLEEP_ON_TAIL 3735 sleep_on_tail(q, &wait, &flags);
4073} 3736}
4074EXPORT_SYMBOL(interruptible_sleep_on); 3737EXPORT_SYMBOL(interruptible_sleep_on);
4075 3738
4076long fastcall __sched 3739long __sched
4077interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3740interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4078{ 3741{
4079 SLEEP_ON_VAR 3742 unsigned long flags;
3743 wait_queue_t wait;
3744
3745 init_waitqueue_entry(&wait, current);
4080 3746
4081 current->state = TASK_INTERRUPTIBLE; 3747 current->state = TASK_INTERRUPTIBLE;
4082 3748
4083 SLEEP_ON_HEAD 3749 sleep_on_head(q, &wait, &flags);
4084 timeout = schedule_timeout(timeout); 3750 timeout = schedule_timeout(timeout);
4085 SLEEP_ON_TAIL 3751 sleep_on_tail(q, &wait, &flags);
4086 3752
4087 return timeout; 3753 return timeout;
4088} 3754}
4089EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3755EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4090 3756
4091void fastcall __sched sleep_on(wait_queue_head_t *q) 3757void __sched sleep_on(wait_queue_head_t *q)
4092{ 3758{
4093 SLEEP_ON_VAR 3759 unsigned long flags;
3760 wait_queue_t wait;
3761
3762 init_waitqueue_entry(&wait, current);
4094 3763
4095 current->state = TASK_UNINTERRUPTIBLE; 3764 current->state = TASK_UNINTERRUPTIBLE;
4096 3765
4097 SLEEP_ON_HEAD 3766 sleep_on_head(q, &wait, &flags);
4098 schedule(); 3767 schedule();
4099 SLEEP_ON_TAIL 3768 sleep_on_tail(q, &wait, &flags);
4100} 3769}
4101EXPORT_SYMBOL(sleep_on); 3770EXPORT_SYMBOL(sleep_on);
4102 3771
4103long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3772long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4104{ 3773{
4105 SLEEP_ON_VAR 3774 unsigned long flags;
3775 wait_queue_t wait;
3776
3777 init_waitqueue_entry(&wait, current);
4106 3778
4107 current->state = TASK_UNINTERRUPTIBLE; 3779 current->state = TASK_UNINTERRUPTIBLE;
4108 3780
4109 SLEEP_ON_HEAD 3781 sleep_on_head(q, &wait, &flags);
4110 timeout = schedule_timeout(timeout); 3782 timeout = schedule_timeout(timeout);
4111 SLEEP_ON_TAIL 3783 sleep_on_tail(q, &wait, &flags);
4112 3784
4113 return timeout; 3785 return timeout;
4114} 3786}
4115
4116EXPORT_SYMBOL(sleep_on_timeout); 3787EXPORT_SYMBOL(sleep_on_timeout);
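[Editor's note] The hunk above turns the SLEEP_ON_VAR/SLEEP_ON_HEAD/SLEEP_ON_TAIL macros into ordinary inline helpers, so all four sleep_on() variants now share the same add-to-queue and remove-from-queue code. A hedged sketch of how a caller typically uses this interface (the demo_* identifiers are invented):

        static DECLARE_WAIT_QUEUE_HEAD(demo_wq);

        static void demo_reader(void)
        {
                /* Block until somebody wakes the queue (note: the sleep_on
                 * family is racy without an external condition check). */
                interruptible_sleep_on(&demo_wq);
        }

        static void demo_writer(void)
        {
                wake_up_interruptible(&demo_wq);        /* wake the reader */
        }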
4117 3788
4118#ifdef CONFIG_RT_MUTEXES 3789#ifdef CONFIG_RT_MUTEXES
@@ -4129,29 +3800,30 @@ EXPORT_SYMBOL(sleep_on_timeout);
4129 */ 3800 */
4130void rt_mutex_setprio(struct task_struct *p, int prio) 3801void rt_mutex_setprio(struct task_struct *p, int prio)
4131{ 3802{
4132 struct prio_array *array;
4133 unsigned long flags; 3803 unsigned long flags;
3804 int oldprio, on_rq;
4134 struct rq *rq; 3805 struct rq *rq;
4135 int oldprio; 3806 u64 now;
4136 3807
4137 BUG_ON(prio < 0 || prio > MAX_PRIO); 3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
4138 3809
4139 rq = task_rq_lock(p, &flags); 3810 rq = task_rq_lock(p, &flags);
3811 now = rq_clock(rq);
4140 3812
4141 oldprio = p->prio; 3813 oldprio = p->prio;
4142 array = p->array; 3814 on_rq = p->se.on_rq;
4143 if (array) 3815 if (on_rq)
4144 dequeue_task(p, array); 3816 dequeue_task(rq, p, 0, now);
3817
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3822
4145 p->prio = prio; 3823 p->prio = prio;
4146 3824
4147 if (array) { 3825 if (on_rq) {
4148 /* 3826 enqueue_task(rq, p, 0, now);
4149 * If changing to an RT priority then queue it
4150 * in the active array!
4151 */
4152 if (rt_task(p))
4153 array = rq->active;
4154 enqueue_task(p, array);
4155 /* 3827 /*
4156 * Reschedule if we are currently running on this runqueue and 3828 * Reschedule if we are currently running on this runqueue and
4157 * our priority decreased, or if we are not currently running on 3829 * our priority decreased, or if we are not currently running on
@@ -4160,8 +3832,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4160 if (task_running(rq, p)) { 3832 if (task_running(rq, p)) {
4161 if (p->prio > oldprio) 3833 if (p->prio > oldprio)
4162 resched_task(rq->curr); 3834 resched_task(rq->curr);
4163 } else if (TASK_PREEMPTS_CURR(p, rq)) 3835 } else {
4164 resched_task(rq->curr); 3836 check_preempt_curr(rq, p);
3837 }
4165 } 3838 }
4166 task_rq_unlock(rq, &flags); 3839 task_rq_unlock(rq, &flags);
4167} 3840}
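[Editor's note] rt_mutex_setprio() is the kernel half of priority inheritance; with CFS it now also switches p->sched_class when a boost crosses the RT boundary, using the dequeue / modify / enqueue / check_preempt_curr() sequence shown above. From userspace this path is typically exercised through a priority-inheriting mutex. A minimal sketch (the helper name is invented; exact feature-test macros depend on the libc):

        #include <pthread.h>

        /* Create a mutex that boosts a low-priority holder to the
         * priority of its highest-priority waiter. */
        int make_pi_mutex(pthread_mutex_t *m)
        {
                pthread_mutexattr_t attr;
                int ret = pthread_mutexattr_init(&attr);

                if (ret)
                        return ret;
                ret = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
                if (!ret)
                        ret = pthread_mutex_init(m, &attr);
                pthread_mutexattr_destroy(&attr);
                return ret;
        }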
@@ -4170,10 +3843,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4170 3843
4171void set_user_nice(struct task_struct *p, long nice) 3844void set_user_nice(struct task_struct *p, long nice)
4172{ 3845{
4173 struct prio_array *array; 3846 int old_prio, delta, on_rq;
4174 int old_prio, delta;
4175 unsigned long flags; 3847 unsigned long flags;
4176 struct rq *rq; 3848 struct rq *rq;
3849 u64 now;
4177 3850
4178 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4179 return; 3852 return;
@@ -4182,20 +3855,21 @@ void set_user_nice(struct task_struct *p, long nice)
4182 * the task might be in the middle of scheduling on another CPU. 3855 * the task might be in the middle of scheduling on another CPU.
4183 */ 3856 */
4184 rq = task_rq_lock(p, &flags); 3857 rq = task_rq_lock(p, &flags);
3858 now = rq_clock(rq);
4185 /* 3859 /*
4186 * The RT priorities are set via sched_setscheduler(), but we still 3860 * The RT priorities are set via sched_setscheduler(), but we still
4187 * allow the 'normal' nice value to be set - but as expected 3861 * allow the 'normal' nice value to be set - but as expected
4188 * it wont have any effect on scheduling until the task is 3862 * it wont have any effect on scheduling until the task is
4189 * not SCHED_NORMAL/SCHED_BATCH: 3863 * SCHED_FIFO/SCHED_RR:
4190 */ 3864 */
4191 if (has_rt_policy(p)) { 3865 if (task_has_rt_policy(p)) {
4192 p->static_prio = NICE_TO_PRIO(nice); 3866 p->static_prio = NICE_TO_PRIO(nice);
4193 goto out_unlock; 3867 goto out_unlock;
4194 } 3868 }
4195 array = p->array; 3869 on_rq = p->se.on_rq;
4196 if (array) { 3870 if (on_rq) {
4197 dequeue_task(p, array); 3871 dequeue_task(rq, p, 0, now);
4198 dec_raw_weighted_load(rq, p); 3872 dec_load(rq, p, now);
4199 } 3873 }
4200 3874
4201 p->static_prio = NICE_TO_PRIO(nice); 3875 p->static_prio = NICE_TO_PRIO(nice);
@@ -4204,9 +3878,9 @@ void set_user_nice(struct task_struct *p, long nice)
4204 p->prio = effective_prio(p); 3878 p->prio = effective_prio(p);
4205 delta = p->prio - old_prio; 3879 delta = p->prio - old_prio;
4206 3880
4207 if (array) { 3881 if (on_rq) {
4208 enqueue_task(p, array); 3882 enqueue_task(rq, p, 0, now);
4209 inc_raw_weighted_load(rq, p); 3883 inc_load(rq, p, now);
4210 /* 3884 /*
4211 * If the task increased its priority or is running and 3885 * If the task increased its priority or is running and
4212 * lowered its priority, then reschedule its CPU: 3886 * lowered its priority, then reschedule its CPU:
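[Editor's note] set_user_nice() now uses the same dequeue / re-weight / enqueue scheme, with dec_load()/inc_load() replacing the old raw-weighted-load accounting. The nice value still maps onto static_prio via NICE_TO_PRIO(), which in this era of the kernel places nice -20..19 at static_prio 100..139 (nice 0 becomes 120). A small runnable userspace snippet that ends up in this path (illustrative only):

        #include <sys/resource.h>
        #include <stdio.h>

        int main(void)
        {
                /* Lower our own priority to nice 5; needs no privilege. */
                if (setpriority(PRIO_PROCESS, 0, 5))
                        perror("setpriority");
                printf("new nice: %d\n", getpriority(PRIO_PROCESS, 0));
                return 0;
        }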
@@ -4326,20 +4000,28 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4326} 4000}
4327 4001
4328/* Actually do priority change: must hold rq lock. */ 4002/* Actually do priority change: must hold rq lock. */
4329static void __setscheduler(struct task_struct *p, int policy, int prio) 4003static void
4004__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4330{ 4005{
4331 BUG_ON(p->array); 4006 BUG_ON(p->se.on_rq);
4332 4007
4333 p->policy = policy; 4008 p->policy = policy;
4009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4019 }
4020
4334 p->rt_priority = prio; 4021 p->rt_priority = prio;
4335 p->normal_prio = normal_prio(p); 4022 p->normal_prio = normal_prio(p);
4336 /* we are holding p->pi_lock already */ 4023 /* we are holding p->pi_lock already */
4337 p->prio = rt_mutex_getprio(p); 4024 p->prio = rt_mutex_getprio(p);
4338 /*
4339 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4340 */
4341 if (policy == SCHED_BATCH)
4342 p->sleep_avg = 0;
4343 set_load_weight(p); 4025 set_load_weight(p);
4344} 4026}
4345 4027
@@ -4354,8 +4036,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4354int sched_setscheduler(struct task_struct *p, int policy, 4036int sched_setscheduler(struct task_struct *p, int policy,
4355 struct sched_param *param) 4037 struct sched_param *param)
4356{ 4038{
4357 int retval, oldprio, oldpolicy = -1; 4039 int retval, oldprio, oldpolicy = -1, on_rq;
4358 struct prio_array *array;
4359 unsigned long flags; 4040 unsigned long flags;
4360 struct rq *rq; 4041 struct rq *rq;
4361 4042
@@ -4366,27 +4047,27 @@ recheck:
4366 if (policy < 0) 4047 if (policy < 0)
4367 policy = oldpolicy = p->policy; 4048 policy = oldpolicy = p->policy;
4368 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4369 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
4370 return -EINVAL; 4052 return -EINVAL;
4371 /* 4053 /*
4372 * Valid priorities for SCHED_FIFO and SCHED_RR are 4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
4373 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4374 * SCHED_BATCH is 0. 4056 * SCHED_BATCH and SCHED_IDLE is 0.
4375 */ 4057 */
4376 if (param->sched_priority < 0 || 4058 if (param->sched_priority < 0 ||
4377 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4378 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4379 return -EINVAL; 4061 return -EINVAL;
4380 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4062 if (rt_policy(policy) != (param->sched_priority != 0))
4381 return -EINVAL; 4063 return -EINVAL;
4382 4064
4383 /* 4065 /*
4384 * Allow unprivileged RT tasks to decrease priority: 4066 * Allow unprivileged RT tasks to decrease priority:
4385 */ 4067 */
4386 if (!capable(CAP_SYS_NICE)) { 4068 if (!capable(CAP_SYS_NICE)) {
4387 if (is_rt_policy(policy)) { 4069 if (rt_policy(policy)) {
4388 unsigned long rlim_rtprio; 4070 unsigned long rlim_rtprio;
4389 unsigned long flags;
4390 4071
4391 if (!lock_task_sighand(p, &flags)) 4072 if (!lock_task_sighand(p, &flags))
4392 return -ESRCH; 4073 return -ESRCH;
@@ -4402,6 +4083,12 @@ recheck:
4402 param->sched_priority > rlim_rtprio) 4083 param->sched_priority > rlim_rtprio)
4403 return -EPERM; 4084 return -EPERM;
4404 } 4085 }
4086 /*
4087 * Like positive nice levels, dont allow tasks to
4088 * move out of SCHED_IDLE either:
4089 */
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
4405 4092
4406 /* can't change other user's priorities */ 4093 /* can't change other user's priorities */
4407 if ((current->euid != p->euid) && 4094 if ((current->euid != p->euid) &&
@@ -4429,13 +4116,13 @@ recheck:
4429 spin_unlock_irqrestore(&p->pi_lock, flags); 4116 spin_unlock_irqrestore(&p->pi_lock, flags);
4430 goto recheck; 4117 goto recheck;
4431 } 4118 }
4432 array = p->array; 4119 on_rq = p->se.on_rq;
4433 if (array) 4120 if (on_rq)
4434 deactivate_task(p, rq); 4121 deactivate_task(rq, p, 0);
4435 oldprio = p->prio; 4122 oldprio = p->prio;
4436 __setscheduler(p, policy, param->sched_priority); 4123 __setscheduler(rq, p, policy, param->sched_priority);
4437 if (array) { 4124 if (on_rq) {
4438 __activate_task(p, rq); 4125 activate_task(rq, p, 0);
4439 /* 4126 /*
4440 * Reschedule if we are currently running on this runqueue and 4127 * Reschedule if we are currently running on this runqueue and
4441 * our priority decreased, or if we are not currently running on 4128 * our priority decreased, or if we are not currently running on
@@ -4444,8 +4131,9 @@ recheck:
4444 if (task_running(rq, p)) { 4131 if (task_running(rq, p)) {
4445 if (p->prio > oldprio) 4132 if (p->prio > oldprio)
4446 resched_task(rq->curr); 4133 resched_task(rq->curr);
4447 } else if (TASK_PREEMPTS_CURR(p, rq)) 4134 } else {
4448 resched_task(rq->curr); 4135 check_preempt_curr(rq, p);
4136 }
4449 } 4137 }
4450 __task_rq_unlock(rq); 4138 __task_rq_unlock(rq);
4451 spin_unlock_irqrestore(&p->pi_lock, flags); 4139 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4717,41 +4405,18 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4717/** 4405/**
4718 * sys_sched_yield - yield the current processor to other threads. 4406 * sys_sched_yield - yield the current processor to other threads.
4719 * 4407 *
4720 * This function yields the current CPU by moving the calling thread 4408 * This function yields the current CPU to other tasks. If there are no
4721 * to the expired array. If there are no other threads running on this 4409 * other threads running on this CPU then this function will return.
4722 * CPU then this function will return.
4723 */ 4410 */
4724asmlinkage long sys_sched_yield(void) 4411asmlinkage long sys_sched_yield(void)
4725{ 4412{
4726 struct rq *rq = this_rq_lock(); 4413 struct rq *rq = this_rq_lock();
4727 struct prio_array *array = current->array, *target = rq->expired;
4728 4414
4729 schedstat_inc(rq, yld_cnt); 4415 schedstat_inc(rq, yld_cnt);
4730 /* 4416 if (unlikely(rq->nr_running == 1))
4731 * We implement yielding by moving the task into the expired
4732 * queue.
4733 *
4734 * (special rule: RT tasks will just roundrobin in the active
4735 * array.)
4736 */
4737 if (rt_task(current))
4738 target = rq->active;
4739
4740 if (array->nr_active == 1) {
4741 schedstat_inc(rq, yld_act_empty); 4417 schedstat_inc(rq, yld_act_empty);
4742 if (!rq->expired->nr_active) 4418 else
4743 schedstat_inc(rq, yld_both_empty); 4419 current->sched_class->yield_task(rq, current);
4744 } else if (!rq->expired->nr_active)
4745 schedstat_inc(rq, yld_exp_empty);
4746
4747 if (array != target) {
4748 dequeue_task(current, array);
4749 enqueue_task(current, target);
4750 } else
4751 /*
4752 * requeue_task is cheaper so perform that if possible.
4753 */
4754 requeue_task(current, array);
4755 4420
4756 /* 4421 /*
4757 * Since we are going to call schedule() anyway, there's 4422 * Since we are going to call schedule() anyway, there's
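[Editor's note] sys_sched_yield() no longer shuffles the task between priority arrays; it simply delegates to the current task's class via ->yield_task(). The userspace side is unchanged; a minimal example:

        #include <sched.h>

        int main(void)
        {
                sched_yield();  /* offer the CPU to other runnable tasks */
                return 0;
        }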
@@ -4902,6 +4567,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
4902 break; 4567 break;
4903 case SCHED_NORMAL: 4568 case SCHED_NORMAL:
4904 case SCHED_BATCH: 4569 case SCHED_BATCH:
4570 case SCHED_IDLE:
4905 ret = 0; 4571 ret = 0;
4906 break; 4572 break;
4907 } 4573 }
@@ -4926,6 +4592,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4926 break; 4592 break;
4927 case SCHED_NORMAL: 4593 case SCHED_NORMAL:
4928 case SCHED_BATCH: 4594 case SCHED_BATCH:
4595 case SCHED_IDLE:
4929 ret = 0; 4596 ret = 0;
4930 } 4597 }
4931 return ret; 4598 return ret;
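[Editor's note] Both priority-range syscalls learn about SCHED_IDLE here and report 0 for it, as for SCHED_NORMAL and SCHED_BATCH. A small runnable probe (values are whatever the running kernel reports; SCHED_OTHER is the userspace name for SCHED_NORMAL):

        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                printf("SCHED_FIFO:  %d..%d\n",
                       sched_get_priority_min(SCHED_FIFO),
                       sched_get_priority_max(SCHED_FIFO));
                printf("SCHED_OTHER: %d..%d\n",
                       sched_get_priority_min(SCHED_OTHER),
                       sched_get_priority_max(SCHED_OTHER));
                return 0;
        }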
@@ -4960,7 +4627,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4960 goto out_unlock; 4627 goto out_unlock;
4961 4628
4962 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4963 0 : task_timeslice(p), &t); 4630 0 : static_prio_timeslice(p->static_prio), &t);
4964 read_unlock(&tasklist_lock); 4631 read_unlock(&tasklist_lock);
4965 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4966out_nounlock: 4633out_nounlock:
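[Editor's note] sys_sched_rr_get_interval() now reports a slice derived from the task's static priority (static_prio_timeslice()) instead of the old dynamic task_timeslice(); SCHED_FIFO tasks still report 0. Querying it from userspace (illustrative only):

        #include <sched.h>
        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                struct timespec ts;

                if (sched_rr_get_interval(0, &ts) == 0)
                        printf("slice: %ld.%09ld s\n",
                               (long)ts.tv_sec, ts.tv_nsec);
                return 0;
        }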
@@ -5035,6 +4702,9 @@ void show_state_filter(unsigned long state_filter)
5035 4702
5036 touch_all_softlockup_watchdogs(); 4703 touch_all_softlockup_watchdogs();
5037 4704
4705#ifdef CONFIG_SCHED_DEBUG
4706 sysrq_sched_debug_show();
4707#endif
5038 read_unlock(&tasklist_lock); 4708 read_unlock(&tasklist_lock);
5039 /* 4709 /*
5040 * Only show locks if all tasks are dumped: 4710 * Only show locks if all tasks are dumped:
@@ -5043,6 +4713,11 @@ void show_state_filter(unsigned long state_filter)
5043 debug_show_all_locks(); 4713 debug_show_all_locks();
5044} 4714}
5045 4715
4716void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717{
4718 idle->sched_class = &idle_sched_class;
4719}
4720
5046/** 4721/**
5047 * init_idle - set up an idle thread for a given CPU 4722 * init_idle - set up an idle thread for a given CPU
5048 * @idle: task in question 4723 * @idle: task in question
@@ -5056,13 +4731,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5056 struct rq *rq = cpu_rq(cpu); 4731 struct rq *rq = cpu_rq(cpu);
5057 unsigned long flags; 4732 unsigned long flags;
5058 4733
5059 idle->timestamp = sched_clock(); 4734 __sched_fork(idle);
5060 idle->sleep_avg = 0; 4735 idle->se.exec_start = sched_clock();
5061 idle->array = NULL; 4736
5062 idle->prio = idle->normal_prio = MAX_PRIO; 4737 idle->prio = idle->normal_prio = MAX_PRIO;
5063 idle->state = TASK_RUNNING;
5064 idle->cpus_allowed = cpumask_of_cpu(cpu); 4738 idle->cpus_allowed = cpumask_of_cpu(cpu);
5065 set_task_cpu(idle, cpu); 4739 __set_task_cpu(idle, cpu);
5066 4740
5067 spin_lock_irqsave(&rq->lock, flags); 4741 spin_lock_irqsave(&rq->lock, flags);
5068 rq->curr = rq->idle = idle; 4742 rq->curr = rq->idle = idle;
@@ -5077,6 +4751,10 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5077#else 4751#else
5078 task_thread_info(idle)->preempt_count = 0; 4752 task_thread_info(idle)->preempt_count = 0;
5079#endif 4753#endif
4754 /*
4755 * The idle tasks have their own, simple scheduling class:
4756 */
4757 idle->sched_class = &idle_sched_class;
5080} 4758}
5081 4759
5082/* 4760/*
@@ -5088,6 +4766,28 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5088 */ 4766 */
5089cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4767cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5090 4768
4769/*
4770 * Increase the granularity value when there are more CPUs,
4771 * because with more CPUs the 'effective latency' as visible
4772 * to users decreases. But the relationship is not linear,
4773 * so pick a second-best guess by going with the log2 of the
4774 * number of CPUs.
4775 *
4776 * This idea comes from the SD scheduler of Con Kolivas:
4777 */
4778static inline void sched_init_granularity(void)
4779{
4780 unsigned int factor = 1 + ilog2(num_online_cpus());
4781 const unsigned long gran_limit = 10000000;
4782
4783 sysctl_sched_granularity *= factor;
4784 if (sysctl_sched_granularity > gran_limit)
4785 sysctl_sched_granularity = gran_limit;
4786
4787 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4788 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4789}
4790
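[Editor's note] A worked example of the scaling done by sched_init_granularity() above, with an assumed 2 ms base granularity (the real default lives in the CFS sysctls, not in this hunk): on an 8-CPU box factor = 1 + ilog2(8) = 4, so the granularity becomes 8 ms, below the 10 ms clamp; the runtime limit is then 32 ms and the wakeup granularity 4 ms. The same arithmetic as a standalone snippet:

        #include <stdio.h>

        int main(void)
        {
                unsigned long gran = 2000000UL; /* assumed 2 ms base, in ns */
                unsigned int cpus = 8, factor = 1;

                while (cpus >>= 1)              /* 1 + ilog2(8) == 4 */
                        factor++;

                gran *= factor;                 /* 8 ms */
                if (gran > 10000000UL)          /* clamped at 10 ms */
                        gran = 10000000UL;

                printf("gran %lu ns, runtime limit %lu ns, wakeup %lu ns\n",
                       gran, gran * 4, gran / 2);
                return 0;
        }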
5091#ifdef CONFIG_SMP 4791#ifdef CONFIG_SMP
5092/* 4792/*
5093 * This is how migration works: 4793 * This is how migration works:
@@ -5161,7 +4861,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
5161static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4861static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5162{ 4862{
5163 struct rq *rq_dest, *rq_src; 4863 struct rq *rq_dest, *rq_src;
5164 int ret = 0; 4864 int ret = 0, on_rq;
5165 4865
5166 if (unlikely(cpu_is_offline(dest_cpu))) 4866 if (unlikely(cpu_is_offline(dest_cpu)))
5167 return ret; 4867 return ret;
@@ -5177,20 +4877,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5177 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4877 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5178 goto out; 4878 goto out;
5179 4879
4880 on_rq = p->se.on_rq;
4881 if (on_rq)
4882 deactivate_task(rq_src, p, 0);
5180 set_task_cpu(p, dest_cpu); 4883 set_task_cpu(p, dest_cpu);
5181 if (p->array) { 4884 if (on_rq) {
5182 /* 4885 activate_task(rq_dest, p, 0);
5183 * Sync timestamp with rq_dest's before activating. 4886 check_preempt_curr(rq_dest, p);
5184 * The same thing could be achieved by doing this step
5185 * afterwards, and pretending it was a local activate.
5186 * This way is cleaner and logically correct.
5187 */
5188 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5189 + rq_dest->most_recent_timestamp;
5190 deactivate_task(p, rq_src);
5191 __activate_task(p, rq_dest);
5192 if (TASK_PREEMPTS_CURR(p, rq_dest))
5193 resched_task(rq_dest->curr);
5194 } 4887 }
5195 ret = 1; 4888 ret = 1;
5196out: 4889out:
@@ -5342,7 +5035,8 @@ static void migrate_live_tasks(int src_cpu)
5342 write_unlock_irq(&tasklist_lock); 5035 write_unlock_irq(&tasklist_lock);
5343} 5036}
5344 5037
5345/* Schedules idle task to be the next runnable task on current CPU. 5038/*
5039 * Schedules idle task to be the next runnable task on current CPU.
5346 * It does so by boosting its priority to highest possible and adding it to 5040 * It does so by boosting its priority to highest possible and adding it to
5347 * the _front_ of the runqueue. Used by CPU offline code. 5041 * the _front_ of the runqueue. Used by CPU offline code.
5348 */ 5042 */
@@ -5362,10 +5056,10 @@ void sched_idle_next(void)
5362 */ 5056 */
5363 spin_lock_irqsave(&rq->lock, flags); 5057 spin_lock_irqsave(&rq->lock, flags);
5364 5058
5365 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5059 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5366 5060
5367 /* Add idle task to the _front_ of its priority queue: */ 5061 /* Add idle task to the _front_ of its priority queue: */
5368 __activate_idle_task(p, rq); 5062 activate_idle_task(p, rq);
5369 5063
5370 spin_unlock_irqrestore(&rq->lock, flags); 5064 spin_unlock_irqrestore(&rq->lock, flags);
5371} 5065}
@@ -5415,16 +5109,15 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5415static void migrate_dead_tasks(unsigned int dead_cpu) 5109static void migrate_dead_tasks(unsigned int dead_cpu)
5416{ 5110{
5417 struct rq *rq = cpu_rq(dead_cpu); 5111 struct rq *rq = cpu_rq(dead_cpu);
5418 unsigned int arr, i; 5112 struct task_struct *next;
5419 5113
5420 for (arr = 0; arr < 2; arr++) { 5114 for ( ; ; ) {
5421 for (i = 0; i < MAX_PRIO; i++) { 5115 if (!rq->nr_running)
5422 struct list_head *list = &rq->arrays[arr].queue[i]; 5116 break;
5423 5117 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5424 while (!list_empty(list)) 5118 if (!next)
5425 migrate_dead(dead_cpu, list_entry(list->next, 5119 break;
5426 struct task_struct, run_list)); 5120 migrate_dead(dead_cpu, next);
5427 }
5428 } 5121 }
5429} 5122}
5430#endif /* CONFIG_HOTPLUG_CPU */ 5123#endif /* CONFIG_HOTPLUG_CPU */
@@ -5448,14 +5141,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5448 5141
5449 case CPU_UP_PREPARE: 5142 case CPU_UP_PREPARE:
5450 case CPU_UP_PREPARE_FROZEN: 5143 case CPU_UP_PREPARE_FROZEN:
5451 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5144 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5452 if (IS_ERR(p)) 5145 if (IS_ERR(p))
5453 return NOTIFY_BAD; 5146 return NOTIFY_BAD;
5454 p->flags |= PF_NOFREEZE; 5147 p->flags |= PF_NOFREEZE;
5455 kthread_bind(p, cpu); 5148 kthread_bind(p, cpu);
5456 /* Must be high prio: stop_machine expects to yield to it. */ 5149 /* Must be high prio: stop_machine expects to yield to it. */
5457 rq = task_rq_lock(p, &flags); 5150 rq = task_rq_lock(p, &flags);
5458 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5151 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5459 task_rq_unlock(rq, &flags); 5152 task_rq_unlock(rq, &flags);
5460 cpu_rq(cpu)->migration_thread = p; 5153 cpu_rq(cpu)->migration_thread = p;
5461 break; 5154 break;
@@ -5486,9 +5179,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5486 rq->migration_thread = NULL; 5179 rq->migration_thread = NULL;
5487 /* Idle task back to normal (off runqueue, low prio) */ 5180 /* Idle task back to normal (off runqueue, low prio) */
5488 rq = task_rq_lock(rq->idle, &flags); 5181 rq = task_rq_lock(rq->idle, &flags);
5489 deactivate_task(rq->idle, rq); 5182 deactivate_task(rq, rq->idle, 0);
5490 rq->idle->static_prio = MAX_PRIO; 5183 rq->idle->static_prio = MAX_PRIO;
5491 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185 rq->idle->sched_class = &idle_sched_class;
5492 migrate_dead_tasks(cpu); 5186 migrate_dead_tasks(cpu);
5493 task_rq_unlock(rq, &flags); 5187 task_rq_unlock(rq, &flags);
5494 migrate_nr_uninterruptible(rq); 5188 migrate_nr_uninterruptible(rq);
@@ -5797,483 +5491,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5797 5491
5798#define SD_NODES_PER_DOMAIN 16 5492#define SD_NODES_PER_DOMAIN 16
5799 5493
5800/*
5801 * Self-tuning task migration cost measurement between source and target CPUs.
5802 *
5803 * This is done by measuring the cost of manipulating buffers of varying
5804 * sizes. For a given buffer-size here are the steps that are taken:
5805 *
5806 * 1) the source CPU reads+dirties a shared buffer
5807 * 2) the target CPU reads+dirties the same shared buffer
5808 *
5809 * We measure how long they take, in the following 4 scenarios:
5810 *
5811 * - source: CPU1, target: CPU2 | cost1
5812 * - source: CPU2, target: CPU1 | cost2
5813 * - source: CPU1, target: CPU1 | cost3
5814 * - source: CPU2, target: CPU2 | cost4
5815 *
5816 * We then calculate the cost3+cost4-cost1-cost2 difference - this is
5817 * the cost of migration.
5818 *
5819 * We then start off from a small buffer-size and iterate up to larger
5820 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5821 * doing a maximum search for the cost. (The maximum cost for a migration
5822 * normally occurs when the working set size is around the effective cache
5823 * size.)
5824 */
5825#define SEARCH_SCOPE 2
5826#define MIN_CACHE_SIZE (64*1024U)
5827#define DEFAULT_CACHE_SIZE (5*1024*1024U)
5828#define ITERATIONS 1
5829#define SIZE_THRESH 130
5830#define COST_THRESH 130
5831
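[Editor's note] A worked example of the measurement this removed block performs, using invented round numbers: with ITERATIONS == 1, measure_cost() below sums the two cross-CPU runs into cost1 and the two same-CPU runs into cost2, then divides each by 2 * ITERATIONS before subtracting. If each cross-CPU run takes 900 us and each same-CPU run 600 us, the per-iteration migration cost comes out as 900 - 600 = 300 us. measure_migration_cost() then reports twice the worst such value over all buffer sizes, 600 us in this example with the default migration_factor, and that figure ends up in sd->cache_hot_time.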
5832/*
5833 * The migration cost is a function of 'domain distance'. Domain
5834 * distance is the number of steps a CPU has to iterate down its
5835 * domain tree to share a domain with the other CPU. The farther
5836 * two CPUs are from each other, the larger the distance gets.
5837 *
5838 * Note that we use the distance only to cache measurement results,
5839 * the distance value is not used numerically otherwise. When two
5840 * CPUs have the same distance it is assumed that the migration
5841 * cost is the same. (this is a simplification but quite practical)
5842 */
5843#define MAX_DOMAIN_DISTANCE 32
5844
5845static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5846 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5847/*
5848 * Architectures may override the migration cost and thus avoid
5849 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5850 * virtualized hardware:
5851 */
5852#ifdef CONFIG_DEFAULT_MIGRATION_COST
5853 CONFIG_DEFAULT_MIGRATION_COST
5854#else
5855 -1LL
5856#endif
5857};
5858
5859/*
5860 * Allow override of migration cost - in units of microseconds.
5861 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5862 * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs:
5863 */
5864static int __init migration_cost_setup(char *str)
5865{
5866 int ints[MAX_DOMAIN_DISTANCE+1], i;
5867
5868 str = get_options(str, ARRAY_SIZE(ints), ints);
5869
5870 printk("#ints: %d\n", ints[0]);
5871 for (i = 1; i <= ints[0]; i++) {
5872 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5873 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5874 }
5875 return 1;
5876}
5877
5878__setup ("migration_cost=", migration_cost_setup);
5879
5880/*
5881 * Global multiplier (divisor) for migration-cutoff values,
5882 * in percentiles. E.g. use a value of 150 to get 1.5 times
5883 * longer cache-hot cutoff times.
5884 *
5885 * (We scale it from 100 to 128 to long long handling easier.)
5886 */
5887
5888#define MIGRATION_FACTOR_SCALE 128
5889
5890static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5891
5892static int __init setup_migration_factor(char *str)
5893{
5894 get_option(&str, &migration_factor);
5895 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5896 return 1;
5897}
5898
5899__setup("migration_factor=", setup_migration_factor);
5900
5901/*
5902 * Estimated distance of two CPUs, measured via the number of domains
5903 * we have to pass for the two CPUs to be in the same span:
5904 */
5905static unsigned long domain_distance(int cpu1, int cpu2)
5906{
5907 unsigned long distance = 0;
5908 struct sched_domain *sd;
5909
5910 for_each_domain(cpu1, sd) {
5911 WARN_ON(!cpu_isset(cpu1, sd->span));
5912 if (cpu_isset(cpu2, sd->span))
5913 return distance;
5914 distance++;
5915 }
5916 if (distance >= MAX_DOMAIN_DISTANCE) {
5917 WARN_ON(1);
5918 distance = MAX_DOMAIN_DISTANCE-1;
5919 }
5920
5921 return distance;
5922}
5923
5924static unsigned int migration_debug;
5925
5926static int __init setup_migration_debug(char *str)
5927{
5928 get_option(&str, &migration_debug);
5929 return 1;
5930}
5931
5932__setup("migration_debug=", setup_migration_debug);
5933
5934/*
5935 * Maximum cache-size that the scheduler should try to measure.
5936 * Architectures with larger caches should tune this up during
5937 * bootup. Gets used in the domain-setup code (i.e. during SMP
5938 * bootup).
5939 */
5940unsigned int max_cache_size;
5941
5942static int __init setup_max_cache_size(char *str)
5943{
5944 get_option(&str, &max_cache_size);
5945 return 1;
5946}
5947
5948__setup("max_cache_size=", setup_max_cache_size);
5949
5950/*
5951 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5952 * is the operation that is timed, so we try to generate unpredictable
5953 * cachemisses that still end up filling the L2 cache:
5954 */
5955static void touch_cache(void *__cache, unsigned long __size)
5956{
5957 unsigned long size = __size / sizeof(long);
5958 unsigned long chunk1 = size / 3;
5959 unsigned long chunk2 = 2 * size / 3;
5960 unsigned long *cache = __cache;
5961 int i;
5962
5963 for (i = 0; i < size/6; i += 8) {
5964 switch (i % 6) {
5965 case 0: cache[i]++;
5966 case 1: cache[size-1-i]++;
5967 case 2: cache[chunk1-i]++;
5968 case 3: cache[chunk1+i]++;
5969 case 4: cache[chunk2-i]++;
5970 case 5: cache[chunk2+i]++;
5971 }
5972 }
5973}
5974
5975/*
5976 * Measure the cache-cost of one task migration. Returns in units of nsec.
5977 */
5978static unsigned long long
5979measure_one(void *cache, unsigned long size, int source, int target)
5980{
5981 cpumask_t mask, saved_mask;
5982 unsigned long long t0, t1, t2, t3, cost;
5983
5984 saved_mask = current->cpus_allowed;
5985
5986 /*
5987 * Flush source caches to RAM and invalidate them:
5988 */
5989 sched_cacheflush();
5990
5991 /*
5992 * Migrate to the source CPU:
5993 */
5994 mask = cpumask_of_cpu(source);
5995 set_cpus_allowed(current, mask);
5996 WARN_ON(smp_processor_id() != source);
5997
5998 /*
5999 * Dirty the working set:
6000 */
6001 t0 = sched_clock();
6002 touch_cache(cache, size);
6003 t1 = sched_clock();
6004
6005 /*
6006 * Migrate to the target CPU, dirty the L2 cache and access
6007 * the shared buffer. (which represents the working set
6008 * of a migrated task.)
6009 */
6010 mask = cpumask_of_cpu(target);
6011 set_cpus_allowed(current, mask);
6012 WARN_ON(smp_processor_id() != target);
6013
6014 t2 = sched_clock();
6015 touch_cache(cache, size);
6016 t3 = sched_clock();
6017
6018 cost = t1-t0 + t3-t2;
6019
6020 if (migration_debug >= 2)
6021 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
6022 source, target, t1-t0, t1-t0, t3-t2, cost);
6023 /*
6024 * Flush target caches to RAM and invalidate them:
6025 */
6026 sched_cacheflush();
6027
6028 set_cpus_allowed(current, saved_mask);
6029
6030 return cost;
6031}
6032
6033/*
6034 * Measure a series of task migrations and return the average
6035 * result. Since this code runs early during bootup the system
6036 * is 'undisturbed' and the average latency makes sense.
6037 *
6038 * The algorithm in essence auto-detects the relevant cache-size,
6039 * so it will properly detect different cachesizes for different
6040 * cache-hierarchies, depending on how the CPUs are connected.
6041 *
6042 * Architectures can prime the upper limit of the search range via
6043 * max_cache_size, otherwise the search range defaults to 20MB...64K.
6044 */
6045static unsigned long long
6046measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
6047{
6048 unsigned long long cost1, cost2;
6049 int i;
6050
6051 /*
6052 * Measure the migration cost of 'size' bytes, over an
6053 * average of 10 runs:
6054 *
6055 * (We perturb the cache size by a small (0..4k)
6056 * value to compensate size/alignment related artifacts.
6057 * We also subtract the cost of the operation done on
6058 * the same CPU.)
6059 */
6060 cost1 = 0;
6061
6062 /*
6063 * dry run, to make sure we start off cache-cold on cpu1,
6064 * and to get any vmalloc pagefaults in advance:
6065 */
6066 measure_one(cache, size, cpu1, cpu2);
6067 for (i = 0; i < ITERATIONS; i++)
6068 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
6069
6070 measure_one(cache, size, cpu2, cpu1);
6071 for (i = 0; i < ITERATIONS; i++)
6072 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
6073
6074 /*
6075 * (We measure the non-migrating [cached] cost on both
6076 * cpu1 and cpu2, to handle CPUs with different speeds)
6077 */
6078 cost2 = 0;
6079
6080 measure_one(cache, size, cpu1, cpu1);
6081 for (i = 0; i < ITERATIONS; i++)
6082 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
6083
6084 measure_one(cache, size, cpu2, cpu2);
6085 for (i = 0; i < ITERATIONS; i++)
6086 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
6087
6088 /*
6089 * Get the per-iteration migration cost:
6090 */
6091 do_div(cost1, 2 * ITERATIONS);
6092 do_div(cost2, 2 * ITERATIONS);
6093
6094 return cost1 - cost2;
6095}
6096
6097static unsigned long long measure_migration_cost(int cpu1, int cpu2)
6098{
6099 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
6100 unsigned int max_size, size, size_found = 0;
6101 long long cost = 0, prev_cost;
6102 void *cache;
6103
6104 /*
6105 * Search from max_cache_size*5 down to 64K - the real relevant
6106 * cachesize has to lie somewhere inbetween.
6107 */
6108 if (max_cache_size) {
6109 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
6110 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
6111 } else {
6112 /*
6113 * Since we have no estimation about the relevant
6114 * search range
6115 */
6116 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
6117 size = MIN_CACHE_SIZE;
6118 }
6119
6120 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
6121 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
6122 return 0;
6123 }
6124
6125 /*
6126 * Allocate the working set:
6127 */
6128 cache = vmalloc(max_size);
6129 if (!cache) {
6130 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
6131 return 1000000; /* return 1 msec on very small boxen */
6132 }
6133
6134 while (size <= max_size) {
6135 prev_cost = cost;
6136 cost = measure_cost(cpu1, cpu2, cache, size);
6137
6138 /*
6139 * Update the max:
6140 */
6141 if (cost > 0) {
6142 if (max_cost < cost) {
6143 max_cost = cost;
6144 size_found = size;
6145 }
6146 }
6147 /*
6148 * Calculate average fluctuation, we use this to prevent
6149 * noise from triggering an early break out of the loop:
6150 */
6151 fluct = abs(cost - prev_cost);
6152 avg_fluct = (avg_fluct + fluct)/2;
6153
6154 if (migration_debug)
6155 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6156 "(%8Ld %8Ld)\n",
6157 cpu1, cpu2, size,
6158 (long)cost / 1000000,
6159 ((long)cost / 100000) % 10,
6160 (long)max_cost / 1000000,
6161 ((long)max_cost / 100000) % 10,
6162 domain_distance(cpu1, cpu2),
6163 cost, avg_fluct);
6164
6165 /*
6166 * If we iterated at least 20% past the previous maximum,
6167 * and the cost has dropped by more than 20% already,
6168 * (taking fluctuations into account) then we assume to
6169 * have found the maximum and break out of the loop early:
6170 */
6171 if (size_found && (size*100 > size_found*SIZE_THRESH))
6172 if (cost+avg_fluct <= 0 ||
6173 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
6174
6175 if (migration_debug)
6176 printk("-> found max.\n");
6177 break;
6178 }
6179 /*
6180 * Increase the cachesize in 10% steps:
6181 */
6182 size = size * 10 / 9;
6183 }
6184
6185 if (migration_debug)
6186 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
6187 cpu1, cpu2, size_found, max_cost);
6188
6189 vfree(cache);
6190
6191 /*
6192 * A task is considered 'cache cold' if at least 2 times
6193 * the worst-case cost of migration has passed.
6194 *
6195 * (this limit is only listened to if the load-balancing
6196 * situation is 'nice' - if there is a large imbalance we
6197 * ignore it for the sake of CPU utilization and
6198 * processing fairness.)
6199 */
6200 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
6201}
6202
6203static void calibrate_migration_costs(const cpumask_t *cpu_map)
6204{
6205 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
6206 unsigned long j0, j1, distance, max_distance = 0;
6207 struct sched_domain *sd;
6208
6209 j0 = jiffies;
6210
6211 /*
6212 * First pass - calculate the cacheflush times:
6213 */
6214 for_each_cpu_mask(cpu1, *cpu_map) {
6215 for_each_cpu_mask(cpu2, *cpu_map) {
6216 if (cpu1 == cpu2)
6217 continue;
6218 distance = domain_distance(cpu1, cpu2);
6219 max_distance = max(max_distance, distance);
6220 /*
6221 * No result cached yet?
6222 */
6223 if (migration_cost[distance] == -1LL)
6224 migration_cost[distance] =
6225 measure_migration_cost(cpu1, cpu2);
6226 }
6227 }
6228 /*
6229 * Second pass - update the sched domain hierarchy with
6230 * the new cache-hot-time estimations:
6231 */
6232 for_each_cpu_mask(cpu, *cpu_map) {
6233 distance = 0;
6234 for_each_domain(cpu, sd) {
6235 sd->cache_hot_time = migration_cost[distance];
6236 distance++;
6237 }
6238 }
6239 /*
6240 * Print the matrix:
6241 */
6242 if (migration_debug)
6243 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
6244 max_cache_size,
6245#ifdef CONFIG_X86
6246 cpu_khz/1000
6247#else
6248 -1
6249#endif
6250 );
6251 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6252 printk("migration_cost=");
6253 for (distance = 0; distance <= max_distance; distance++) {
6254 if (distance)
6255 printk(",");
6256 printk("%ld", (long)migration_cost[distance] / 1000);
6257 }
6258 printk("\n");
6259 }
6260 j1 = jiffies;
6261 if (migration_debug)
6262 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6263
6264 /*
6265 * Move back to the original CPU. NUMA-Q gets confused
6266 * if we migrate to another quad during bootup.
6267 */
6268 if (raw_smp_processor_id() != orig_cpu) {
6269 cpumask_t mask = cpumask_of_cpu(orig_cpu),
6270 saved_mask = current->cpus_allowed;
6271
6272 set_cpus_allowed(current, mask);
6273 set_cpus_allowed(current, saved_mask);
6274 }
6275}
6276
6277#ifdef CONFIG_NUMA 5494#ifdef CONFIG_NUMA
6278 5495
6279/** 5496/**
@@ -6574,7 +5791,6 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6574static int build_sched_domains(const cpumask_t *cpu_map) 5791static int build_sched_domains(const cpumask_t *cpu_map)
6575{ 5792{
6576 int i; 5793 int i;
6577 struct sched_domain *sd;
6578#ifdef CONFIG_NUMA 5794#ifdef CONFIG_NUMA
6579 struct sched_group **sched_group_nodes = NULL; 5795 struct sched_group **sched_group_nodes = NULL;
6580 int sd_allnodes = 0; 5796 int sd_allnodes = 0;
@@ -6582,7 +5798,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6582 /* 5798 /*
6583 * Allocate the per-node list of sched groups 5799 * Allocate the per-node list of sched groups
6584 */ 5800 */
6585 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 5801 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6586 GFP_KERNEL); 5802 GFP_KERNEL);
6587 if (!sched_group_nodes) { 5803 if (!sched_group_nodes) {
6588 printk(KERN_WARNING "Can not alloc sched group node list\n"); 5804 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6601,8 +5817,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6601 cpus_and(nodemask, nodemask, *cpu_map); 5817 cpus_and(nodemask, nodemask, *cpu_map);
6602 5818
6603#ifdef CONFIG_NUMA 5819#ifdef CONFIG_NUMA
6604 if (cpus_weight(*cpu_map) 5820 if (cpus_weight(*cpu_map) >
6605 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 5821 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6606 sd = &per_cpu(allnodes_domains, i); 5822 sd = &per_cpu(allnodes_domains, i);
6607 *sd = SD_ALLNODES_INIT; 5823 *sd = SD_ALLNODES_INIT;
6608 sd->span = *cpu_map; 5824 sd->span = *cpu_map;
@@ -6661,7 +5877,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6661 if (i != first_cpu(this_sibling_map)) 5877 if (i != first_cpu(this_sibling_map))
6662 continue; 5878 continue;
6663 5879
6664 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 5880 init_sched_build_groups(this_sibling_map, cpu_map,
5881 &cpu_to_cpu_group);
6665 } 5882 }
6666#endif 5883#endif
6667 5884
@@ -6672,11 +5889,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6672 cpus_and(this_core_map, this_core_map, *cpu_map); 5889 cpus_and(this_core_map, this_core_map, *cpu_map);
6673 if (i != first_cpu(this_core_map)) 5890 if (i != first_cpu(this_core_map))
6674 continue; 5891 continue;
6675 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 5892 init_sched_build_groups(this_core_map, cpu_map,
5893 &cpu_to_core_group);
6676 } 5894 }
6677#endif 5895#endif
6678 5896
6679
6680 /* Set up physical groups */ 5897 /* Set up physical groups */
6681 for (i = 0; i < MAX_NUMNODES; i++) { 5898 for (i = 0; i < MAX_NUMNODES; i++) {
6682 cpumask_t nodemask = node_to_cpumask(i); 5899 cpumask_t nodemask = node_to_cpumask(i);
@@ -6691,7 +5908,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6691#ifdef CONFIG_NUMA 5908#ifdef CONFIG_NUMA
6692 /* Set up node groups */ 5909 /* Set up node groups */
6693 if (sd_allnodes) 5910 if (sd_allnodes)
6694 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 5911 init_sched_build_groups(*cpu_map, cpu_map,
5912 &cpu_to_allnodes_group);
6695 5913
6696 for (i = 0; i < MAX_NUMNODES; i++) { 5914 for (i = 0; i < MAX_NUMNODES; i++) {
6697 /* Set up node groups */ 5915 /* Set up node groups */
@@ -6719,6 +5937,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6719 sched_group_nodes[i] = sg; 5937 sched_group_nodes[i] = sg;
6720 for_each_cpu_mask(j, nodemask) { 5938 for_each_cpu_mask(j, nodemask) {
6721 struct sched_domain *sd; 5939 struct sched_domain *sd;
5940
6722 sd = &per_cpu(node_domains, j); 5941 sd = &per_cpu(node_domains, j);
6723 sd->groups = sg; 5942 sd->groups = sg;
6724 } 5943 }
@@ -6763,19 +5982,22 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6763 /* Calculate CPU power for physical packages and nodes */ 5982 /* Calculate CPU power for physical packages and nodes */
6764#ifdef CONFIG_SCHED_SMT 5983#ifdef CONFIG_SCHED_SMT
6765 for_each_cpu_mask(i, *cpu_map) { 5984 for_each_cpu_mask(i, *cpu_map) {
6766 sd = &per_cpu(cpu_domains, i); 5985 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5986
6767 init_sched_groups_power(i, sd); 5987 init_sched_groups_power(i, sd);
6768 } 5988 }
6769#endif 5989#endif
6770#ifdef CONFIG_SCHED_MC 5990#ifdef CONFIG_SCHED_MC
6771 for_each_cpu_mask(i, *cpu_map) { 5991 for_each_cpu_mask(i, *cpu_map) {
6772 sd = &per_cpu(core_domains, i); 5992 struct sched_domain *sd = &per_cpu(core_domains, i);
5993
6773 init_sched_groups_power(i, sd); 5994 init_sched_groups_power(i, sd);
6774 } 5995 }
6775#endif 5996#endif
6776 5997
6777 for_each_cpu_mask(i, *cpu_map) { 5998 for_each_cpu_mask(i, *cpu_map) {
6778 sd = &per_cpu(phys_domains, i); 5999 struct sched_domain *sd = &per_cpu(phys_domains, i);
6000
6779 init_sched_groups_power(i, sd); 6001 init_sched_groups_power(i, sd);
6780 } 6002 }
6781 6003
@@ -6803,10 +6025,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6803#endif 6025#endif
6804 cpu_attach_domain(sd, i); 6026 cpu_attach_domain(sd, i);
6805 } 6027 }
6806 /*
6807 * Tune cache-hot values:
6808 */
6809 calibrate_migration_costs(cpu_map);
6810 6028
6811 return 0; 6029 return 0;
6812 6030
@@ -7013,10 +6231,12 @@ void __init sched_init_smp(void)
7013 /* Move init over to a non-isolated CPU */ 6231 /* Move init over to a non-isolated CPU */
7014 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7015 BUG(); 6233 BUG();
6234 sched_init_granularity();
7016} 6235}
7017#else 6236#else
7018void __init sched_init_smp(void) 6237void __init sched_init_smp(void)
7019{ 6238{
6239 sched_init_granularity();
7020} 6240}
7021#endif /* CONFIG_SMP */ 6241#endif /* CONFIG_SMP */
7022 6242
@@ -7030,28 +6250,51 @@ int in_sched_functions(unsigned long addr)
7030 && addr < (unsigned long)__sched_text_end); 6250 && addr < (unsigned long)__sched_text_end);
7031} 6251}
7032 6252
6253static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254{
6255 cfs_rq->tasks_timeline = RB_ROOT;
6256 cfs_rq->fair_clock = 1;
6257#ifdef CONFIG_FAIR_GROUP_SCHED
6258 cfs_rq->rq = rq;
6259#endif
6260}
6261
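[Editor's note] init_cfs_rq() above seeds the per-runqueue CFS state: an empty rbtree timeline and a fair clock starting at 1. Runnable entities are kept in that tree sorted by their fairness key, and the leftmost node is the next one to run. A rough sketch of such an insertion, not the actual sched_fair.c code (the field names run_node and fair_key follow this patch; everything else is illustrative):

        static void demo_timeline_insert(struct rb_root *root,
                                         struct sched_entity *se)
        {
                struct rb_node **link = &root->rb_node, *parent = NULL;

                while (*link) {
                        struct sched_entity *entry;

                        parent = *link;
                        entry = rb_entry(parent, struct sched_entity, run_node);
                        if (se->fair_key < entry->fair_key)
                                link = &parent->rb_left;
                        else
                                link = &parent->rb_right;
                }
                rb_link_node(&se->run_node, parent, link);
                rb_insert_color(&se->run_node, root);
        }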
7033void __init sched_init(void) 6262void __init sched_init(void)
7034{ 6263{
7035 int i, j, k; 6264 u64 now = sched_clock();
7036 int highest_cpu = 0; 6265 int highest_cpu = 0;
6266 int i, j;
6267
6268 /*
6269 * Link up the scheduling class hierarchy:
6270 */
6271 rt_sched_class.next = &fair_sched_class;
6272 fair_sched_class.next = &idle_sched_class;
6273 idle_sched_class.next = NULL;
7037 6274
7038 for_each_possible_cpu(i) { 6275 for_each_possible_cpu(i) {
7039 struct prio_array *array; 6276 struct rt_prio_array *array;
7040 struct rq *rq; 6277 struct rq *rq;
7041 6278
7042 rq = cpu_rq(i); 6279 rq = cpu_rq(i);
7043 spin_lock_init(&rq->lock); 6280 spin_lock_init(&rq->lock);
7044 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7045 rq->nr_running = 0; 6282 rq->nr_running = 0;
7046 rq->active = rq->arrays; 6283 rq->clock = 1;
7047 rq->expired = rq->arrays + 1; 6284 init_cfs_rq(&rq->cfs, rq);
7048 rq->best_expired_prio = MAX_PRIO; 6285#ifdef CONFIG_FAIR_GROUP_SCHED
6286 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288#endif
6289 rq->ls.load_update_last = now;
6290 rq->ls.load_update_start = now;
7049 6291
6292 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293 rq->cpu_load[j] = 0;
7050#ifdef CONFIG_SMP 6294#ifdef CONFIG_SMP
7051 rq->sd = NULL; 6295 rq->sd = NULL;
7052 for (j = 1; j < 3; j++)
7053 rq->cpu_load[j] = 0;
7054 rq->active_balance = 0; 6296 rq->active_balance = 0;
6297 rq->next_balance = jiffies;
7055 rq->push_cpu = 0; 6298 rq->push_cpu = 0;
7056 rq->cpu = i; 6299 rq->cpu = i;
7057 rq->migration_thread = NULL; 6300 rq->migration_thread = NULL;
@@ -7059,16 +6302,14 @@ void __init sched_init(void)
7059#endif 6302#endif
7060 atomic_set(&rq->nr_iowait, 0); 6303 atomic_set(&rq->nr_iowait, 0);
7061 6304
7062 for (j = 0; j < 2; j++) { 6305 array = &rq->rt.active;
7063 array = rq->arrays + j; 6306 for (j = 0; j < MAX_RT_PRIO; j++) {
7064 for (k = 0; k < MAX_PRIO; k++) { 6307 INIT_LIST_HEAD(array->queue + j);
7065 INIT_LIST_HEAD(array->queue + k); 6308 __clear_bit(j, array->bitmap);
7066 __clear_bit(k, array->bitmap);
7067 }
7068 // delimiter for bitsearch
7069 __set_bit(MAX_PRIO, array->bitmap);
7070 } 6309 }
7071 highest_cpu = i; 6310 highest_cpu = i;
6311 /* delimiter for bitsearch: */
6312 __set_bit(MAX_RT_PRIO, array->bitmap);
7072 } 6313 }
7073 6314
7074 set_load_weight(&init_task); 6315 set_load_weight(&init_task);
@@ -7095,6 +6336,10 @@ void __init sched_init(void)
7095 * when this runqueue becomes "idle". 6336 * when this runqueue becomes "idle".
7096 */ 6337 */
7097 init_idle(current, smp_processor_id()); 6338 init_idle(current, smp_processor_id());
6339 /*
6340 * During early bootup we pretend to be a normal task:
6341 */
6342 current->sched_class = &fair_sched_class;
7098} 6343}
7099 6344
7100#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6345#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7125,29 +6370,55 @@ EXPORT_SYMBOL(__might_sleep);
7125#ifdef CONFIG_MAGIC_SYSRQ 6370#ifdef CONFIG_MAGIC_SYSRQ
7126void normalize_rt_tasks(void) 6371void normalize_rt_tasks(void)
7127{ 6372{
7128 struct prio_array *array;
7129 struct task_struct *g, *p; 6373 struct task_struct *g, *p;
7130 unsigned long flags; 6374 unsigned long flags;
7131 struct rq *rq; 6375 struct rq *rq;
6376 int on_rq;
7132 6377
7133 read_lock_irq(&tasklist_lock); 6378 read_lock_irq(&tasklist_lock);
7134
7135 do_each_thread(g, p) { 6379 do_each_thread(g, p) {
7136 if (!rt_task(p)) 6380 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0;
6382 p->se.wait_start_fair = 0;
6383 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0;
6388 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0;
6390
6391 if (!rt_task(p)) {
6392 /*
6393 * Renice negative nice level userspace
6394 * tasks back to 0:
6395 */
6396 if (TASK_NICE(p) < 0 && p->mm)
6397 set_user_nice(p, 0);
7137 continue; 6398 continue;
6399 }
7138 6400
7139 spin_lock_irqsave(&p->pi_lock, flags); 6401 spin_lock_irqsave(&p->pi_lock, flags);
7140 rq = __task_rq_lock(p); 6402 rq = __task_rq_lock(p);
6403#ifdef CONFIG_SMP
6404 /*
6405 * Do not touch the migration thread:
6406 */
6407 if (p == rq->migration_thread)
6408 goto out_unlock;
6409#endif
7141 6410
7142 array = p->array; 6411 on_rq = p->se.on_rq;
7143 if (array) 6412 if (on_rq)
7144 deactivate_task(p, task_rq(p)); 6413 deactivate_task(task_rq(p), p, 0);
7145 __setscheduler(p, SCHED_NORMAL, 0); 6414 __setscheduler(rq, p, SCHED_NORMAL, 0);
7146 if (array) { 6415 if (on_rq) {
7147 __activate_task(p, task_rq(p)); 6416 activate_task(task_rq(p), p, 0);
7148 resched_task(rq->curr); 6417 resched_task(rq->curr);
7149 } 6418 }
7150 6419#ifdef CONFIG_SMP
6420 out_unlock:
6421#endif
7151 __task_rq_unlock(rq); 6422 __task_rq_unlock(rq);
7152 spin_unlock_irqrestore(&p->pi_lock, flags); 6423 spin_unlock_irqrestore(&p->pi_lock, flags);
7153 } while_each_thread(g, p); 6424 } while_each_thread(g, p);