Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1972
1 file changed, 1297 insertions(+), 675 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..b44b9a43b0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
31 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
33 | #include <linux/debug_locks.h> | ||
33 | #include <linux/security.h> | 34 | #include <linux/security.h> |
34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
35 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
@@ -50,6 +51,7 @@ | |||
50 | #include <linux/times.h> | 51 | #include <linux/times.h> |
51 | #include <linux/acct.h> | 52 | #include <linux/acct.h> |
52 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
54 | #include <linux/delayacct.h> | ||
53 | #include <asm/tlb.h> | 55 | #include <asm/tlb.h> |
54 | 56 | ||
55 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
@@ -168,29 +170,28 @@ | |||
168 | */ | 170 | */ |
169 | 171 | ||
170 | #define SCALE_PRIO(x, prio) \ | 172 | #define SCALE_PRIO(x, prio) \ |
171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 173 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
172 | 174 | ||
173 | static unsigned int task_timeslice(task_t *p) | 175 | static unsigned int static_prio_timeslice(int static_prio) |
174 | { | 176 | { |
175 | if (p->static_prio < NICE_TO_PRIO(0)) | 177 | if (static_prio < NICE_TO_PRIO(0)) |
176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
177 | else | 179 | else |
178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 180 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
181 | } | ||
182 | |||
183 | static inline unsigned int task_timeslice(struct task_struct *p) | ||
184 | { | ||
185 | return static_prio_timeslice(p->static_prio); | ||
179 | } | 186 | } |
180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
181 | < (long long) (sd)->cache_hot_time) | ||
182 | 187 | ||
183 | /* | 188 | /* |
184 | * These are the runqueue data structures: | 189 | * These are the runqueue data structures: |
185 | */ | 190 | */ |
186 | 191 | ||
187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
188 | |||
189 | typedef struct runqueue runqueue_t; | ||
190 | |||
191 | struct prio_array { | 192 | struct prio_array { |
192 | unsigned int nr_active; | 193 | unsigned int nr_active; |
193 | unsigned long bitmap[BITMAP_SIZE]; | 194 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
194 | struct list_head queue[MAX_PRIO]; | 195 | struct list_head queue[MAX_PRIO]; |
195 | }; | 196 | }; |
196 | 197 | ||
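For the SCALE_PRIO()/static_prio_timeslice() rework above, a standalone sketch (not part of the patch) that reproduces the arithmetic, assuming the stock constants of this tree (HZ=1000 so one jiffy is 1 ms, DEF_TIMESLICE=100, MIN_TIMESLICE=5, MAX_RT_PRIO=100, MAX_PRIO=140, MAX_USER_PRIO=40):

#include <stdio.h>

#define MAX_RT_PRIO      100            /* assumed stock value */
#define MAX_PRIO         140            /* assumed stock value */
#define MAX_USER_PRIO    40
#define NICE_TO_PRIO(n)  (MAX_RT_PRIO + (n) + 20)
#define MIN_TIMESLICE    5              /* ms, assuming HZ=1000 */
#define DEF_TIMESLICE    100            /* ms, assuming HZ=1000 */

/* same shape as the kernel's SCALE_PRIO(), written without max() */
static unsigned int scale_prio(unsigned int x, int prio)
{
	unsigned int ts = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);
	return ts > MIN_TIMESLICE ? ts : MIN_TIMESLICE;
}

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return scale_prio(DEF_TIMESLICE * 4, static_prio);
	return scale_prio(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	static const int nice_lvl[] = { -20, 0, 19 };
	int i;

	/* prints 800 ms, 100 ms and 5 ms respectively */
	for (i = 0; i < 3; i++)
		printf("nice %3d -> %u ms\n", nice_lvl[i],
		       static_prio_timeslice(NICE_TO_PRIO(nice_lvl[i])));
	return 0;
}

The printed values, 800 ms, 100 ms and 5 ms, are the familiar nice -20 / 0 / +19 timeslices of the O(1) scheduler.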
@@ -201,7 +202,7 @@ struct prio_array { | |||
201 | * (such as the load balancing or the thread migration code), lock | 202 | * (such as the load balancing or the thread migration code), lock |
202 | * acquire operations must be ordered by ascending &runqueue. | 203 | * acquire operations must be ordered by ascending &runqueue. |
203 | */ | 204 | */ |
204 | struct runqueue { | 205 | struct rq { |
205 | spinlock_t lock; | 206 | spinlock_t lock; |
206 | 207 | ||
207 | /* | 208 | /* |
@@ -209,6 +210,7 @@ struct runqueue { | |||
209 | * remote CPUs use both these fields when doing load calculation. | 210 | * remote CPUs use both these fields when doing load calculation. |
210 | */ | 211 | */ |
211 | unsigned long nr_running; | 212 | unsigned long nr_running; |
213 | unsigned long raw_weighted_load; | ||
212 | #ifdef CONFIG_SMP | 214 | #ifdef CONFIG_SMP |
213 | unsigned long cpu_load[3]; | 215 | unsigned long cpu_load[3]; |
214 | #endif | 216 | #endif |
@@ -224,9 +226,9 @@ struct runqueue { | |||
224 | 226 | ||
225 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
226 | unsigned long long timestamp_last_tick; | 228 | unsigned long long timestamp_last_tick; |
227 | task_t *curr, *idle; | 229 | struct task_struct *curr, *idle; |
228 | struct mm_struct *prev_mm; | 230 | struct mm_struct *prev_mm; |
229 | prio_array_t *active, *expired, arrays[2]; | 231 | struct prio_array *active, *expired, arrays[2]; |
230 | int best_expired_prio; | 232 | int best_expired_prio; |
231 | atomic_t nr_iowait; | 233 | atomic_t nr_iowait; |
232 | 234 | ||
@@ -237,9 +239,8 @@ struct runqueue { | |||
237 | int active_balance; | 239 | int active_balance; |
238 | int push_cpu; | 240 | int push_cpu; |
239 | 241 | ||
240 | task_t *migration_thread; | 242 | struct task_struct *migration_thread; |
241 | struct list_head migration_queue; | 243 | struct list_head migration_queue; |
242 | int cpu; | ||
243 | #endif | 244 | #endif |
244 | 245 | ||
245 | #ifdef CONFIG_SCHEDSTATS | 246 | #ifdef CONFIG_SCHEDSTATS |
@@ -261,9 +262,10 @@ struct runqueue { | |||
261 | unsigned long ttwu_cnt; | 262 | unsigned long ttwu_cnt; |
262 | unsigned long ttwu_local; | 263 | unsigned long ttwu_local; |
263 | #endif | 264 | #endif |
265 | struct lock_class_key rq_lock_key; | ||
264 | }; | 266 | }; |
265 | 267 | ||
266 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 268 | static DEFINE_PER_CPU(struct rq, runqueues); |
267 | 269 | ||
268 | /* | 270 | /* |
269 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 271 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
@@ -272,8 +274,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); | |||
272 | * The domain tree of any CPU may only be accessed from within | 274 | * The domain tree of any CPU may only be accessed from within |
273 | * preempt-disabled sections. | 275 | * preempt-disabled sections. |
274 | */ | 276 | */ |
275 | #define for_each_domain(cpu, domain) \ | 277 | #define for_each_domain(cpu, __sd) \ |
276 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | 278 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
277 | 279 | ||
278 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 280 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
279 | #define this_rq() (&__get_cpu_var(runqueues)) | 281 | #define this_rq() (&__get_cpu_var(runqueues)) |
@@ -288,26 +290,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | |||
288 | #endif | 290 | #endif |
289 | 291 | ||
290 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 292 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
291 | static inline int task_running(runqueue_t *rq, task_t *p) | 293 | static inline int task_running(struct rq *rq, struct task_struct *p) |
292 | { | 294 | { |
293 | return rq->curr == p; | 295 | return rq->curr == p; |
294 | } | 296 | } |
295 | 297 | ||
296 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 298 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
297 | { | 299 | { |
298 | } | 300 | } |
299 | 301 | ||
300 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 302 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
301 | { | 303 | { |
302 | #ifdef CONFIG_DEBUG_SPINLOCK | 304 | #ifdef CONFIG_DEBUG_SPINLOCK |
303 | /* this is a valid case when another task releases the spinlock */ | 305 | /* this is a valid case when another task releases the spinlock */ |
304 | rq->lock.owner = current; | 306 | rq->lock.owner = current; |
305 | #endif | 307 | #endif |
308 | /* | ||
309 | * If we are tracking spinlock dependencies then we have to | ||
310 | * fix up the runqueue lock - which gets 'carried over' from | ||
311 | * prev into current: | ||
312 | */ | ||
313 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
314 | |||
306 | spin_unlock_irq(&rq->lock); | 315 | spin_unlock_irq(&rq->lock); |
307 | } | 316 | } |
308 | 317 | ||
309 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 318 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
310 | static inline int task_running(runqueue_t *rq, task_t *p) | 319 | static inline int task_running(struct rq *rq, struct task_struct *p) |
311 | { | 320 | { |
312 | #ifdef CONFIG_SMP | 321 | #ifdef CONFIG_SMP |
313 | return p->oncpu; | 322 | return p->oncpu; |
@@ -316,7 +325,7 @@ static inline int task_running(runqueue_t *rq, task_t *p) | |||
316 | #endif | 325 | #endif |
317 | } | 326 | } |
318 | 327 | ||
319 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 328 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
320 | { | 329 | { |
321 | #ifdef CONFIG_SMP | 330 | #ifdef CONFIG_SMP |
322 | /* | 331 | /* |
@@ -333,7 +342,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | |||
333 | #endif | 342 | #endif |
334 | } | 343 | } |
335 | 344 | ||
336 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 345 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
337 | { | 346 | { |
338 | #ifdef CONFIG_SMP | 347 | #ifdef CONFIG_SMP |
339 | /* | 348 | /* |
@@ -351,14 +360,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 360 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
352 | 361 | ||
353 | /* | 362 | /* |
363 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
364 | * Must be called interrupts disabled. | ||
365 | */ | ||
366 | static inline struct rq *__task_rq_lock(struct task_struct *p) | ||
367 | __acquires(rq->lock) | ||
368 | { | ||
369 | struct rq *rq; | ||
370 | |||
371 | repeat_lock_task: | ||
372 | rq = task_rq(p); | ||
373 | spin_lock(&rq->lock); | ||
374 | if (unlikely(rq != task_rq(p))) { | ||
375 | spin_unlock(&rq->lock); | ||
376 | goto repeat_lock_task; | ||
377 | } | ||
378 | return rq; | ||
379 | } | ||
380 | |||
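A note on the retry loop in __task_rq_lock() above, since the same pattern recurs in task_rq_lock() below:

/*
 * Between reading task_rq(p) and acquiring rq->lock, p may be migrated
 * to another CPU, so its runqueue pointer can change under us.  Only
 * once we hold the lock of the runqueue p is *currently* on is the
 * task<->rq association stable; otherwise drop the lock and retry.
 */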
381 | /* | ||
354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 382 | * task_rq_lock - lock the runqueue a given task resides on and disable |
355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 383 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
356 | * explicitly disabling preemption. | 384 | * explicitly disabling preemption. |
357 | */ | 385 | */ |
358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 386 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
359 | __acquires(rq->lock) | 387 | __acquires(rq->lock) |
360 | { | 388 | { |
361 | struct runqueue *rq; | 389 | struct rq *rq; |
362 | 390 | ||
363 | repeat_lock_task: | 391 | repeat_lock_task: |
364 | local_irq_save(*flags); | 392 | local_irq_save(*flags); |
@@ -371,7 +399,13 @@ repeat_lock_task: | |||
371 | return rq; | 399 | return rq; |
372 | } | 400 | } |
373 | 401 | ||
374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 402 | static inline void __task_rq_unlock(struct rq *rq) |
403 | __releases(rq->lock) | ||
404 | { | ||
405 | spin_unlock(&rq->lock); | ||
406 | } | ||
407 | |||
408 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | ||
375 | __releases(rq->lock) | 409 | __releases(rq->lock) |
376 | { | 410 | { |
377 | spin_unlock_irqrestore(&rq->lock, *flags); | 411 | spin_unlock_irqrestore(&rq->lock, *flags); |
@@ -391,7 +425,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
391 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 425 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
392 | seq_printf(seq, "timestamp %lu\n", jiffies); | 426 | seq_printf(seq, "timestamp %lu\n", jiffies); |
393 | for_each_online_cpu(cpu) { | 427 | for_each_online_cpu(cpu) { |
394 | runqueue_t *rq = cpu_rq(cpu); | 428 | struct rq *rq = cpu_rq(cpu); |
395 | #ifdef CONFIG_SMP | 429 | #ifdef CONFIG_SMP |
396 | struct sched_domain *sd; | 430 | struct sched_domain *sd; |
397 | int dcnt = 0; | 431 | int dcnt = 0; |
@@ -468,9 +502,36 @@ struct file_operations proc_schedstat_operations = { | |||
468 | .release = single_release, | 502 | .release = single_release, |
469 | }; | 503 | }; |
470 | 504 | ||
505 | /* | ||
506 | * Expects runqueue lock to be held for atomicity of update | ||
507 | */ | ||
508 | static inline void | ||
509 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
510 | { | ||
511 | if (rq) { | ||
512 | rq->rq_sched_info.run_delay += delta_jiffies; | ||
513 | rq->rq_sched_info.pcnt++; | ||
514 | } | ||
515 | } | ||
516 | |||
517 | /* | ||
518 | * Expects runqueue lock to be held for atomicity of update | ||
519 | */ | ||
520 | static inline void | ||
521 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
522 | { | ||
523 | if (rq) | ||
524 | rq->rq_sched_info.cpu_time += delta_jiffies; | ||
525 | } | ||
471 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 526 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
472 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 527 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
473 | #else /* !CONFIG_SCHEDSTATS */ | 528 | #else /* !CONFIG_SCHEDSTATS */ |
529 | static inline void | ||
530 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
531 | {} | ||
532 | static inline void | ||
533 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
534 | {} | ||
474 | # define schedstat_inc(rq, field) do { } while (0) | 535 | # define schedstat_inc(rq, field) do { } while (0) |
475 | # define schedstat_add(rq, field, amt) do { } while (0) | 536 | # define schedstat_add(rq, field, amt) do { } while (0) |
476 | #endif | 537 | #endif |
@@ -478,10 +539,10 @@ struct file_operations proc_schedstat_operations = { | |||
478 | /* | 539 | /* |
479 | * rq_lock - lock a given runqueue and disable interrupts. | 540 | * rq_lock - lock a given runqueue and disable interrupts. |
480 | */ | 541 | */ |
481 | static inline runqueue_t *this_rq_lock(void) | 542 | static inline struct rq *this_rq_lock(void) |
482 | __acquires(rq->lock) | 543 | __acquires(rq->lock) |
483 | { | 544 | { |
484 | runqueue_t *rq; | 545 | struct rq *rq; |
485 | 546 | ||
486 | local_irq_disable(); | 547 | local_irq_disable(); |
487 | rq = this_rq(); | 548 | rq = this_rq(); |
@@ -490,7 +551,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
490 | return rq; | 551 | return rq; |
491 | } | 552 | } |
492 | 553 | ||
493 | #ifdef CONFIG_SCHEDSTATS | 554 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
494 | /* | 555 | /* |
495 | * Called when a process is dequeued from the active array and given | 556 | * Called when a process is dequeued from the active array and given |
496 | * the cpu. We should note that with the exception of interactive | 557 | * the cpu. We should note that with the exception of interactive |
@@ -506,7 +567,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
506 | * long it was from the *first* time it was queued to the time that it | 567 | * long it was from the *first* time it was queued to the time that it |
507 | * finally hit a cpu. | 568 | * finally hit a cpu. |
508 | */ | 569 | */ |
509 | static inline void sched_info_dequeued(task_t *t) | 570 | static inline void sched_info_dequeued(struct task_struct *t) |
510 | { | 571 | { |
511 | t->sched_info.last_queued = 0; | 572 | t->sched_info.last_queued = 0; |
512 | } | 573 | } |
@@ -516,23 +577,18 @@ static inline void sched_info_dequeued(task_t *t) | |||
516 | * long it was waiting to run. We also note when it began so that we | 577 | * long it was waiting to run. We also note when it began so that we |
517 | * can keep stats on how long its timeslice is. | 578 | * can keep stats on how long its timeslice is. |
518 | */ | 579 | */ |
519 | static void sched_info_arrive(task_t *t) | 580 | static void sched_info_arrive(struct task_struct *t) |
520 | { | 581 | { |
521 | unsigned long now = jiffies, diff = 0; | 582 | unsigned long now = jiffies, delta_jiffies = 0; |
522 | struct runqueue *rq = task_rq(t); | ||
523 | 583 | ||
524 | if (t->sched_info.last_queued) | 584 | if (t->sched_info.last_queued) |
525 | diff = now - t->sched_info.last_queued; | 585 | delta_jiffies = now - t->sched_info.last_queued; |
526 | sched_info_dequeued(t); | 586 | sched_info_dequeued(t); |
527 | t->sched_info.run_delay += diff; | 587 | t->sched_info.run_delay += delta_jiffies; |
528 | t->sched_info.last_arrival = now; | 588 | t->sched_info.last_arrival = now; |
529 | t->sched_info.pcnt++; | 589 | t->sched_info.pcnt++; |
530 | 590 | ||
531 | if (!rq) | 591 | rq_sched_info_arrive(task_rq(t), delta_jiffies); |
532 | return; | ||
533 | |||
534 | rq->rq_sched_info.run_delay += diff; | ||
535 | rq->rq_sched_info.pcnt++; | ||
536 | } | 592 | } |
537 | 593 | ||
538 | /* | 594 | /* |
@@ -550,25 +606,23 @@ static void sched_info_arrive(task_t *t) | |||
550 | * the timestamp if it is already not set. It's assumed that | 606 | * the timestamp if it is already not set. It's assumed that |
551 | * sched_info_dequeued() will clear that stamp when appropriate. | 607 | * sched_info_dequeued() will clear that stamp when appropriate. |
552 | */ | 608 | */ |
553 | static inline void sched_info_queued(task_t *t) | 609 | static inline void sched_info_queued(struct task_struct *t) |
554 | { | 610 | { |
555 | if (!t->sched_info.last_queued) | 611 | if (unlikely(sched_info_on())) |
556 | t->sched_info.last_queued = jiffies; | 612 | if (!t->sched_info.last_queued) |
613 | t->sched_info.last_queued = jiffies; | ||
557 | } | 614 | } |
558 | 615 | ||
559 | /* | 616 | /* |
560 | * Called when a process ceases being the active-running process, either | 617 | * Called when a process ceases being the active-running process, either |
561 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 618 | * voluntarily or involuntarily. Now we can calculate how long we ran. |
562 | */ | 619 | */ |
563 | static inline void sched_info_depart(task_t *t) | 620 | static inline void sched_info_depart(struct task_struct *t) |
564 | { | 621 | { |
565 | struct runqueue *rq = task_rq(t); | 622 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; |
566 | unsigned long diff = jiffies - t->sched_info.last_arrival; | ||
567 | |||
568 | t->sched_info.cpu_time += diff; | ||
569 | 623 | ||
570 | if (rq) | 624 | t->sched_info.cpu_time += delta_jiffies; |
571 | rq->rq_sched_info.cpu_time += diff; | 625 | rq_sched_info_depart(task_rq(t), delta_jiffies); |
572 | } | 626 | } |
573 | 627 | ||
574 | /* | 628 | /* |
@@ -576,9 +630,10 @@ static inline void sched_info_depart(task_t *t) | |||
576 | * their time slice. (This may also be called when switching to or from | 630 | * their time slice. (This may also be called when switching to or from |
577 | * the idle task.) We are only called when prev != next. | 631 | * the idle task.) We are only called when prev != next. |
578 | */ | 632 | */ |
579 | static inline void sched_info_switch(task_t *prev, task_t *next) | 633 | static inline void |
634 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
580 | { | 635 | { |
581 | struct runqueue *rq = task_rq(prev); | 636 | struct rq *rq = task_rq(prev); |
582 | 637 | ||
583 | /* | 638 | /* |
584 | * prev now departs the cpu. It's not interesting to record | 639 | * prev now departs the cpu. It's not interesting to record |
@@ -591,15 +646,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next) | |||
591 | if (next != rq->idle) | 646 | if (next != rq->idle) |
592 | sched_info_arrive(next); | 647 | sched_info_arrive(next); |
593 | } | 648 | } |
649 | static inline void | ||
650 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
651 | { | ||
652 | if (unlikely(sched_info_on())) | ||
653 | __sched_info_switch(prev, next); | ||
654 | } | ||
594 | #else | 655 | #else |
595 | #define sched_info_queued(t) do { } while (0) | 656 | #define sched_info_queued(t) do { } while (0) |
596 | #define sched_info_switch(t, next) do { } while (0) | 657 | #define sched_info_switch(t, next) do { } while (0) |
597 | #endif /* CONFIG_SCHEDSTATS */ | 658 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
598 | 659 | ||
599 | /* | 660 | /* |
600 | * Adding/removing a task to/from a priority array: | 661 | * Adding/removing a task to/from a priority array: |
601 | */ | 662 | */ |
602 | static void dequeue_task(struct task_struct *p, prio_array_t *array) | 663 | static void dequeue_task(struct task_struct *p, struct prio_array *array) |
603 | { | 664 | { |
604 | array->nr_active--; | 665 | array->nr_active--; |
605 | list_del(&p->run_list); | 666 | list_del(&p->run_list); |
@@ -607,7 +668,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) | |||
607 | __clear_bit(p->prio, array->bitmap); | 668 | __clear_bit(p->prio, array->bitmap); |
608 | } | 669 | } |
609 | 670 | ||
610 | static void enqueue_task(struct task_struct *p, prio_array_t *array) | 671 | static void enqueue_task(struct task_struct *p, struct prio_array *array) |
611 | { | 672 | { |
612 | sched_info_queued(p); | 673 | sched_info_queued(p); |
613 | list_add_tail(&p->run_list, array->queue + p->prio); | 674 | list_add_tail(&p->run_list, array->queue + p->prio); |
@@ -620,12 +681,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) | |||
620 | * Put task to the end of the run list without the overhead of dequeue | 681 | * Put task to the end of the run list without the overhead of dequeue |
621 | * followed by enqueue. | 682 | * followed by enqueue. |
622 | */ | 683 | */ |
623 | static void requeue_task(struct task_struct *p, prio_array_t *array) | 684 | static void requeue_task(struct task_struct *p, struct prio_array *array) |
624 | { | 685 | { |
625 | list_move_tail(&p->run_list, array->queue + p->prio); | 686 | list_move_tail(&p->run_list, array->queue + p->prio); |
626 | } | 687 | } |
627 | 688 | ||
628 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | 689 | static inline void |
690 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | ||
629 | { | 691 | { |
630 | list_add(&p->run_list, array->queue + p->prio); | 692 | list_add(&p->run_list, array->queue + p->prio); |
631 | __set_bit(p->prio, array->bitmap); | 693 | __set_bit(p->prio, array->bitmap); |
@@ -634,7 +696,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
634 | } | 696 | } |
635 | 697 | ||
636 | /* | 698 | /* |
637 | * effective_prio - return the priority that is based on the static | 699 | * __normal_prio - return the priority that is based on the static |
638 | * priority but is modified by bonuses/penalties. | 700 | * priority but is modified by bonuses/penalties. |
639 | * | 701 | * |
640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 702 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
@@ -647,13 +709,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
647 | * | 709 | * |
648 | * Both properties are important to certain workloads. | 710 | * Both properties are important to certain workloads. |
649 | */ | 711 | */ |
650 | static int effective_prio(task_t *p) | 712 | |
713 | static inline int __normal_prio(struct task_struct *p) | ||
651 | { | 714 | { |
652 | int bonus, prio; | 715 | int bonus, prio; |
653 | 716 | ||
654 | if (rt_task(p)) | ||
655 | return p->prio; | ||
656 | |||
657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 717 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
658 | 718 | ||
659 | prio = p->static_prio - bonus; | 719 | prio = p->static_prio - bonus; |
@@ -665,57 +725,165 @@ static int effective_prio(task_t *p) | |||
665 | } | 725 | } |
666 | 726 | ||
667 | /* | 727 | /* |
728 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
729 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
730 | * each task makes to its run queue's load is weighted according to its | ||
731 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
732 | * scaled version of the new time slice allocation that they receive on time | ||
733 | * slice expiry etc. | ||
734 | */ | ||
735 | |||
736 | /* | ||
737 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
738 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
739 | * this code will need modification | ||
740 | */ | ||
741 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
742 | #define LOAD_WEIGHT(lp) \ | ||
743 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
744 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
745 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
746 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
747 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
748 | |||
749 | static void set_load_weight(struct task_struct *p) | ||
750 | { | ||
751 | if (has_rt_policy(p)) { | ||
752 | #ifdef CONFIG_SMP | ||
753 | if (p == task_rq(p)->migration_thread) | ||
754 | /* | ||
755 | * The migration thread does the actual balancing. | ||
756 | * Giving its load any weight will skew balancing | ||
757 | * adversely. | ||
758 | */ | ||
759 | p->load_weight = 0; | ||
760 | else | ||
761 | #endif | ||
762 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
763 | } else | ||
764 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
765 | } | ||
766 | |||
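A worked example of the weights set above (assuming SCHED_LOAD_SCALE == 128 and the HZ=1000 timeslices from the earlier sketch):

/*
 *   nice -20: LOAD_WEIGHT(800) = 800 * 128 / 100 = 1024
 *   nice   0: LOAD_WEIGHT(100) = 100 * 128 / 100 =  128  (== SCHED_LOAD_SCALE)
 *   nice +19: LOAD_WEIGHT(5)   =   5 * 128 / 100 =    6
 *
 * A realtime task gets PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) = 1024 plus
 * LOAD_WEIGHT(rt_priority) on top, so it always outweighs any single
 * SCHED_NORMAL task on the same runqueue.
 */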
767 | static inline void | ||
768 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
769 | { | ||
770 | rq->raw_weighted_load += p->load_weight; | ||
771 | } | ||
772 | |||
773 | static inline void | ||
774 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
775 | { | ||
776 | rq->raw_weighted_load -= p->load_weight; | ||
777 | } | ||
778 | |||
779 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
780 | { | ||
781 | rq->nr_running++; | ||
782 | inc_raw_weighted_load(rq, p); | ||
783 | } | ||
784 | |||
785 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | ||
786 | { | ||
787 | rq->nr_running--; | ||
788 | dec_raw_weighted_load(rq, p); | ||
789 | } | ||
790 | |||
791 | /* | ||
792 | * Calculate the expected normal priority: i.e. priority | ||
793 | * without taking RT-inheritance into account. Might be | ||
794 | * boosted by interactivity modifiers. Changes upon fork, | ||
795 | * setprio syscalls, and whenever the interactivity | ||
796 | * estimator recalculates. | ||
797 | */ | ||
798 | static inline int normal_prio(struct task_struct *p) | ||
799 | { | ||
800 | int prio; | ||
801 | |||
802 | if (has_rt_policy(p)) | ||
803 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
804 | else | ||
805 | prio = __normal_prio(p); | ||
806 | return prio; | ||
807 | } | ||
808 | |||
809 | /* | ||
810 | * Calculate the current priority, i.e. the priority | ||
811 | * taken into account by the scheduler. This value might | ||
812 | * be boosted by RT tasks, or might be boosted by | ||
813 | * interactivity modifiers. Will be RT if the task got | ||
814 | * RT-boosted. If not then it returns p->normal_prio. | ||
815 | */ | ||
816 | static int effective_prio(struct task_struct *p) | ||
817 | { | ||
818 | p->normal_prio = normal_prio(p); | ||
819 | /* | ||
820 | * If we are RT tasks or we were boosted to RT priority, | ||
821 | * keep the priority unchanged. Otherwise, update priority | ||
822 | * to the normal priority: | ||
823 | */ | ||
824 | if (!rt_prio(p->prio)) | ||
825 | return p->normal_prio; | ||
826 | return p->prio; | ||
827 | } | ||
828 | |||
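An illustration of how static_prio, normal_prio and prio now relate (assuming the stock values MAX_RT_PRIO == 100, MAX_PRIO == 140 and MAX_BONUS == 10):

/*
 *   - a nice-0 SCHED_NORMAL task: static_prio == 120, and
 *     __normal_prio() yields 115..125 depending on sleep_avg;
 *   - a SCHED_FIFO task with rt_priority == 50:
 *     normal_prio() == MAX_RT_PRIO-1 - 50 == 49;
 *   - a SCHED_NORMAL task PI-boosted to prio 49 by an rt_mutex:
 *     rt_prio(p->prio) is true, so effective_prio() refreshes
 *     p->normal_prio but leaves the boosted p->prio alone.
 */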
829 | /* | ||
668 | * __activate_task - move a task to the runqueue. | 830 | * __activate_task - move a task to the runqueue. |
669 | */ | 831 | */ |
670 | static void __activate_task(task_t *p, runqueue_t *rq) | 832 | static void __activate_task(struct task_struct *p, struct rq *rq) |
671 | { | 833 | { |
672 | prio_array_t *target = rq->active; | 834 | struct prio_array *target = rq->active; |
673 | 835 | ||
674 | if (batch_task(p)) | 836 | if (batch_task(p)) |
675 | target = rq->expired; | 837 | target = rq->expired; |
676 | enqueue_task(p, target); | 838 | enqueue_task(p, target); |
677 | rq->nr_running++; | 839 | inc_nr_running(p, rq); |
678 | } | 840 | } |
679 | 841 | ||
680 | /* | 842 | /* |
681 | * __activate_idle_task - move idle task to the _front_ of runqueue. | 843 | * __activate_idle_task - move idle task to the _front_ of runqueue. |
682 | */ | 844 | */ |
683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 845 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
684 | { | 846 | { |
685 | enqueue_task_head(p, rq->active); | 847 | enqueue_task_head(p, rq->active); |
686 | rq->nr_running++; | 848 | inc_nr_running(p, rq); |
687 | } | 849 | } |
688 | 850 | ||
689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 851 | /* |
852 | * Recalculate p->normal_prio and p->prio after having slept, | ||
853 | * updating the sleep-average too: | ||
854 | */ | ||
855 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
690 | { | 856 | { |
691 | /* Caller must always ensure 'now >= p->timestamp' */ | 857 | /* Caller must always ensure 'now >= p->timestamp' */ |
692 | unsigned long long __sleep_time = now - p->timestamp; | 858 | unsigned long sleep_time = now - p->timestamp; |
693 | unsigned long sleep_time; | ||
694 | 859 | ||
695 | if (batch_task(p)) | 860 | if (batch_task(p)) |
696 | sleep_time = 0; | 861 | sleep_time = 0; |
697 | else { | ||
698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
700 | else | ||
701 | sleep_time = (unsigned long)__sleep_time; | ||
702 | } | ||
703 | 862 | ||
704 | if (likely(sleep_time > 0)) { | 863 | if (likely(sleep_time > 0)) { |
705 | /* | 864 | /* |
706 | * User tasks that sleep a long time are categorised as | 865 | * This ceiling is set to the lowest priority that would allow |
707 | * idle. They will only have their sleep_avg increased to a | 866 | * a task to be reinserted into the active array on timeslice |
708 | * level that makes them just interactive priority to stay | 867 | * completion. |
709 | * active yet prevent them suddenly becoming cpu hogs and | ||
710 | * starving other processes. | ||
711 | */ | 868 | */ |
712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 869 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
713 | unsigned long ceiling; | ||
714 | 870 | ||
715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 871 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
716 | DEF_TIMESLICE); | 872 | /* |
717 | if (p->sleep_avg < ceiling) | 873 | * Prevents user tasks from achieving best priority |
718 | p->sleep_avg = ceiling; | 874 | * with one single large enough sleep. |
875 | */ | ||
876 | p->sleep_avg = ceiling; | ||
877 | /* | ||
878 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
879 | * nice(0) task 1ms sleep away from promotion, and | ||
880 | * gives it 700ms to round-robin with no chance of | ||
881 | * being demoted. This is more than generous, so | ||
882 | * mark this sleep as non-interactive to prevent the | ||
883 | * on-runqueue bonus logic from intervening should | ||
884 | * this task not receive cpu immediately. | ||
885 | */ | ||
886 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
719 | } else { | 887 | } else { |
720 | /* | 888 | /* |
721 | * Tasks waking from uninterruptible sleep are | 889 | * Tasks waking from uninterruptible sleep are |
@@ -723,12 +891,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
723 | * are likely to be waiting on I/O | 891 | * are likely to be waiting on I/O |
724 | */ | 892 | */ |
725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 893 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 894 | if (p->sleep_avg >= ceiling) |
727 | sleep_time = 0; | 895 | sleep_time = 0; |
728 | else if (p->sleep_avg + sleep_time >= | 896 | else if (p->sleep_avg + sleep_time >= |
729 | INTERACTIVE_SLEEP(p)) { | 897 | ceiling) { |
730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 898 | p->sleep_avg = ceiling; |
731 | sleep_time = 0; | 899 | sleep_time = 0; |
732 | } | 900 | } |
733 | } | 901 | } |
734 | 902 | ||
@@ -742,9 +910,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
742 | */ | 910 | */ |
743 | p->sleep_avg += sleep_time; | 911 | p->sleep_avg += sleep_time; |
744 | 912 | ||
745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
747 | } | 913 | } |
914 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
915 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
748 | } | 916 | } |
749 | 917 | ||
750 | return effective_prio(p); | 918 | return effective_prio(p); |
@@ -756,7 +924,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
756 | * Update all the scheduling statistics stuff. (sleep average | 924 | * Update all the scheduling statistics stuff. (sleep average |
757 | * calculation, priority modifiers, etc.) | 925 | * calculation, priority modifiers, etc.) |
758 | */ | 926 | */ |
759 | static void activate_task(task_t *p, runqueue_t *rq, int local) | 927 | static void activate_task(struct task_struct *p, struct rq *rq, int local) |
760 | { | 928 | { |
761 | unsigned long long now; | 929 | unsigned long long now; |
762 | 930 | ||
@@ -764,7 +932,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
764 | #ifdef CONFIG_SMP | 932 | #ifdef CONFIG_SMP |
765 | if (!local) { | 933 | if (!local) { |
766 | /* Compensate for drifting sched_clock */ | 934 | /* Compensate for drifting sched_clock */ |
767 | runqueue_t *this_rq = this_rq(); | 935 | struct rq *this_rq = this_rq(); |
768 | now = (now - this_rq->timestamp_last_tick) | 936 | now = (now - this_rq->timestamp_last_tick) |
769 | + rq->timestamp_last_tick; | 937 | + rq->timestamp_last_tick; |
770 | } | 938 | } |
@@ -803,9 +971,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
803 | /* | 971 | /* |
804 | * deactivate_task - remove a task from the runqueue. | 972 | * deactivate_task - remove a task from the runqueue. |
805 | */ | 973 | */ |
806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 974 | static void deactivate_task(struct task_struct *p, struct rq *rq) |
807 | { | 975 | { |
808 | rq->nr_running--; | 976 | dec_nr_running(p, rq); |
809 | dequeue_task(p, p->array); | 977 | dequeue_task(p, p->array); |
810 | p->array = NULL; | 978 | p->array = NULL; |
811 | } | 979 | } |
@@ -818,7 +986,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
818 | * the target CPU. | 986 | * the target CPU. |
819 | */ | 987 | */ |
820 | #ifdef CONFIG_SMP | 988 | #ifdef CONFIG_SMP |
821 | static void resched_task(task_t *p) | 989 | |
990 | #ifndef tsk_is_polling | ||
991 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
992 | #endif | ||
993 | |||
994 | static void resched_task(struct task_struct *p) | ||
822 | { | 995 | { |
823 | int cpu; | 996 | int cpu; |
824 | 997 | ||
@@ -833,13 +1006,13 @@ static void resched_task(task_t *p) | |||
833 | if (cpu == smp_processor_id()) | 1006 | if (cpu == smp_processor_id()) |
834 | return; | 1007 | return; |
835 | 1008 | ||
836 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 1009 | /* NEED_RESCHED must be visible before we test polling */ |
837 | smp_mb(); | 1010 | smp_mb(); |
838 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 1011 | if (!tsk_is_polling(p)) |
839 | smp_send_reschedule(cpu); | 1012 | smp_send_reschedule(cpu); |
840 | } | 1013 | } |
841 | #else | 1014 | #else |
842 | static inline void resched_task(task_t *p) | 1015 | static inline void resched_task(struct task_struct *p) |
843 | { | 1016 | { |
844 | assert_spin_locked(&task_rq(p)->lock); | 1017 | assert_spin_locked(&task_rq(p)->lock); |
845 | set_tsk_need_resched(p); | 1018 | set_tsk_need_resched(p); |
@@ -850,28 +1023,35 @@ static inline void resched_task(task_t *p) | |||
850 | * task_curr - is this task currently executing on a CPU? | 1023 | * task_curr - is this task currently executing on a CPU? |
851 | * @p: the task in question. | 1024 | * @p: the task in question. |
852 | */ | 1025 | */ |
853 | inline int task_curr(const task_t *p) | 1026 | inline int task_curr(const struct task_struct *p) |
854 | { | 1027 | { |
855 | return cpu_curr(task_cpu(p)) == p; | 1028 | return cpu_curr(task_cpu(p)) == p; |
856 | } | 1029 | } |
857 | 1030 | ||
1031 | /* Used instead of source_load when we know the type == 0 */ | ||
1032 | unsigned long weighted_cpuload(const int cpu) | ||
1033 | { | ||
1034 | return cpu_rq(cpu)->raw_weighted_load; | ||
1035 | } | ||
1036 | |||
858 | #ifdef CONFIG_SMP | 1037 | #ifdef CONFIG_SMP |
859 | typedef struct { | 1038 | struct migration_req { |
860 | struct list_head list; | 1039 | struct list_head list; |
861 | 1040 | ||
862 | task_t *task; | 1041 | struct task_struct *task; |
863 | int dest_cpu; | 1042 | int dest_cpu; |
864 | 1043 | ||
865 | struct completion done; | 1044 | struct completion done; |
866 | } migration_req_t; | 1045 | }; |
867 | 1046 | ||
868 | /* | 1047 | /* |
869 | * The task's runqueue lock must be held. | 1048 | * The task's runqueue lock must be held. |
870 | * Returns true if you have to wait for migration thread. | 1049 | * Returns true if you have to wait for migration thread. |
871 | */ | 1050 | */ |
872 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | 1051 | static int |
1052 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | ||
873 | { | 1053 | { |
874 | runqueue_t *rq = task_rq(p); | 1054 | struct rq *rq = task_rq(p); |
875 | 1055 | ||
876 | /* | 1056 | /* |
877 | * If the task is not on a runqueue (and not running), then | 1057 | * If the task is not on a runqueue (and not running), then |
@@ -886,6 +1066,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
886 | req->task = p; | 1066 | req->task = p; |
887 | req->dest_cpu = dest_cpu; | 1067 | req->dest_cpu = dest_cpu; |
888 | list_add(&req->list, &rq->migration_queue); | 1068 | list_add(&req->list, &rq->migration_queue); |
1069 | |||
889 | return 1; | 1070 | return 1; |
890 | } | 1071 | } |
891 | 1072 | ||
@@ -898,10 +1079,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
898 | * smp_call_function() if an IPI is sent by the same process we are | 1079 | * smp_call_function() if an IPI is sent by the same process we are |
899 | * waiting to become inactive. | 1080 | * waiting to become inactive. |
900 | */ | 1081 | */ |
901 | void wait_task_inactive(task_t *p) | 1082 | void wait_task_inactive(struct task_struct *p) |
902 | { | 1083 | { |
903 | unsigned long flags; | 1084 | unsigned long flags; |
904 | runqueue_t *rq; | 1085 | struct rq *rq; |
905 | int preempted; | 1086 | int preempted; |
906 | 1087 | ||
907 | repeat: | 1088 | repeat: |
@@ -932,7 +1113,7 @@ repeat: | |||
932 | * to another CPU then no harm is done and the purpose has been | 1113 | * to another CPU then no harm is done and the purpose has been |
933 | * achieved as well. | 1114 | * achieved as well. |
934 | */ | 1115 | */ |
935 | void kick_process(task_t *p) | 1116 | void kick_process(struct task_struct *p) |
936 | { | 1117 | { |
937 | int cpu; | 1118 | int cpu; |
938 | 1119 | ||
@@ -944,32 +1125,45 @@ void kick_process(task_t *p) | |||
944 | } | 1125 | } |
945 | 1126 | ||
946 | /* | 1127 | /* |
947 | * Return a low guess at the load of a migration-source cpu. | 1128 | * Return a low guess at the load of a migration-source cpu weighted |
1129 | * according to the scheduling class and "nice" value. | ||
948 | * | 1130 | * |
949 | * We want to under-estimate the load of migration sources, to | 1131 | * We want to under-estimate the load of migration sources, to |
950 | * balance conservatively. | 1132 | * balance conservatively. |
951 | */ | 1133 | */ |
952 | static inline unsigned long source_load(int cpu, int type) | 1134 | static inline unsigned long source_load(int cpu, int type) |
953 | { | 1135 | { |
954 | runqueue_t *rq = cpu_rq(cpu); | 1136 | struct rq *rq = cpu_rq(cpu); |
955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1137 | |
956 | if (type == 0) | 1138 | if (type == 0) |
957 | return load_now; | 1139 | return rq->raw_weighted_load; |
958 | 1140 | ||
959 | return min(rq->cpu_load[type-1], load_now); | 1141 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
960 | } | 1142 | } |
961 | 1143 | ||
962 | /* | 1144 | /* |
963 | * Return a high guess at the load of a migration-target cpu | 1145 | * Return a high guess at the load of a migration-target cpu weighted |
1146 | * according to the scheduling class and "nice" value. | ||
964 | */ | 1147 | */ |
965 | static inline unsigned long target_load(int cpu, int type) | 1148 | static inline unsigned long target_load(int cpu, int type) |
966 | { | 1149 | { |
967 | runqueue_t *rq = cpu_rq(cpu); | 1150 | struct rq *rq = cpu_rq(cpu); |
968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1151 | |
969 | if (type == 0) | 1152 | if (type == 0) |
970 | return load_now; | 1153 | return rq->raw_weighted_load; |
971 | 1154 | ||
972 | return max(rq->cpu_load[type-1], load_now); | 1155 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); |
1156 | } | ||
1157 | |||
1158 | /* | ||
1159 | * Return the average load per task on the cpu's run queue | ||
1160 | */ | ||
1161 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
1162 | { | ||
1163 | struct rq *rq = cpu_rq(cpu); | ||
1164 | unsigned long n = rq->nr_running; | ||
1165 | |||
1166 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | ||
973 | } | 1167 | } |
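A quick sanity check of cpu_avg_load_per_task() (again assuming SCHED_LOAD_SCALE == 128):

/*
 * Three runnable nice-0 tasks give raw_weighted_load == 3 * 128 == 384,
 * so cpu_avg_load_per_task() == 128.  An idle CPU (nr_running == 0)
 * reports SCHED_LOAD_SCALE as a notional per-task load rather than
 * dividing by zero.
 */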
974 | 1168 | ||
975 | /* | 1169 | /* |
@@ -1042,7 +1236,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1042 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1236 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1043 | 1237 | ||
1044 | for_each_cpu_mask(i, tmp) { | 1238 | for_each_cpu_mask(i, tmp) { |
1045 | load = source_load(i, 0); | 1239 | load = weighted_cpuload(i); |
1046 | 1240 | ||
1047 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1241 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1048 | min_load = load; | 1242 | min_load = load; |
@@ -1069,9 +1263,15 @@ static int sched_balance_self(int cpu, int flag) | |||
1069 | struct task_struct *t = current; | 1263 | struct task_struct *t = current; |
1070 | struct sched_domain *tmp, *sd = NULL; | 1264 | struct sched_domain *tmp, *sd = NULL; |
1071 | 1265 | ||
1072 | for_each_domain(cpu, tmp) | 1266 | for_each_domain(cpu, tmp) { |
1267 | /* | ||
1268 | * If power savings logic is enabled for a domain, stop there. | ||
1269 | */ | ||
1270 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
1271 | break; | ||
1073 | if (tmp->flags & flag) | 1272 | if (tmp->flags & flag) |
1074 | sd = tmp; | 1273 | sd = tmp; |
1274 | } | ||
1075 | 1275 | ||
1076 | while (sd) { | 1276 | while (sd) { |
1077 | cpumask_t span; | 1277 | cpumask_t span; |
@@ -1116,7 +1316,7 @@ nextlevel: | |||
1116 | * Returns the CPU we should wake onto. | 1316 | * Returns the CPU we should wake onto. |
1117 | */ | 1317 | */ |
1118 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1318 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
1119 | static int wake_idle(int cpu, task_t *p) | 1319 | static int wake_idle(int cpu, struct task_struct *p) |
1120 | { | 1320 | { |
1121 | cpumask_t tmp; | 1321 | cpumask_t tmp; |
1122 | struct sched_domain *sd; | 1322 | struct sched_domain *sd; |
@@ -1139,7 +1339,7 @@ static int wake_idle(int cpu, task_t *p) | |||
1139 | return cpu; | 1339 | return cpu; |
1140 | } | 1340 | } |
1141 | #else | 1341 | #else |
1142 | static inline int wake_idle(int cpu, task_t *p) | 1342 | static inline int wake_idle(int cpu, struct task_struct *p) |
1143 | { | 1343 | { |
1144 | return cpu; | 1344 | return cpu; |
1145 | } | 1345 | } |
@@ -1159,15 +1359,15 @@ static inline int wake_idle(int cpu, task_t *p) | |||
1159 | * | 1359 | * |
1160 | * returns failure only if the task is already active. | 1360 | * returns failure only if the task is already active. |
1161 | */ | 1361 | */ |
1162 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) | 1362 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1163 | { | 1363 | { |
1164 | int cpu, this_cpu, success = 0; | 1364 | int cpu, this_cpu, success = 0; |
1165 | unsigned long flags; | 1365 | unsigned long flags; |
1166 | long old_state; | 1366 | long old_state; |
1167 | runqueue_t *rq; | 1367 | struct rq *rq; |
1168 | #ifdef CONFIG_SMP | 1368 | #ifdef CONFIG_SMP |
1169 | unsigned long load, this_load; | ||
1170 | struct sched_domain *sd, *this_sd = NULL; | 1369 | struct sched_domain *sd, *this_sd = NULL; |
1370 | unsigned long load, this_load; | ||
1171 | int new_cpu; | 1371 | int new_cpu; |
1172 | #endif | 1372 | #endif |
1173 | 1373 | ||
@@ -1221,17 +1421,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1221 | 1421 | ||
1222 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1422 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1223 | unsigned long tl = this_load; | 1423 | unsigned long tl = this_load; |
1424 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1425 | |||
1224 | /* | 1426 | /* |
1225 | * If sync wakeup then subtract the (maximum possible) | 1427 | * If sync wakeup then subtract the (maximum possible) |
1226 | * effect of the currently running task from the load | 1428 | * effect of the currently running task from the load |
1227 | * of the current CPU: | 1429 | * of the current CPU: |
1228 | */ | 1430 | */ |
1229 | if (sync) | 1431 | if (sync) |
1230 | tl -= SCHED_LOAD_SCALE; | 1432 | tl -= current->load_weight; |
1231 | 1433 | ||
1232 | if ((tl <= load && | 1434 | if ((tl <= load && |
1233 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1435 | tl + target_load(cpu, idx) <= tl_per_task) || |
1234 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1436 | 100*(tl + p->load_weight) <= imbalance*load) { |
1235 | /* | 1437 | /* |
1236 | * This domain has SD_WAKE_AFFINE and | 1438 | * This domain has SD_WAKE_AFFINE and |
1237 | * p is cache cold in this domain, and | 1439 | * p is cache cold in this domain, and |
@@ -1315,15 +1517,14 @@ out: | |||
1315 | return success; | 1517 | return success; |
1316 | } | 1518 | } |
1317 | 1519 | ||
1318 | int fastcall wake_up_process(task_t *p) | 1520 | int fastcall wake_up_process(struct task_struct *p) |
1319 | { | 1521 | { |
1320 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1522 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1321 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1523 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
1322 | } | 1524 | } |
1323 | |||
1324 | EXPORT_SYMBOL(wake_up_process); | 1525 | EXPORT_SYMBOL(wake_up_process); |
1325 | 1526 | ||
1326 | int fastcall wake_up_state(task_t *p, unsigned int state) | 1527 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
1327 | { | 1528 | { |
1328 | return try_to_wake_up(p, state, 0); | 1529 | return try_to_wake_up(p, state, 0); |
1329 | } | 1530 | } |
@@ -1332,7 +1533,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
1332 | * Perform scheduler related setup for a newly forked process p. | 1533 | * Perform scheduler related setup for a newly forked process p. |
1333 | * p is forked by current. | 1534 | * p is forked by current. |
1334 | */ | 1535 | */ |
1335 | void fastcall sched_fork(task_t *p, int clone_flags) | 1536 | void fastcall sched_fork(struct task_struct *p, int clone_flags) |
1336 | { | 1537 | { |
1337 | int cpu = get_cpu(); | 1538 | int cpu = get_cpu(); |
1338 | 1539 | ||
@@ -1348,10 +1549,17 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1348 | * event cannot wake it up and insert it on the runqueue either. | 1549 | * event cannot wake it up and insert it on the runqueue either. |
1349 | */ | 1550 | */ |
1350 | p->state = TASK_RUNNING; | 1551 | p->state = TASK_RUNNING; |
1552 | |||
1553 | /* | ||
1554 | * Make sure we do not leak PI boosting priority to the child: | ||
1555 | */ | ||
1556 | p->prio = current->normal_prio; | ||
1557 | |||
1351 | INIT_LIST_HEAD(&p->run_list); | 1558 | INIT_LIST_HEAD(&p->run_list); |
1352 | p->array = NULL; | 1559 | p->array = NULL; |
1353 | #ifdef CONFIG_SCHEDSTATS | 1560 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1354 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1561 | if (unlikely(sched_info_on())) |
1562 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
1355 | #endif | 1563 | #endif |
1356 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1564 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1357 | p->oncpu = 0; | 1565 | p->oncpu = 0; |
@@ -1394,11 +1602,11 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1394 | * that must be done for every newly created context, then puts the task | 1602 | * that must be done for every newly created context, then puts the task |
1395 | * on the runqueue and wakes it. | 1603 | * on the runqueue and wakes it. |
1396 | */ | 1604 | */ |
1397 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | 1605 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1398 | { | 1606 | { |
1607 | struct rq *rq, *this_rq; | ||
1399 | unsigned long flags; | 1608 | unsigned long flags; |
1400 | int this_cpu, cpu; | 1609 | int this_cpu, cpu; |
1401 | runqueue_t *rq, *this_rq; | ||
1402 | 1610 | ||
1403 | rq = task_rq_lock(p, &flags); | 1611 | rq = task_rq_lock(p, &flags); |
1404 | BUG_ON(p->state != TASK_RUNNING); | 1612 | BUG_ON(p->state != TASK_RUNNING); |
@@ -1427,10 +1635,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1427 | __activate_task(p, rq); | 1635 | __activate_task(p, rq); |
1428 | else { | 1636 | else { |
1429 | p->prio = current->prio; | 1637 | p->prio = current->prio; |
1638 | p->normal_prio = current->normal_prio; | ||
1430 | list_add_tail(&p->run_list, ¤t->run_list); | 1639 | list_add_tail(&p->run_list, ¤t->run_list); |
1431 | p->array = current->array; | 1640 | p->array = current->array; |
1432 | p->array->nr_active++; | 1641 | p->array->nr_active++; |
1433 | rq->nr_running++; | 1642 | inc_nr_running(p, rq); |
1434 | } | 1643 | } |
1435 | set_need_resched(); | 1644 | set_need_resched(); |
1436 | } else | 1645 | } else |
@@ -1477,10 +1686,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1477 | * artificially, because any timeslice recovered here | 1686 | * artificially, because any timeslice recovered here |
1478 | * was given away by the parent in the first place.) | 1687 | * was given away by the parent in the first place.) |
1479 | */ | 1688 | */ |
1480 | void fastcall sched_exit(task_t *p) | 1689 | void fastcall sched_exit(struct task_struct *p) |
1481 | { | 1690 | { |
1482 | unsigned long flags; | 1691 | unsigned long flags; |
1483 | runqueue_t *rq; | 1692 | struct rq *rq; |
1484 | 1693 | ||
1485 | /* | 1694 | /* |
1486 | * If the child was a (relative-) CPU hog then decrease | 1695 | * If the child was a (relative-) CPU hog then decrease |
@@ -1511,7 +1720,7 @@ void fastcall sched_exit(task_t *p) | |||
1511 | * prepare_task_switch sets up locking and calls architecture specific | 1720 | * prepare_task_switch sets up locking and calls architecture specific |
1512 | * hooks. | 1721 | * hooks. |
1513 | */ | 1722 | */ |
1514 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | 1723 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) |
1515 | { | 1724 | { |
1516 | prepare_lock_switch(rq, next); | 1725 | prepare_lock_switch(rq, next); |
1517 | prepare_arch_switch(next); | 1726 | prepare_arch_switch(next); |
@@ -1532,7 +1741,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | |||
1532 | * with the lock held can cause deadlocks; see schedule() for | 1741 | * with the lock held can cause deadlocks; see schedule() for |
1533 | * details.) | 1742 | * details.) |
1534 | */ | 1743 | */ |
1535 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | 1744 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1536 | __releases(rq->lock) | 1745 | __releases(rq->lock) |
1537 | { | 1746 | { |
1538 | struct mm_struct *mm = rq->prev_mm; | 1747 | struct mm_struct *mm = rq->prev_mm; |
@@ -1570,10 +1779,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | |||
1570 | * schedule_tail - first thing a freshly forked thread must call. | 1779 | * schedule_tail - first thing a freshly forked thread must call. |
1571 | * @prev: the thread we just switched away from. | 1780 | * @prev: the thread we just switched away from. |
1572 | */ | 1781 | */ |
1573 | asmlinkage void schedule_tail(task_t *prev) | 1782 | asmlinkage void schedule_tail(struct task_struct *prev) |
1574 | __releases(rq->lock) | 1783 | __releases(rq->lock) |
1575 | { | 1784 | { |
1576 | runqueue_t *rq = this_rq(); | 1785 | struct rq *rq = this_rq(); |
1786 | |||
1577 | finish_task_switch(rq, prev); | 1787 | finish_task_switch(rq, prev); |
1578 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1788 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
1579 | /* In this case, finish_task_switch does not reenable preemption */ | 1789 | /* In this case, finish_task_switch does not reenable preemption */ |
@@ -1587,8 +1797,9 @@ asmlinkage void schedule_tail(task_t *prev) | |||
1587 | * context_switch - switch to the new MM and the new | 1797 | * context_switch - switch to the new MM and the new |
1588 | * thread's register state. | 1798 | * thread's register state. |
1589 | */ | 1799 | */ |
1590 | static inline | 1800 | static inline struct task_struct * |
1591 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | 1801 | context_switch(struct rq *rq, struct task_struct *prev, |
1802 | struct task_struct *next) | ||
1592 | { | 1803 | { |
1593 | struct mm_struct *mm = next->mm; | 1804 | struct mm_struct *mm = next->mm; |
1594 | struct mm_struct *oldmm = prev->active_mm; | 1805 | struct mm_struct *oldmm = prev->active_mm; |
@@ -1605,6 +1816,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | |||
1605 | WARN_ON(rq->prev_mm); | 1816 | WARN_ON(rq->prev_mm); |
1606 | rq->prev_mm = oldmm; | 1817 | rq->prev_mm = oldmm; |
1607 | } | 1818 | } |
1819 | /* | ||
1820 | * Since the runqueue lock will be released by the next | ||
1821 | * task (which is an invalid locking op but in the case | ||
1822 | * of the scheduler it's an obvious special-case), so we | ||
1823 | * do an early lockdep release here: | ||
1824 | */ | ||
1825 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
1826 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
1827 | #endif | ||
1608 | 1828 | ||
1609 | /* Here we just switch the register state and the stack. */ | 1829 | /* Here we just switch the register state and the stack. */ |
1610 | switch_to(prev, next, prev); | 1830 | switch_to(prev, next, prev); |
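A sketch of how this early spin_release() pairs with the spin_acquire() added to finish_lock_switch() earlier in the patch (control flow only, not code from the patch):

/*
 *   schedule() on task A:
 *       spin_lock_irq(&rq->lock);
 *       ...
 *       context_switch():     spin_release(&rq->lock.dep_map, ...);
 *       switch_to(A, B, A);            <- stack/registers switch to B
 *   now running task B:
 *       finish_lock_switch(): spin_acquire(&rq->lock.dep_map, ...);
 *       spin_unlock_irq(&rq->lock);
 *
 * i.e. lockdep's view of rq->lock is handed over from prev to next,
 * matching the "carried over" comment in finish_lock_switch().
 */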
@@ -1648,7 +1868,8 @@ unsigned long nr_uninterruptible(void) | |||
1648 | 1868 | ||
1649 | unsigned long long nr_context_switches(void) | 1869 | unsigned long long nr_context_switches(void) |
1650 | { | 1870 | { |
1651 | unsigned long long i, sum = 0; | 1871 | int i; |
1872 | unsigned long long sum = 0; | ||
1652 | 1873 | ||
1653 | for_each_possible_cpu(i) | 1874 | for_each_possible_cpu(i) |
1654 | sum += cpu_rq(i)->nr_switches; | 1875 | sum += cpu_rq(i)->nr_switches; |
@@ -1684,15 +1905,21 @@ unsigned long nr_active(void) | |||
1684 | #ifdef CONFIG_SMP | 1905 | #ifdef CONFIG_SMP |
1685 | 1906 | ||
1686 | /* | 1907 | /* |
1908 | * Is this task likely cache-hot: | ||
1909 | */ | ||
1910 | static inline int | ||
1911 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
1912 | { | ||
1913 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | ||
1914 | } | ||
1915 | |||
1916 | /* | ||
1687 | * double_rq_lock - safely lock two runqueues | 1917 | * double_rq_lock - safely lock two runqueues |
1688 | * | 1918 | * |
1689 | * We must take them in cpu order to match code in | ||
1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
1691 | * | ||
1692 | * Note this does not disable interrupts like task_rq_lock, | 1919 | * Note this does not disable interrupts like task_rq_lock, |
1693 | * you need to do so manually before calling. | 1920 | * you need to do so manually before calling. |
1694 | */ | 1921 | */ |
1695 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | 1922 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
1696 | __acquires(rq1->lock) | 1923 | __acquires(rq1->lock) |
1697 | __acquires(rq2->lock) | 1924 | __acquires(rq2->lock) |
1698 | { | 1925 | { |
@@ -1700,7 +1927,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1700 | spin_lock(&rq1->lock); | 1927 | spin_lock(&rq1->lock); |
1701 | __acquire(rq2->lock); /* Fake it out ;) */ | 1928 | __acquire(rq2->lock); /* Fake it out ;) */ |
1702 | } else { | 1929 | } else { |
1703 | if (rq1->cpu < rq2->cpu) { | 1930 | if (rq1 < rq2) { |
1704 | spin_lock(&rq1->lock); | 1931 | spin_lock(&rq1->lock); |
1705 | spin_lock(&rq2->lock); | 1932 | spin_lock(&rq2->lock); |
1706 | } else { | 1933 | } else { |
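
The hunk above switches double_rq_lock() to ordering the two nested locks by runqueue address rather than by CPU number. A minimal userspace sketch of that address-ordering rule, with pthread mutexes standing in for the runqueue spinlocks; the struct and main() are purely illustrative, not kernel API:

#include <pthread.h>
#include <stdio.h>

struct rq_model { pthread_mutex_t lock; };

/* Always take the lower-addressed lock first, so two CPUs that try to
 * lock the same pair in opposite argument order cannot deadlock. */
static void double_rq_lock_model(struct rq_model *rq1, struct rq_model *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);
	} else if (rq1 < rq2) {
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void double_rq_unlock_model(struct rq_model *rq1, struct rq_model *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct rq_model a = { PTHREAD_MUTEX_INITIALIZER };
	struct rq_model b = { PTHREAD_MUTEX_INITIALIZER };

	double_rq_lock_model(&a, &b);	/* same order as (&b, &a) would take */
	puts("both runqueues locked in a deadlock-safe order");
	double_rq_unlock_model(&a, &b);
	return 0;
}

Ordering by address works for any pair of runqueues without needing a ->cpu field, which is why the patch can drop the cpu-order comparison.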
@@ -1716,7 +1943,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1716 | * Note this does not restore interrupts like task_rq_unlock, | 1943 | * Note this does not restore interrupts like task_rq_unlock, |
1717 | * you need to do so manually after calling. | 1944 | * you need to do so manually after calling. |
1718 | */ | 1945 | */ |
1719 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | 1946 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
1720 | __releases(rq1->lock) | 1947 | __releases(rq1->lock) |
1721 | __releases(rq2->lock) | 1948 | __releases(rq2->lock) |
1722 | { | 1949 | { |
@@ -1730,13 +1957,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | |||
1730 | /* | 1957 | /* |
1731 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 1958 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
1732 | */ | 1959 | */ |
1733 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | 1960 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1734 | __releases(this_rq->lock) | 1961 | __releases(this_rq->lock) |
1735 | __acquires(busiest->lock) | 1962 | __acquires(busiest->lock) |
1736 | __acquires(this_rq->lock) | 1963 | __acquires(this_rq->lock) |
1737 | { | 1964 | { |
1738 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1965 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1739 | if (busiest->cpu < this_rq->cpu) { | 1966 | if (busiest < this_rq) { |
1740 | spin_unlock(&this_rq->lock); | 1967 | spin_unlock(&this_rq->lock); |
1741 | spin_lock(&busiest->lock); | 1968 | spin_lock(&busiest->lock); |
1742 | spin_lock(&this_rq->lock); | 1969 | spin_lock(&this_rq->lock); |
@@ -1751,11 +1978,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1751 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1978 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
1752 | * the cpu_allowed mask is restored. | 1979 | * the cpu_allowed mask is restored. |
1753 | */ | 1980 | */ |
1754 | static void sched_migrate_task(task_t *p, int dest_cpu) | 1981 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
1755 | { | 1982 | { |
1756 | migration_req_t req; | 1983 | struct migration_req req; |
1757 | runqueue_t *rq; | ||
1758 | unsigned long flags; | 1984 | unsigned long flags; |
1985 | struct rq *rq; | ||
1759 | 1986 | ||
1760 | rq = task_rq_lock(p, &flags); | 1987 | rq = task_rq_lock(p, &flags); |
1761 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 1988 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
@@ -1766,11 +1993,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu) | |||
1766 | if (migrate_task(p, dest_cpu, &req)) { | 1993 | if (migrate_task(p, dest_cpu, &req)) { |
1767 | /* Need to wait for migration thread (might exit: take ref). */ | 1994 | /* Need to wait for migration thread (might exit: take ref). */ |
1768 | struct task_struct *mt = rq->migration_thread; | 1995 | struct task_struct *mt = rq->migration_thread; |
1996 | |||
1769 | get_task_struct(mt); | 1997 | get_task_struct(mt); |
1770 | task_rq_unlock(rq, &flags); | 1998 | task_rq_unlock(rq, &flags); |
1771 | wake_up_process(mt); | 1999 | wake_up_process(mt); |
1772 | put_task_struct(mt); | 2000 | put_task_struct(mt); |
1773 | wait_for_completion(&req.done); | 2001 | wait_for_completion(&req.done); |
2002 | |||
1774 | return; | 2003 | return; |
1775 | } | 2004 | } |
1776 | out: | 2005 | out: |
@@ -1794,14 +2023,14 @@ void sched_exec(void) | |||
1794 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2023 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1795 | * Both runqueues must be locked. | 2024 | * Both runqueues must be locked. |
1796 | */ | 2025 | */ |
1797 | static | 2026 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, |
1798 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | 2027 | struct task_struct *p, struct rq *this_rq, |
1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 2028 | struct prio_array *this_array, int this_cpu) |
1800 | { | 2029 | { |
1801 | dequeue_task(p, src_array); | 2030 | dequeue_task(p, src_array); |
1802 | src_rq->nr_running--; | 2031 | dec_nr_running(p, src_rq); |
1803 | set_task_cpu(p, this_cpu); | 2032 | set_task_cpu(p, this_cpu); |
1804 | this_rq->nr_running++; | 2033 | inc_nr_running(p, this_rq); |
1805 | enqueue_task(p, this_array); | 2034 | enqueue_task(p, this_array); |
1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2035 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1807 | + this_rq->timestamp_last_tick; | 2036 | + this_rq->timestamp_last_tick; |
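
pull_task() rebases p->timestamp from the source runqueue's clock base to the destination's. A tiny worked example of that arithmetic with invented nanosecond values; the unsigned wraparound in the intermediate term cancels out in the final sum:

#include <stdio.h>

int main(void)
{
	/* Hypothetical per-runqueue clock snapshots, in nanoseconds. */
	unsigned long long src_last_tick = 5000000ULL;
	unsigned long long dst_last_tick = 5200000ULL;
	unsigned long long p_timestamp   = 4900000ULL;	/* on the src clock */

	/* Same expression as in pull_task(): keep the task's age relative
	 * to its runqueue's last tick, re-expressed on the new clock. */
	unsigned long long rebased =
		(p_timestamp - src_last_tick) + dst_last_tick;

	printf("timestamp %llu on src becomes %llu on dst\n",
	       p_timestamp, rebased);
	return 0;
}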
@@ -1817,7 +2046,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1817 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2046 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1818 | */ | 2047 | */ |
1819 | static | 2048 | static |
1820 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 2049 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
1821 | struct sched_domain *sd, enum idle_type idle, | 2050 | struct sched_domain *sd, enum idle_type idle, |
1822 | int *all_pinned) | 2051 | int *all_pinned) |
1823 | { | 2052 | { |
@@ -1848,26 +2077,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1848 | return 1; | 2077 | return 1; |
1849 | } | 2078 | } |
1850 | 2079 | ||
2080 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
2081 | |||
1851 | /* | 2082 | /* |
1852 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2083 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1853 | * as part of a balancing operation within "domain". Returns the number of | 2084 | * load from busiest to this_rq, as part of a balancing operation within |
1854 | * tasks moved. | 2085 | * "domain". Returns the number of tasks moved. |
1855 | * | 2086 | * |
1856 | * Called with both runqueues locked. | 2087 | * Called with both runqueues locked. |
1857 | */ | 2088 | */ |
1858 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2089 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1859 | unsigned long max_nr_move, struct sched_domain *sd, | 2090 | unsigned long max_nr_move, unsigned long max_load_move, |
1860 | enum idle_type idle, int *all_pinned) | 2091 | struct sched_domain *sd, enum idle_type idle, |
2092 | int *all_pinned) | ||
1861 | { | 2093 | { |
1862 | prio_array_t *array, *dst_array; | 2094 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, |
2095 | best_prio_seen, skip_for_load; | ||
2096 | struct prio_array *array, *dst_array; | ||
1863 | struct list_head *head, *curr; | 2097 | struct list_head *head, *curr; |
1864 | int idx, pulled = 0, pinned = 0; | 2098 | struct task_struct *tmp; |
1865 | task_t *tmp; | 2099 | long rem_load_move; |
1866 | 2100 | ||
1867 | if (max_nr_move == 0) | 2101 | if (max_nr_move == 0 || max_load_move == 0) |
1868 | goto out; | 2102 | goto out; |
1869 | 2103 | ||
2104 | rem_load_move = max_load_move; | ||
1870 | pinned = 1; | 2105 | pinned = 1; |
2106 | this_best_prio = rq_best_prio(this_rq); | ||
2107 | best_prio = rq_best_prio(busiest); | ||
2108 | /* | ||
2109 | * Enable handling of the case where there is more than one task | ||
2110 | * with the best priority. If the current running task is one | ||
2111 | * of those with prio==best_prio we know it won't be moved | ||
2112 | * and therefore it's safe to override the skip (based on load) of | ||
2113 | * any task we find with that prio. | ||
2114 | */ | ||
2115 | best_prio_seen = best_prio == busiest->curr->prio; | ||
1871 | 2116 | ||
1872 | /* | 2117 | /* |
1873 | * We first consider expired tasks. Those will likely not be | 2118 | * We first consider expired tasks. Those will likely not be |
@@ -1903,11 +2148,22 @@ skip_bitmap: | |||
1903 | head = array->queue + idx; | 2148 | head = array->queue + idx; |
1904 | curr = head->prev; | 2149 | curr = head->prev; |
1905 | skip_queue: | 2150 | skip_queue: |
1906 | tmp = list_entry(curr, task_t, run_list); | 2151 | tmp = list_entry(curr, struct task_struct, run_list); |
1907 | 2152 | ||
1908 | curr = curr->prev; | 2153 | curr = curr->prev; |
1909 | 2154 | ||
1910 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2155 | /* |
2156 | * To help distribute high-priority tasks across CPUs we don't | ||
2157 | * skip a task if it will be the highest-priority task (i.e. smallest | ||
2158 | * prio value) on its new queue, regardless of its load weight. | ||
2159 | */ | ||
2160 | skip_for_load = tmp->load_weight > rem_load_move; | ||
2161 | if (skip_for_load && idx < this_best_prio) | ||
2162 | skip_for_load = !best_prio_seen && idx == best_prio; | ||
2163 | if (skip_for_load || | ||
2164 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
2165 | |||
2166 | best_prio_seen |= idx == best_prio; | ||
1911 | if (curr != head) | 2167 | if (curr != head) |
1912 | goto skip_queue; | 2168 | goto skip_queue; |
1913 | idx++; | 2169 | idx++; |
@@ -1921,9 +2177,15 @@ skip_queue: | |||
1921 | 2177 | ||
1922 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2178 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1923 | pulled++; | 2179 | pulled++; |
2180 | rem_load_move -= tmp->load_weight; | ||
1924 | 2181 | ||
1925 | /* We only want to steal up to the prescribed number of tasks. */ | 2182 | /* |
1926 | if (pulled < max_nr_move) { | 2183 | * We only want to steal up to the prescribed number of tasks |
2184 | * and the prescribed amount of weighted load. | ||
2185 | */ | ||
2186 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
2187 | if (idx < this_best_prio) | ||
2188 | this_best_prio = idx; | ||
1927 | if (curr != head) | 2189 | if (curr != head) |
1928 | goto skip_queue; | 2190 | goto skip_queue; |
1929 | idx++; | 2191 | idx++; |
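
With these changes move_tasks() enforces two budgets at once: a task count (max_nr_move) and a weighted load (rem_load_move). A simplified userspace model of that double budget, ignoring the priority-based skip override; the struct, weights and priorities are invented for illustration:

#include <stdio.h>

struct task_model { int prio; unsigned long load_weight; };

/* Pull tasks until either the count budget or the load budget runs out. */
static int move_tasks_model(struct task_model *cand, int n,
			    unsigned long max_nr_move,
			    unsigned long max_load_move)
{
	long rem_load_move = (long)max_load_move;
	unsigned long pulled = 0;
	int i;

	for (i = 0; i < n && pulled < max_nr_move && rem_load_move > 0; i++) {
		/* Skip a task whose weight alone exceeds what is left. */
		if (cand[i].load_weight > (unsigned long)rem_load_move)
			continue;
		rem_load_move -= cand[i].load_weight;
		pulled++;
		printf("pulled prio %d, weight %lu, %ld load budget left\n",
		       cand[i].prio, cand[i].load_weight, rem_load_move);
	}
	return (int)pulled;
}

int main(void)
{
	struct task_model cand[] = {
		{ 120, 1024 }, { 130, 512 }, { 110, 2048 }, { 125, 256 },
	};

	printf("moved %d tasks\n", move_tasks_model(cand, 4, 3, 2000));
	return 0;
}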
@@ -1944,8 +2206,8 @@ out: | |||
1944 | 2206 | ||
1945 | /* | 2207 | /* |
1946 | * find_busiest_group finds and returns the busiest CPU group within the | 2208 | * find_busiest_group finds and returns the busiest CPU group within the |
1947 | * domain. It calculates and returns the number of tasks which should be | 2209 | * domain. It calculates and returns the amount of weighted load which |
1948 | * moved to restore balance via the imbalance parameter. | 2210 | * should be moved to restore balance via the imbalance parameter. |
1949 | */ | 2211 | */ |
1950 | static struct sched_group * | 2212 | static struct sched_group * |
1951 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2213 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
@@ -1954,9 +2216,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1954 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2216 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1955 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2217 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1956 | unsigned long max_pull; | 2218 | unsigned long max_pull; |
2219 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
2220 | unsigned long this_load_per_task, this_nr_running; | ||
1957 | int load_idx; | 2221 | int load_idx; |
2222 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2223 | int power_savings_balance = 1; | ||
2224 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
2225 | unsigned long min_nr_running = ULONG_MAX; | ||
2226 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
2227 | #endif | ||
1958 | 2228 | ||
1959 | max_load = this_load = total_load = total_pwr = 0; | 2229 | max_load = this_load = total_load = total_pwr = 0; |
2230 | busiest_load_per_task = busiest_nr_running = 0; | ||
2231 | this_load_per_task = this_nr_running = 0; | ||
1960 | if (idle == NOT_IDLE) | 2232 | if (idle == NOT_IDLE) |
1961 | load_idx = sd->busy_idx; | 2233 | load_idx = sd->busy_idx; |
1962 | else if (idle == NEWLY_IDLE) | 2234 | else if (idle == NEWLY_IDLE) |
@@ -1965,16 +2237,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1965 | load_idx = sd->idle_idx; | 2237 | load_idx = sd->idle_idx; |
1966 | 2238 | ||
1967 | do { | 2239 | do { |
1968 | unsigned long load; | 2240 | unsigned long load, group_capacity; |
1969 | int local_group; | 2241 | int local_group; |
1970 | int i; | 2242 | int i; |
2243 | unsigned long sum_nr_running, sum_weighted_load; | ||
1971 | 2244 | ||
1972 | local_group = cpu_isset(this_cpu, group->cpumask); | 2245 | local_group = cpu_isset(this_cpu, group->cpumask); |
1973 | 2246 | ||
1974 | /* Tally up the load of all CPUs in the group */ | 2247 | /* Tally up the load of all CPUs in the group */ |
1975 | avg_load = 0; | 2248 | sum_weighted_load = sum_nr_running = avg_load = 0; |
1976 | 2249 | ||
1977 | for_each_cpu_mask(i, group->cpumask) { | 2250 | for_each_cpu_mask(i, group->cpumask) { |
2251 | struct rq *rq = cpu_rq(i); | ||
2252 | |||
1978 | if (*sd_idle && !idle_cpu(i)) | 2253 | if (*sd_idle && !idle_cpu(i)) |
1979 | *sd_idle = 0; | 2254 | *sd_idle = 0; |
1980 | 2255 | ||
@@ -1985,6 +2260,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1985 | load = source_load(i, load_idx); | 2260 | load = source_load(i, load_idx); |
1986 | 2261 | ||
1987 | avg_load += load; | 2262 | avg_load += load; |
2263 | sum_nr_running += rq->nr_running; | ||
2264 | sum_weighted_load += rq->raw_weighted_load; | ||
1988 | } | 2265 | } |
1989 | 2266 | ||
1990 | total_load += avg_load; | 2267 | total_load += avg_load; |
@@ -1993,17 +2270,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1993 | /* Adjust by relative CPU power of the group */ | 2270 | /* Adjust by relative CPU power of the group */ |
1994 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2271 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1995 | 2272 | ||
2273 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
2274 | |||
1996 | if (local_group) { | 2275 | if (local_group) { |
1997 | this_load = avg_load; | 2276 | this_load = avg_load; |
1998 | this = group; | 2277 | this = group; |
1999 | } else if (avg_load > max_load) { | 2278 | this_nr_running = sum_nr_running; |
2279 | this_load_per_task = sum_weighted_load; | ||
2280 | } else if (avg_load > max_load && | ||
2281 | sum_nr_running > group_capacity) { | ||
2000 | max_load = avg_load; | 2282 | max_load = avg_load; |
2001 | busiest = group; | 2283 | busiest = group; |
2284 | busiest_nr_running = sum_nr_running; | ||
2285 | busiest_load_per_task = sum_weighted_load; | ||
2002 | } | 2286 | } |
2287 | |||
2288 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2289 | /* | ||
2290 | * Busy processors will not participate in power savings | ||
2291 | * balance. | ||
2292 | */ | ||
2293 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2294 | goto group_next; | ||
2295 | |||
2296 | /* | ||
2297 | * If the local group is idle or completely loaded, | ||
2298 | * there is no need to do power savings balance at this domain. | ||
2299 | */ | ||
2300 | if (local_group && (this_nr_running >= group_capacity || | ||
2301 | !this_nr_running)) | ||
2302 | power_savings_balance = 0; | ||
2303 | |||
2304 | /* | ||
2305 | * If a group is already running at full capacity or idle, | ||
2306 | * don't include that group in power savings calculations | ||
2307 | */ | ||
2308 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
2309 | || !sum_nr_running) | ||
2310 | goto group_next; | ||
2311 | |||
2312 | /* | ||
2313 | * Calculate the group which has the least non-idle load. | ||
2314 | * This is the group from which we need to pull load | ||
2315 | * in order to save power. | ||
2316 | */ | ||
2317 | if ((sum_nr_running < min_nr_running) || | ||
2318 | (sum_nr_running == min_nr_running && | ||
2319 | first_cpu(group->cpumask) < | ||
2320 | first_cpu(group_min->cpumask))) { | ||
2321 | group_min = group; | ||
2322 | min_nr_running = sum_nr_running; | ||
2323 | min_load_per_task = sum_weighted_load / | ||
2324 | sum_nr_running; | ||
2325 | } | ||
2326 | |||
2327 | /* | ||
2328 | * Calculate the group which is near its | ||
2329 | * capacity but still has some room to pick up load | ||
2330 | * from another group and save more power. | ||
2331 | */ | ||
2332 | if (sum_nr_running <= group_capacity - 1) { | ||
2333 | if (sum_nr_running > leader_nr_running || | ||
2334 | (sum_nr_running == leader_nr_running && | ||
2335 | first_cpu(group->cpumask) > | ||
2336 | first_cpu(group_leader->cpumask))) { | ||
2337 | group_leader = group; | ||
2338 | leader_nr_running = sum_nr_running; | ||
2339 | } | ||
2340 | } | ||
2341 | group_next: | ||
2342 | #endif | ||
2003 | group = group->next; | 2343 | group = group->next; |
2004 | } while (group != sd->groups); | 2344 | } while (group != sd->groups); |
2005 | 2345 | ||
2006 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2346 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2007 | goto out_balanced; | 2347 | goto out_balanced; |
2008 | 2348 | ||
2009 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2349 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
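
The power-savings block above picks two groups per domain scan: group_min (the least-loaded non-idle group, the source to drain) and group_leader (the busiest group that still has a free slot, the packing target). A stripped-down model of that selection, ignoring the first_cpu tie-breaks; the group data is invented:

#include <limits.h>
#include <stdio.h>

struct sg_model { int first_cpu; unsigned long nr_running, capacity; };

int main(void)
{
	/* Hypothetical sched groups: capacity is how many tasks they take
	 * at full power, nr_running is their current load. */
	struct sg_model g[] = { { 0, 1, 2 }, { 2, 3, 4 }, { 6, 2, 4 } };
	struct sg_model *group_min = NULL, *group_leader = NULL;
	unsigned long min_nr_running = ULONG_MAX, leader_nr_running = 0;
	int i;

	for (i = 0; i < 3; i++) {
		/* Idle or completely loaded groups don't participate. */
		if (!g[i].nr_running || g[i].nr_running >= g[i].capacity)
			continue;
		/* Least-loaded non-idle group: the one to migrate away from. */
		if (g[i].nr_running < min_nr_running) {
			group_min = &g[i];
			min_nr_running = g[i].nr_running;
		}
		/* Busiest group that still has a free slot: packing target. */
		if (g[i].nr_running <= g[i].capacity - 1 &&
		    g[i].nr_running > leader_nr_running) {
			group_leader = &g[i];
			leader_nr_running = g[i].nr_running;
		}
	}
	if (group_min && group_leader && group_min != group_leader)
		printf("pack: move load from group@cpu%d to group@cpu%d\n",
		       group_min->first_cpu, group_leader->first_cpu);
	return 0;
}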
@@ -2012,6 +2352,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2012 | 100*max_load <= sd->imbalance_pct*this_load) | 2352 | 100*max_load <= sd->imbalance_pct*this_load) |
2013 | goto out_balanced; | 2353 | goto out_balanced; |
2014 | 2354 | ||
2355 | busiest_load_per_task /= busiest_nr_running; | ||
2015 | /* | 2356 | /* |
2016 | * We're trying to get all the cpus to the average_load, so we don't | 2357 | * We're trying to get all the cpus to the average_load, so we don't |
2017 | * want to push ourselves above the average load, nor do we wish to | 2358 | * want to push ourselves above the average load, nor do we wish to |
@@ -2023,21 +2364,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2023 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2364 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2024 | * appear as very large values with unsigned longs. | 2365 | * appear as very large values with unsigned longs. |
2025 | */ | 2366 | */ |
2367 | if (max_load <= busiest_load_per_task) | ||
2368 | goto out_balanced; | ||
2369 | |||
2370 | /* | ||
2371 | * In the presence of smp nice balancing, certain scenarios can have | ||
2372 | * max load less than avg load(as we skip the groups at or below | ||
2373 | * its cpu_power, while calculating max_load..) | ||
2374 | */ | ||
2375 | if (max_load < avg_load) { | ||
2376 | *imbalance = 0; | ||
2377 | goto small_imbalance; | ||
2378 | } | ||
2026 | 2379 | ||
2027 | /* Don't want to pull so many tasks that a group would go idle */ | 2380 | /* Don't want to pull so many tasks that a group would go idle */ |
2028 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2381 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2029 | 2382 | ||
2030 | /* How much load to actually move to equalise the imbalance */ | 2383 | /* How much load to actually move to equalise the imbalance */ |
2031 | *imbalance = min(max_pull * busiest->cpu_power, | 2384 | *imbalance = min(max_pull * busiest->cpu_power, |
2032 | (avg_load - this_load) * this->cpu_power) | 2385 | (avg_load - this_load) * this->cpu_power) |
2033 | / SCHED_LOAD_SCALE; | 2386 | / SCHED_LOAD_SCALE; |
2034 | 2387 | ||
2035 | if (*imbalance < SCHED_LOAD_SCALE) { | 2388 | /* |
2036 | unsigned long pwr_now = 0, pwr_move = 0; | 2389 | * if *imbalance is less than the average load per runnable task |
2037 | unsigned long tmp; | 2390 | * there is no gaurantee that any tasks will be moved so we'll have |
2391 | * a think about bumping its value to force at least one task to be | ||
2392 | * moved | ||
2393 | */ | ||
2394 | if (*imbalance < busiest_load_per_task) { | ||
2395 | unsigned long tmp, pwr_now, pwr_move; | ||
2396 | unsigned int imbn; | ||
2397 | |||
2398 | small_imbalance: | ||
2399 | pwr_move = pwr_now = 0; | ||
2400 | imbn = 2; | ||
2401 | if (this_nr_running) { | ||
2402 | this_load_per_task /= this_nr_running; | ||
2403 | if (busiest_load_per_task > this_load_per_task) | ||
2404 | imbn = 1; | ||
2405 | } else | ||
2406 | this_load_per_task = SCHED_LOAD_SCALE; | ||
2038 | 2407 | ||
2039 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2408 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
2040 | *imbalance = 1; | 2409 | *imbalance = busiest_load_per_task; |
2041 | return busiest; | 2410 | return busiest; |
2042 | } | 2411 | } |
2043 | 2412 | ||
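
The imbalance is now expressed in weighted-load units rather than task counts. A small worked example of the max_pull/imbalance computation above, assuming a SCHED_LOAD_SCALE of 128 and single-CPU groups at unit cpu_power; all figures are made up:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed scale factor for this example */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical per-group figures, already scaled by SCHED_LOAD_SCALE. */
	unsigned long max_load = 3 * SCHED_LOAD_SCALE;	/* busiest group */
	unsigned long this_load = 1 * SCHED_LOAD_SCALE;	/* local group   */
	unsigned long avg_load = 2 * SCHED_LOAD_SCALE;
	unsigned long busiest_load_per_task = SCHED_LOAD_SCALE;
	unsigned long busiest_power = SCHED_LOAD_SCALE;
	unsigned long this_power = SCHED_LOAD_SCALE;
	unsigned long max_pull, imbalance;

	/* Don't pull so much that the busiest group drops below one task. */
	max_pull = min_ul(max_load - avg_load,
			  max_load - busiest_load_per_task);

	/* Weighted load to move: limited both by what busiest can give up
	 * and by how far below the average the local group sits. */
	imbalance = min_ul(max_pull * busiest_power,
			   (avg_load - this_load) * this_power)
			/ SCHED_LOAD_SCALE;

	printf("move %lu units of weighted load (~%lu task(s) of average weight)\n",
	       imbalance, imbalance / busiest_load_per_task);
	return 0;
}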
@@ -2047,39 +2416,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2047 | * moving them. | 2416 | * moving them. |
2048 | */ | 2417 | */ |
2049 | 2418 | ||
2050 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2419 | pwr_now += busiest->cpu_power * |
2051 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2420 | min(busiest_load_per_task, max_load); |
2421 | pwr_now += this->cpu_power * | ||
2422 | min(this_load_per_task, this_load); | ||
2052 | pwr_now /= SCHED_LOAD_SCALE; | 2423 | pwr_now /= SCHED_LOAD_SCALE; |
2053 | 2424 | ||
2054 | /* Amount of load we'd subtract */ | 2425 | /* Amount of load we'd subtract */ |
2055 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2426 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
2056 | if (max_load > tmp) | 2427 | if (max_load > tmp) |
2057 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2428 | pwr_move += busiest->cpu_power * |
2058 | max_load - tmp); | 2429 | min(busiest_load_per_task, max_load - tmp); |
2059 | 2430 | ||
2060 | /* Amount of load we'd add */ | 2431 | /* Amount of load we'd add */ |
2061 | if (max_load*busiest->cpu_power < | 2432 | if (max_load*busiest->cpu_power < |
2062 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2433 | busiest_load_per_task*SCHED_LOAD_SCALE) |
2063 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2434 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
2064 | else | 2435 | else |
2065 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2436 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
2066 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2437 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
2067 | pwr_move /= SCHED_LOAD_SCALE; | 2438 | pwr_move /= SCHED_LOAD_SCALE; |
2068 | 2439 | ||
2069 | /* Move if we gain throughput */ | 2440 | /* Move if we gain throughput */ |
2070 | if (pwr_move <= pwr_now) | 2441 | if (pwr_move <= pwr_now) |
2071 | goto out_balanced; | 2442 | goto out_balanced; |
2072 | 2443 | ||
2073 | *imbalance = 1; | 2444 | *imbalance = busiest_load_per_task; |
2074 | return busiest; | ||
2075 | } | 2445 | } |
2076 | 2446 | ||
2077 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
2078 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
2079 | return busiest; | 2447 | return busiest; |
2080 | 2448 | ||
2081 | out_balanced: | 2449 | out_balanced: |
2450 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2451 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2452 | goto ret; | ||
2082 | 2453 | ||
2454 | if (this == group_leader && group_leader != group_min) { | ||
2455 | *imbalance = min_load_per_task; | ||
2456 | return group_min; | ||
2457 | } | ||
2458 | ret: | ||
2459 | #endif | ||
2083 | *imbalance = 0; | 2460 | *imbalance = 0; |
2084 | return NULL; | 2461 | return NULL; |
2085 | } | 2462 | } |
@@ -2087,19 +2464,23 @@ out_balanced: | |||
2087 | /* | 2464 | /* |
2088 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2465 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2089 | */ | 2466 | */ |
2090 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2467 | static struct rq * |
2091 | enum idle_type idle) | 2468 | find_busiest_queue(struct sched_group *group, enum idle_type idle, |
2469 | unsigned long imbalance) | ||
2092 | { | 2470 | { |
2093 | unsigned long load, max_load = 0; | 2471 | struct rq *busiest = NULL, *rq; |
2094 | runqueue_t *busiest = NULL; | 2472 | unsigned long max_load = 0; |
2095 | int i; | 2473 | int i; |
2096 | 2474 | ||
2097 | for_each_cpu_mask(i, group->cpumask) { | 2475 | for_each_cpu_mask(i, group->cpumask) { |
2098 | load = source_load(i, 0); | 2476 | rq = cpu_rq(i); |
2099 | 2477 | ||
2100 | if (load > max_load) { | 2478 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) |
2101 | max_load = load; | 2479 | continue; |
2102 | busiest = cpu_rq(i); | 2480 | |
2481 | if (rq->raw_weighted_load > max_load) { | ||
2482 | max_load = rq->raw_weighted_load; | ||
2483 | busiest = rq; | ||
2103 | } | 2484 | } |
2104 | } | 2485 | } |
2105 | 2486 | ||
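
find_busiest_queue() now compares raw_weighted_load and skips runqueues whose single task is heavier than the requested imbalance, since moving that task would merely relocate the overload. A small model of the selection with invented loads:

#include <stdio.h>

struct rq_model { int cpu; unsigned long nr_running, raw_weighted_load; };

int main(void)
{
	/* Hypothetical runqueues inside the busiest group. */
	struct rq_model rqs[] = {
		{ 0, 1, 3000 },	/* one heavy task: pulling it just moves the problem */
		{ 1, 3, 2500 },
		{ 2, 2, 1800 },
	};
	unsigned long imbalance = 2000, max_load = 0;
	struct rq_model *busiest = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		/* A lone task above the imbalance can't usefully be pulled. */
		if (rqs[i].nr_running == 1 &&
		    rqs[i].raw_weighted_load > imbalance)
			continue;
		if (rqs[i].raw_weighted_load > max_load) {
			max_load = rqs[i].raw_weighted_load;
			busiest = &rqs[i];
		}
	}
	if (busiest)
		printf("busiest is cpu %d (weighted load %lu)\n",
		       busiest->cpu, busiest->raw_weighted_load);
	return 0;
}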
@@ -2112,23 +2493,27 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
2112 | */ | 2493 | */ |
2113 | #define MAX_PINNED_INTERVAL 512 | 2494 | #define MAX_PINNED_INTERVAL 512 |
2114 | 2495 | ||
2496 | static inline unsigned long minus_1_or_zero(unsigned long n) | ||
2497 | { | ||
2498 | return n > 0 ? n - 1 : 0; | ||
2499 | } | ||
2500 | |||
2115 | /* | 2501 | /* |
2116 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2502 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2117 | * tasks if there is an imbalance. | 2503 | * tasks if there is an imbalance. |
2118 | * | 2504 | * |
2119 | * Called with this_rq unlocked. | 2505 | * Called with this_rq unlocked. |
2120 | */ | 2506 | */ |
2121 | static int load_balance(int this_cpu, runqueue_t *this_rq, | 2507 | static int load_balance(int this_cpu, struct rq *this_rq, |
2122 | struct sched_domain *sd, enum idle_type idle) | 2508 | struct sched_domain *sd, enum idle_type idle) |
2123 | { | 2509 | { |
2510 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
2124 | struct sched_group *group; | 2511 | struct sched_group *group; |
2125 | runqueue_t *busiest; | ||
2126 | unsigned long imbalance; | 2512 | unsigned long imbalance; |
2127 | int nr_moved, all_pinned = 0; | 2513 | struct rq *busiest; |
2128 | int active_balance = 0; | ||
2129 | int sd_idle = 0; | ||
2130 | 2514 | ||
2131 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2515 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2516 | !sched_smt_power_savings) | ||
2132 | sd_idle = 1; | 2517 | sd_idle = 1; |
2133 | 2518 | ||
2134 | schedstat_inc(sd, lb_cnt[idle]); | 2519 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -2139,7 +2524,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2139 | goto out_balanced; | 2524 | goto out_balanced; |
2140 | } | 2525 | } |
2141 | 2526 | ||
2142 | busiest = find_busiest_queue(group, idle); | 2527 | busiest = find_busiest_queue(group, idle, imbalance); |
2143 | if (!busiest) { | 2528 | if (!busiest) { |
2144 | schedstat_inc(sd, lb_nobusyq[idle]); | 2529 | schedstat_inc(sd, lb_nobusyq[idle]); |
2145 | goto out_balanced; | 2530 | goto out_balanced; |
@@ -2159,7 +2544,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2159 | */ | 2544 | */ |
2160 | double_rq_lock(this_rq, busiest); | 2545 | double_rq_lock(this_rq, busiest); |
2161 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2546 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2162 | imbalance, sd, idle, &all_pinned); | 2547 | minus_1_or_zero(busiest->nr_running), |
2548 | imbalance, sd, idle, &all_pinned); | ||
2163 | double_rq_unlock(this_rq, busiest); | 2549 | double_rq_unlock(this_rq, busiest); |
2164 | 2550 | ||
2165 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2551 | /* All tasks on this runqueue were pinned by CPU affinity */ |
@@ -2216,7 +2602,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2216 | sd->balance_interval *= 2; | 2602 | sd->balance_interval *= 2; |
2217 | } | 2603 | } |
2218 | 2604 | ||
2219 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2605 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2606 | !sched_smt_power_savings) | ||
2220 | return -1; | 2607 | return -1; |
2221 | return nr_moved; | 2608 | return nr_moved; |
2222 | 2609 | ||
@@ -2231,7 +2618,8 @@ out_one_pinned: | |||
2231 | (sd->balance_interval < sd->max_interval)) | 2618 | (sd->balance_interval < sd->max_interval)) |
2232 | sd->balance_interval *= 2; | 2619 | sd->balance_interval *= 2; |
2233 | 2620 | ||
2234 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2621 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2622 | !sched_smt_power_savings) | ||
2235 | return -1; | 2623 | return -1; |
2236 | return 0; | 2624 | return 0; |
2237 | } | 2625 | } |
@@ -2243,16 +2631,16 @@ out_one_pinned: | |||
2243 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2631 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). |
2244 | * this_rq is locked. | 2632 | * this_rq is locked. |
2245 | */ | 2633 | */ |
2246 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | 2634 | static int |
2247 | struct sched_domain *sd) | 2635 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
2248 | { | 2636 | { |
2249 | struct sched_group *group; | 2637 | struct sched_group *group; |
2250 | runqueue_t *busiest = NULL; | 2638 | struct rq *busiest = NULL; |
2251 | unsigned long imbalance; | 2639 | unsigned long imbalance; |
2252 | int nr_moved = 0; | 2640 | int nr_moved = 0; |
2253 | int sd_idle = 0; | 2641 | int sd_idle = 0; |
2254 | 2642 | ||
2255 | if (sd->flags & SD_SHARE_CPUPOWER) | 2643 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2256 | sd_idle = 1; | 2644 | sd_idle = 1; |
2257 | 2645 | ||
2258 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2646 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
@@ -2262,7 +2650,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2262 | goto out_balanced; | 2650 | goto out_balanced; |
2263 | } | 2651 | } |
2264 | 2652 | ||
2265 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2653 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); |
2266 | if (!busiest) { | 2654 | if (!busiest) { |
2267 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2655 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2268 | goto out_balanced; | 2656 | goto out_balanced; |
@@ -2277,6 +2665,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2277 | /* Attempt to move tasks */ | 2665 | /* Attempt to move tasks */ |
2278 | double_lock_balance(this_rq, busiest); | 2666 | double_lock_balance(this_rq, busiest); |
2279 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2667 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2668 | minus_1_or_zero(busiest->nr_running), | ||
2280 | imbalance, sd, NEWLY_IDLE, NULL); | 2669 | imbalance, sd, NEWLY_IDLE, NULL); |
2281 | spin_unlock(&busiest->lock); | 2670 | spin_unlock(&busiest->lock); |
2282 | } | 2671 | } |
@@ -2292,9 +2681,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2292 | 2681 | ||
2293 | out_balanced: | 2682 | out_balanced: |
2294 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2683 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2295 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2684 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2685 | !sched_smt_power_savings) | ||
2296 | return -1; | 2686 | return -1; |
2297 | sd->nr_balance_failed = 0; | 2687 | sd->nr_balance_failed = 0; |
2688 | |||
2298 | return 0; | 2689 | return 0; |
2299 | } | 2690 | } |
2300 | 2691 | ||
@@ -2302,16 +2693,15 @@ out_balanced: | |||
2302 | * idle_balance is called by schedule() if this_cpu is about to become | 2693 | * idle_balance is called by schedule() if this_cpu is about to become |
2303 | * idle. Attempts to pull tasks from other CPUs. | 2694 | * idle. Attempts to pull tasks from other CPUs. |
2304 | */ | 2695 | */ |
2305 | static void idle_balance(int this_cpu, runqueue_t *this_rq) | 2696 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2306 | { | 2697 | { |
2307 | struct sched_domain *sd; | 2698 | struct sched_domain *sd; |
2308 | 2699 | ||
2309 | for_each_domain(this_cpu, sd) { | 2700 | for_each_domain(this_cpu, sd) { |
2310 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2701 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
2311 | if (load_balance_newidle(this_cpu, this_rq, sd)) { | 2702 | /* If we've pulled tasks over stop searching: */ |
2312 | /* We've pulled tasks over so stop searching */ | 2703 | if (load_balance_newidle(this_cpu, this_rq, sd)) |
2313 | break; | 2704 | break; |
2314 | } | ||
2315 | } | 2705 | } |
2316 | } | 2706 | } |
2317 | } | 2707 | } |
@@ -2324,14 +2714,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
2324 | * | 2714 | * |
2325 | * Called with busiest_rq locked. | 2715 | * Called with busiest_rq locked. |
2326 | */ | 2716 | */ |
2327 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2717 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
2328 | { | 2718 | { |
2329 | struct sched_domain *sd; | ||
2330 | runqueue_t *target_rq; | ||
2331 | int target_cpu = busiest_rq->push_cpu; | 2719 | int target_cpu = busiest_rq->push_cpu; |
2720 | struct sched_domain *sd; | ||
2721 | struct rq *target_rq; | ||
2332 | 2722 | ||
2723 | /* Is there any task to move? */ | ||
2333 | if (busiest_rq->nr_running <= 1) | 2724 | if (busiest_rq->nr_running <= 1) |
2334 | /* no task to move */ | ||
2335 | return; | 2725 | return; |
2336 | 2726 | ||
2337 | target_rq = cpu_rq(target_cpu); | 2727 | target_rq = cpu_rq(target_cpu); |
@@ -2347,21 +2737,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
2347 | double_lock_balance(busiest_rq, target_rq); | 2737 | double_lock_balance(busiest_rq, target_rq); |
2348 | 2738 | ||
2349 | /* Search for an sd spanning us and the target CPU. */ | 2739 | /* Search for an sd spanning us and the target CPU. */ |
2350 | for_each_domain(target_cpu, sd) | 2740 | for_each_domain(target_cpu, sd) { |
2351 | if ((sd->flags & SD_LOAD_BALANCE) && | 2741 | if ((sd->flags & SD_LOAD_BALANCE) && |
2352 | cpu_isset(busiest_cpu, sd->span)) | 2742 | cpu_isset(busiest_cpu, sd->span)) |
2353 | break; | 2743 | break; |
2744 | } | ||
2354 | 2745 | ||
2355 | if (unlikely(sd == NULL)) | 2746 | if (likely(sd)) { |
2356 | goto out; | 2747 | schedstat_inc(sd, alb_cnt); |
2357 | |||
2358 | schedstat_inc(sd, alb_cnt); | ||
2359 | 2748 | ||
2360 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2749 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
2361 | schedstat_inc(sd, alb_pushed); | 2750 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, |
2362 | else | 2751 | NULL)) |
2363 | schedstat_inc(sd, alb_failed); | 2752 | schedstat_inc(sd, alb_pushed); |
2364 | out: | 2753 | else |
2754 | schedstat_inc(sd, alb_failed); | ||
2755 | } | ||
2365 | spin_unlock(&target_rq->lock); | 2756 | spin_unlock(&target_rq->lock); |
2366 | } | 2757 | } |
2367 | 2758 | ||
@@ -2374,23 +2765,27 @@ out: | |||
2374 | * Balancing parameters are set up in arch_init_sched_domains. | 2765 | * Balancing parameters are set up in arch_init_sched_domains. |
2375 | */ | 2766 | */ |
2376 | 2767 | ||
2377 | /* Don't have all balancing operations going off at once */ | 2768 | /* Don't have all balancing operations going off at once: */ |
2378 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) | 2769 | static inline unsigned long cpu_offset(int cpu) |
2770 | { | ||
2771 | return jiffies + cpu * HZ / NR_CPUS; | ||
2772 | } | ||
2379 | 2773 | ||
2380 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | 2774 | static void |
2381 | enum idle_type idle) | 2775 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) |
2382 | { | 2776 | { |
2383 | unsigned long old_load, this_load; | 2777 | unsigned long this_load, interval, j = cpu_offset(this_cpu); |
2384 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | ||
2385 | struct sched_domain *sd; | 2778 | struct sched_domain *sd; |
2386 | int i; | 2779 | int i, scale; |
2780 | |||
2781 | this_load = this_rq->raw_weighted_load; | ||
2782 | |||
2783 | /* Update our load: */ | ||
2784 | for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { | ||
2785 | unsigned long old_load, new_load; | ||
2387 | 2786 | ||
2388 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | ||
2389 | /* Update our load */ | ||
2390 | for (i = 0; i < 3; i++) { | ||
2391 | unsigned long new_load = this_load; | ||
2392 | int scale = 1 << i; | ||
2393 | old_load = this_rq->cpu_load[i]; | 2787 | old_load = this_rq->cpu_load[i]; |
2788 | new_load = this_load; | ||
2394 | /* | 2789 | /* |
2395 | * Round up the averaging division if load is increasing. This | 2790 | * Round up the averaging division if load is increasing. This |
2396 | * prevents us from getting stuck on 9 if the load is 10, for | 2791 | * prevents us from getting stuck on 9 if the load is 10, for |
@@ -2402,8 +2797,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2402 | } | 2797 | } |
2403 | 2798 | ||
2404 | for_each_domain(this_cpu, sd) { | 2799 | for_each_domain(this_cpu, sd) { |
2405 | unsigned long interval; | ||
2406 | |||
2407 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2800 | if (!(sd->flags & SD_LOAD_BALANCE)) |
2408 | continue; | 2801 | continue; |
2409 | 2802 | ||
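
rebalance_tick() above keeps several decaying load averages, one per cpu_load[] index, each with a doubling scale factor and a round-up when the load is rising. A standalone model of that averaging loop; the update step follows the (old*(scale-1)+new)/scale form implied by the round-up comment, and the load value is invented:

#include <stdio.h>

int main(void)
{
	/* Three decaying load averages, as in rq->cpu_load[0..2]. */
	unsigned long cpu_load[3] = { 0, 0, 0 };
	unsigned long this_load = 10;	/* hypothetical raw_weighted_load */
	int i, scale, tick;

	for (tick = 0; tick < 5; tick++) {
		for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = this_load;

			/* Round up when load is increasing, so the average
			 * doesn't get stuck just below the target value. */
			if (new_load > old_load)
				new_load += scale - 1;
			cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
		}
		printf("tick %d: cpu_load = { %lu, %lu, %lu }\n",
		       tick, cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}

The three indices give short-, medium- and long-horizon views of the same load, which the domain code then selects between via busy_idx/newidle_idx/idle_idx.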
@@ -2433,17 +2826,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2433 | /* | 2826 | /* |
2434 | * on UP we do not need to balance between CPUs: | 2827 | * on UP we do not need to balance between CPUs: |
2435 | */ | 2828 | */ |
2436 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) | 2829 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) |
2437 | { | 2830 | { |
2438 | } | 2831 | } |
2439 | static inline void idle_balance(int cpu, runqueue_t *rq) | 2832 | static inline void idle_balance(int cpu, struct rq *rq) |
2440 | { | 2833 | { |
2441 | } | 2834 | } |
2442 | #endif | 2835 | #endif |
2443 | 2836 | ||
2444 | static inline int wake_priority_sleeper(runqueue_t *rq) | 2837 | static inline int wake_priority_sleeper(struct rq *rq) |
2445 | { | 2838 | { |
2446 | int ret = 0; | 2839 | int ret = 0; |
2840 | |||
2447 | #ifdef CONFIG_SCHED_SMT | 2841 | #ifdef CONFIG_SCHED_SMT |
2448 | spin_lock(&rq->lock); | 2842 | spin_lock(&rq->lock); |
2449 | /* | 2843 | /* |
@@ -2467,25 +2861,26 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
2467 | * This is called on clock ticks and on context switches. | 2861 | * This is called on clock ticks and on context switches. |
2468 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 2862 | * Bank in p->sched_time the ns elapsed since the last tick or switch. |
2469 | */ | 2863 | */ |
2470 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, | 2864 | static inline void |
2471 | unsigned long long now) | 2865 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
2472 | { | 2866 | { |
2473 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); | 2867 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); |
2474 | p->sched_time += now - last; | ||
2475 | } | 2868 | } |
2476 | 2869 | ||
2477 | /* | 2870 | /* |
2478 | * Return current->sched_time plus any more ns on the sched_clock | 2871 | * Return current->sched_time plus any more ns on the sched_clock |
2479 | * that have not yet been banked. | 2872 | * that have not yet been banked. |
2480 | */ | 2873 | */ |
2481 | unsigned long long current_sched_time(const task_t *tsk) | 2874 | unsigned long long current_sched_time(const struct task_struct *p) |
2482 | { | 2875 | { |
2483 | unsigned long long ns; | 2876 | unsigned long long ns; |
2484 | unsigned long flags; | 2877 | unsigned long flags; |
2878 | |||
2485 | local_irq_save(flags); | 2879 | local_irq_save(flags); |
2486 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); | 2880 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); |
2487 | ns = tsk->sched_time + (sched_clock() - ns); | 2881 | ns = p->sched_time + sched_clock() - ns; |
2488 | local_irq_restore(flags); | 2882 | local_irq_restore(flags); |
2883 | |||
2489 | return ns; | 2884 | return ns; |
2490 | } | 2885 | } |
2491 | 2886 | ||
@@ -2499,11 +2894,16 @@ unsigned long long current_sched_time(const task_t *tsk) | |||
2499 | * increasing number of running tasks. We also ignore the interactivity | 2894 | * increasing number of running tasks. We also ignore the interactivity |
2500 | * if a better static_prio task has expired: | 2895 | * if a better static_prio task has expired: |
2501 | */ | 2896 | */ |
2502 | #define EXPIRED_STARVING(rq) \ | 2897 | static inline int expired_starving(struct rq *rq) |
2503 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | 2898 | { |
2504 | (jiffies - (rq)->expired_timestamp >= \ | 2899 | if (rq->curr->static_prio > rq->best_expired_prio) |
2505 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | 2900 | return 1; |
2506 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) | 2901 | if (!STARVATION_LIMIT || !rq->expired_timestamp) |
2902 | return 0; | ||
2903 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | ||
2904 | return 1; | ||
2905 | return 0; | ||
2906 | } | ||
2507 | 2907 | ||
2508 | /* | 2908 | /* |
2509 | * Account user cpu time to a process. | 2909 | * Account user cpu time to a process. |
@@ -2536,7 +2936,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
2536 | cputime_t cputime) | 2936 | cputime_t cputime) |
2537 | { | 2937 | { |
2538 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2938 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2539 | runqueue_t *rq = this_rq(); | 2939 | struct rq *rq = this_rq(); |
2540 | cputime64_t tmp; | 2940 | cputime64_t tmp; |
2541 | 2941 | ||
2542 | p->stime = cputime_add(p->stime, cputime); | 2942 | p->stime = cputime_add(p->stime, cputime); |
@@ -2566,7 +2966,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
2566 | { | 2966 | { |
2567 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2967 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2568 | cputime64_t tmp = cputime_to_cputime64(steal); | 2968 | cputime64_t tmp = cputime_to_cputime64(steal); |
2569 | runqueue_t *rq = this_rq(); | 2969 | struct rq *rq = this_rq(); |
2570 | 2970 | ||
2571 | if (p == rq->idle) { | 2971 | if (p == rq->idle) { |
2572 | p->stime = cputime_add(p->stime, steal); | 2972 | p->stime = cputime_add(p->stime, steal); |
@@ -2587,10 +2987,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
2587 | */ | 2987 | */ |
2588 | void scheduler_tick(void) | 2988 | void scheduler_tick(void) |
2589 | { | 2989 | { |
2590 | int cpu = smp_processor_id(); | ||
2591 | runqueue_t *rq = this_rq(); | ||
2592 | task_t *p = current; | ||
2593 | unsigned long long now = sched_clock(); | 2990 | unsigned long long now = sched_clock(); |
2991 | struct task_struct *p = current; | ||
2992 | int cpu = smp_processor_id(); | ||
2993 | struct rq *rq = cpu_rq(cpu); | ||
2594 | 2994 | ||
2595 | update_cpu_clock(p, rq, now); | 2995 | update_cpu_clock(p, rq, now); |
2596 | 2996 | ||
@@ -2640,7 +3040,7 @@ void scheduler_tick(void) | |||
2640 | 3040 | ||
2641 | if (!rq->expired_timestamp) | 3041 | if (!rq->expired_timestamp) |
2642 | rq->expired_timestamp = jiffies; | 3042 | rq->expired_timestamp = jiffies; |
2643 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | 3043 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { |
2644 | enqueue_task(p, rq->expired); | 3044 | enqueue_task(p, rq->expired); |
2645 | if (p->static_prio < rq->best_expired_prio) | 3045 | if (p->static_prio < rq->best_expired_prio) |
2646 | rq->best_expired_prio = p->static_prio; | 3046 | rq->best_expired_prio = p->static_prio; |
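
scheduler_tick() above now calls expired_starving() (defined in an earlier hunk) instead of the EXPIRED_STARVING macro. A quick numeric check of its jiffies-based branch, with an assumed STARVATION_LIMIT and made-up timestamps:

#include <stdio.h>

#define STARVATION_LIMIT 100	/* assumed, in jiffies (really HZ-dependent) */

static int expired_starving_model(unsigned long jiffies_now,
				  unsigned long expired_timestamp,
				  unsigned long nr_running)
{
	if (!STARVATION_LIMIT || !expired_timestamp)
		return 0;
	/* The more runnable tasks there are, the longer the expired array
	 * is allowed to wait before an array switch is forced. */
	return jiffies_now - expired_timestamp > STARVATION_LIMIT * nr_running;
}

int main(void)
{
	printf("%d\n", expired_starving_model(1000, 500, 3)); /* 500 > 300 -> 1 */
	printf("%d\n", expired_starving_model(1000, 800, 3)); /* 200 < 300 -> 0 */
	return 0;
}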
@@ -2679,55 +3079,42 @@ out: | |||
2679 | } | 3079 | } |
2680 | 3080 | ||
2681 | #ifdef CONFIG_SCHED_SMT | 3081 | #ifdef CONFIG_SCHED_SMT |
2682 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | 3082 | static inline void wakeup_busy_runqueue(struct rq *rq) |
2683 | { | 3083 | { |
2684 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | 3084 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ |
2685 | if (rq->curr == rq->idle && rq->nr_running) | 3085 | if (rq->curr == rq->idle && rq->nr_running) |
2686 | resched_task(rq->idle); | 3086 | resched_task(rq->idle); |
2687 | } | 3087 | } |
2688 | 3088 | ||
2689 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3089 | /* |
3090 | * Called with interrupts disabled and this_rq's runqueue locked. | ||
3091 | */ | ||
3092 | static void wake_sleeping_dependent(int this_cpu) | ||
2690 | { | 3093 | { |
2691 | struct sched_domain *tmp, *sd = NULL; | 3094 | struct sched_domain *tmp, *sd = NULL; |
2692 | cpumask_t sibling_map; | ||
2693 | int i; | 3095 | int i; |
2694 | 3096 | ||
2695 | for_each_domain(this_cpu, tmp) | 3097 | for_each_domain(this_cpu, tmp) { |
2696 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3098 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
2697 | sd = tmp; | 3099 | sd = tmp; |
3100 | break; | ||
3101 | } | ||
3102 | } | ||
2698 | 3103 | ||
2699 | if (!sd) | 3104 | if (!sd) |
2700 | return; | 3105 | return; |
2701 | 3106 | ||
2702 | /* | 3107 | for_each_cpu_mask(i, sd->span) { |
2703 | * Unlock the current runqueue because we have to lock in | 3108 | struct rq *smt_rq = cpu_rq(i); |
2704 | * CPU order to avoid deadlocks. Caller knows that we might | ||
2705 | * unlock. We keep IRQs disabled. | ||
2706 | */ | ||
2707 | spin_unlock(&this_rq->lock); | ||
2708 | |||
2709 | sibling_map = sd->span; | ||
2710 | 3109 | ||
2711 | for_each_cpu_mask(i, sibling_map) | 3110 | if (i == this_cpu) |
2712 | spin_lock(&cpu_rq(i)->lock); | 3111 | continue; |
2713 | /* | 3112 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
2714 | * We clear this CPU from the mask. This both simplifies the | 3113 | continue; |
2715 | * inner loop and keps this_rq locked when we exit: | ||
2716 | */ | ||
2717 | cpu_clear(this_cpu, sibling_map); | ||
2718 | |||
2719 | for_each_cpu_mask(i, sibling_map) { | ||
2720 | runqueue_t *smt_rq = cpu_rq(i); | ||
2721 | 3114 | ||
2722 | wakeup_busy_runqueue(smt_rq); | 3115 | wakeup_busy_runqueue(smt_rq); |
3116 | spin_unlock(&smt_rq->lock); | ||
2723 | } | 3117 | } |
2724 | |||
2725 | for_each_cpu_mask(i, sibling_map) | ||
2726 | spin_unlock(&cpu_rq(i)->lock); | ||
2727 | /* | ||
2728 | * We exit with this_cpu's rq still held and IRQs | ||
2729 | * still disabled: | ||
2730 | */ | ||
2731 | } | 3118 | } |
2732 | 3119 | ||
2733 | /* | 3120 | /* |
@@ -2735,57 +3122,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
2735 | * utilize, if another task runs on a sibling. This models the | 3122 | * utilize, if another task runs on a sibling. This models the |
2736 | * slowdown effect of other tasks running on siblings: | 3123 | * slowdown effect of other tasks running on siblings: |
2737 | */ | 3124 | */ |
2738 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | 3125 | static inline unsigned long |
3126 | smt_slice(struct task_struct *p, struct sched_domain *sd) | ||
2739 | { | 3127 | { |
2740 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3128 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2741 | } | 3129 | } |
2742 | 3130 | ||
2743 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3131 | /* |
3132 | * To minimise lock contention and avoid dropping this_rq's runqueue lock, we | ||
3133 | * only trylock the sibling runqueues and bypass those runqueues if we fail to | ||
3134 | * acquire their lock. As we only trylock, the normal locking order does not | ||
3135 | * need to be obeyed. | ||
3136 | */ | ||
3137 | static int | ||
3138 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) | ||
2744 | { | 3139 | { |
2745 | struct sched_domain *tmp, *sd = NULL; | 3140 | struct sched_domain *tmp, *sd = NULL; |
2746 | cpumask_t sibling_map; | ||
2747 | prio_array_t *array; | ||
2748 | int ret = 0, i; | 3141 | int ret = 0, i; |
2749 | task_t *p; | ||
2750 | 3142 | ||
2751 | for_each_domain(this_cpu, tmp) | 3143 | /* kernel/rt threads do not participate in dependent sleeping */ |
2752 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3144 | if (!p->mm || rt_task(p)) |
3145 | return 0; | ||
3146 | |||
3147 | for_each_domain(this_cpu, tmp) { | ||
3148 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
2753 | sd = tmp; | 3149 | sd = tmp; |
3150 | break; | ||
3151 | } | ||
3152 | } | ||
2754 | 3153 | ||
2755 | if (!sd) | 3154 | if (!sd) |
2756 | return 0; | 3155 | return 0; |
2757 | 3156 | ||
2758 | /* | 3157 | for_each_cpu_mask(i, sd->span) { |
2759 | * The same locking rules and details apply as for | 3158 | struct task_struct *smt_curr; |
2760 | * wake_sleeping_dependent(): | 3159 | struct rq *smt_rq; |
2761 | */ | ||
2762 | spin_unlock(&this_rq->lock); | ||
2763 | sibling_map = sd->span; | ||
2764 | for_each_cpu_mask(i, sibling_map) | ||
2765 | spin_lock(&cpu_rq(i)->lock); | ||
2766 | cpu_clear(this_cpu, sibling_map); | ||
2767 | 3160 | ||
2768 | /* | 3161 | if (i == this_cpu) |
2769 | * Establish next task to be run - it might have gone away because | 3162 | continue; |
2770 | * we released the runqueue lock above: | ||
2771 | */ | ||
2772 | if (!this_rq->nr_running) | ||
2773 | goto out_unlock; | ||
2774 | array = this_rq->active; | ||
2775 | if (!array->nr_active) | ||
2776 | array = this_rq->expired; | ||
2777 | BUG_ON(!array->nr_active); | ||
2778 | 3163 | ||
2779 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3164 | smt_rq = cpu_rq(i); |
2780 | task_t, run_list); | 3165 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
3166 | continue; | ||
2781 | 3167 | ||
2782 | for_each_cpu_mask(i, sibling_map) { | 3168 | smt_curr = smt_rq->curr; |
2783 | runqueue_t *smt_rq = cpu_rq(i); | ||
2784 | task_t *smt_curr = smt_rq->curr; | ||
2785 | 3169 | ||
2786 | /* Kernel threads do not participate in dependent sleeping */ | 3170 | if (!smt_curr->mm) |
2787 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3171 | goto unlock; |
2788 | goto check_smt_task; | ||
2789 | 3172 | ||
2790 | /* | 3173 | /* |
2791 | * If a user task with lower static priority than the | 3174 | * If a user task with lower static priority than the |
@@ -2803,49 +3186,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2803 | if ((jiffies % DEF_TIMESLICE) > | 3186 | if ((jiffies % DEF_TIMESLICE) > |
2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3187 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2805 | ret = 1; | 3188 | ret = 1; |
2806 | } else | 3189 | } else { |
2807 | if (smt_curr->static_prio < p->static_prio && | 3190 | if (smt_curr->static_prio < p->static_prio && |
2808 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3191 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
2809 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3192 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
2810 | ret = 1; | 3193 | ret = 1; |
2811 | |||
2812 | check_smt_task: | ||
2813 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
2814 | rt_task(smt_curr)) | ||
2815 | continue; | ||
2816 | if (!p->mm) { | ||
2817 | wakeup_busy_runqueue(smt_rq); | ||
2818 | continue; | ||
2819 | } | ||
2820 | |||
2821 | /* | ||
2822 | * Reschedule a lower priority task on the SMT sibling for | ||
2823 | * it to be put to sleep, or wake it up if it has been put to | ||
2824 | * sleep for priority reasons to see if it should run now. | ||
2825 | */ | ||
2826 | if (rt_task(p)) { | ||
2827 | if ((jiffies % DEF_TIMESLICE) > | ||
2828 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
2829 | resched_task(smt_curr); | ||
2830 | } else { | ||
2831 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2832 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
2833 | resched_task(smt_curr); | ||
2834 | else | ||
2835 | wakeup_busy_runqueue(smt_rq); | ||
2836 | } | 3194 | } |
3195 | unlock: | ||
3196 | spin_unlock(&smt_rq->lock); | ||
2837 | } | 3197 | } |
2838 | out_unlock: | ||
2839 | for_each_cpu_mask(i, sibling_map) | ||
2840 | spin_unlock(&cpu_rq(i)->lock); | ||
2841 | return ret; | 3198 | return ret; |
2842 | } | 3199 | } |
2843 | #else | 3200 | #else |
2844 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3201 | static inline void wake_sleeping_dependent(int this_cpu) |
2845 | { | 3202 | { |
2846 | } | 3203 | } |
2847 | 3204 | static inline int | |
2848 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3205 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) |
2849 | { | 3206 | { |
2850 | return 0; | 3207 | return 0; |
2851 | } | 3208 | } |
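
dependent_sleeper() decides whether the incoming task should yield to an SMT sibling by comparing the sibling's SMT-discounted remaining slice against the incoming task's full timeslice. A small numeric sketch of that smt_slice() comparison, ignoring the static-priority and preemption checks; per_cpu_gain and the slice lengths are invented:

#include <stdio.h>

/* Fraction of a timeslice a task effectively gets when the sibling
 * thread on the same physical core is also busy. */
static unsigned long smt_slice(unsigned long time_slice,
			       unsigned int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
	unsigned long smt_curr_slice = 100;	/* ms left for the sibling's task */
	unsigned long p_timeslice = 60;		/* full slice of the incoming task */
	unsigned int per_cpu_gain = 25;		/* assumed SD_SHARE_CPUPOWER gain */

	/* If the sibling's discounted slice still outweighs p's whole slice,
	 * p is made to wait (dependent_sleeper() returns 1). */
	if (smt_slice(smt_curr_slice, per_cpu_gain) > p_timeslice)
		printf("let the sibling run: delay p (ret = 1)\n");
	else
		printf("run p normally (ret = 0)\n");
	return 0;
}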
@@ -2858,12 +3215,13 @@ void fastcall add_preempt_count(int val) | |||
2858 | /* | 3215 | /* |
2859 | * Underflow? | 3216 | * Underflow? |
2860 | */ | 3217 | */ |
2861 | BUG_ON((preempt_count() < 0)); | 3218 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
3219 | return; | ||
2862 | preempt_count() += val; | 3220 | preempt_count() += val; |
2863 | /* | 3221 | /* |
2864 | * Spinlock count overflowing soon? | 3222 | * Spinlock count overflowing soon? |
2865 | */ | 3223 | */ |
2866 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3224 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); |
2867 | } | 3225 | } |
2868 | EXPORT_SYMBOL(add_preempt_count); | 3226 | EXPORT_SYMBOL(add_preempt_count); |
2869 | 3227 | ||
@@ -2872,11 +3230,15 @@ void fastcall sub_preempt_count(int val) | |||
2872 | /* | 3230 | /* |
2873 | * Underflow? | 3231 | * Underflow? |
2874 | */ | 3232 | */ |
2875 | BUG_ON(val > preempt_count()); | 3233 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
3234 | return; | ||
2876 | /* | 3235 | /* |
2877 | * Is the spinlock portion underflowing? | 3236 | * Is the spinlock portion underflowing? |
2878 | */ | 3237 | */ |
2879 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); | 3238 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
3239 | !(preempt_count() & PREEMPT_MASK))) | ||
3240 | return; | ||
3241 | |||
2880 | preempt_count() -= val; | 3242 | preempt_count() -= val; |
2881 | } | 3243 | } |
2882 | EXPORT_SYMBOL(sub_preempt_count); | 3244 | EXPORT_SYMBOL(sub_preempt_count); |
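
The add/sub_preempt_count() hunks replace hard BUG_ON()s with DEBUG_LOCKS_WARN_ON(), which warns and lets the caller return early instead of killing the machine. A userspace approximation of that warn-once-and-bail pattern; the macro below only mimics the idea (GCC statement expression) and is not the kernel's implementation:

#include <stdio.h>

/* Warn on the first failure only, and report whether the check fired so
 * the caller can bail out rather than corrupt its state. */
#define WARN_ON_ONCE_MODEL(cond) ({					\
	static int warned;						\
	int ret = !!(cond);						\
	if (ret && !warned) {						\
		warned = 1;						\
		fprintf(stderr, "warning: %s failed at %s:%d\n",	\
			#cond, __FILE__, __LINE__);			\
	}								\
	ret;								\
})

static int preempt_count_model;

static void sub_preempt_count_model(int val)
{
	/* Underflow? Complain and return instead of crashing. */
	if (WARN_ON_ONCE_MODEL(val > preempt_count_model))
		return;
	preempt_count_model -= val;
}

int main(void)
{
	preempt_count_model = 1;
	sub_preempt_count_model(2);	/* triggers the warning, count unchanged */
	sub_preempt_count_model(1);	/* fine */
	printf("preempt_count is %d\n", preempt_count_model);
	return 0;
}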
@@ -2894,14 +3256,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type) | |||
2894 | */ | 3256 | */ |
2895 | asmlinkage void __sched schedule(void) | 3257 | asmlinkage void __sched schedule(void) |
2896 | { | 3258 | { |
2897 | long *switch_count; | 3259 | struct task_struct *prev, *next; |
2898 | task_t *prev, *next; | 3260 | struct prio_array *array; |
2899 | runqueue_t *rq; | ||
2900 | prio_array_t *array; | ||
2901 | struct list_head *queue; | 3261 | struct list_head *queue; |
2902 | unsigned long long now; | 3262 | unsigned long long now; |
2903 | unsigned long run_time; | 3263 | unsigned long run_time; |
2904 | int cpu, idx, new_prio; | 3264 | int cpu, idx, new_prio; |
3265 | long *switch_count; | ||
3266 | struct rq *rq; | ||
2905 | 3267 | ||
2906 | /* | 3268 | /* |
2907 | * Test if we are atomic. Since do_exit() needs to call into | 3269 | * Test if we are atomic. Since do_exit() needs to call into |
@@ -2967,32 +3329,13 @@ need_resched_nonpreemptible: | |||
2967 | 3329 | ||
2968 | cpu = smp_processor_id(); | 3330 | cpu = smp_processor_id(); |
2969 | if (unlikely(!rq->nr_running)) { | 3331 | if (unlikely(!rq->nr_running)) { |
2970 | go_idle: | ||
2971 | idle_balance(cpu, rq); | 3332 | idle_balance(cpu, rq); |
2972 | if (!rq->nr_running) { | 3333 | if (!rq->nr_running) { |
2973 | next = rq->idle; | 3334 | next = rq->idle; |
2974 | rq->expired_timestamp = 0; | 3335 | rq->expired_timestamp = 0; |
2975 | wake_sleeping_dependent(cpu, rq); | 3336 | wake_sleeping_dependent(cpu); |
2976 | /* | ||
2977 | * wake_sleeping_dependent() might have released | ||
2978 | * the runqueue, so break out if we got new | ||
2979 | * tasks meanwhile: | ||
2980 | */ | ||
2981 | if (!rq->nr_running) | ||
2982 | goto switch_tasks; | ||
2983 | } | ||
2984 | } else { | ||
2985 | if (dependent_sleeper(cpu, rq)) { | ||
2986 | next = rq->idle; | ||
2987 | goto switch_tasks; | 3337 | goto switch_tasks; |
2988 | } | 3338 | } |
2989 | /* | ||
2990 | * dependent_sleeper() releases and reacquires the runqueue | ||
2991 | * lock, hence go into the idle loop if the rq went | ||
2992 | * empty meanwhile: | ||
2993 | */ | ||
2994 | if (unlikely(!rq->nr_running)) | ||
2995 | goto go_idle; | ||
2996 | } | 3339 | } |
2997 | 3340 | ||
2998 | array = rq->active; | 3341 | array = rq->active; |
@@ -3010,7 +3353,7 @@ go_idle: | |||
3010 | 3353 | ||
3011 | idx = sched_find_first_bit(array->bitmap); | 3354 | idx = sched_find_first_bit(array->bitmap); |
3012 | queue = array->queue + idx; | 3355 | queue = array->queue + idx; |
3013 | next = list_entry(queue->next, task_t, run_list); | 3356 | next = list_entry(queue->next, struct task_struct, run_list); |
3014 | 3357 | ||
3015 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | 3358 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
3016 | unsigned long long delta = now - next->timestamp; | 3359 | unsigned long long delta = now - next->timestamp; |
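The pick above is the O(1) heart of this scheduler: sched_find_first_bit() scans the active array's priority bitmap and the head of that priority's list becomes next. A self-contained sketch of the same lookup, with MAX_PRIO assumed to be 140 and a GCC builtin standing in for the kernel helper:

#include <stdio.h>
#include <string.h>

#define MAX_PRIO      140
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))
#define BITMAP_LONGS  ((MAX_PRIO + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Toy model only: find the lowest set bit, i.e. the best priority queued. */
static int find_first_prio(const unsigned long *bitmap)
{
        int w;

        for (w = 0; w < BITMAP_LONGS; w++)
                if (bitmap[w])
                        return w * BITS_PER_LONG + __builtin_ctzl(bitmap[w]);
        return MAX_PRIO;                /* empty: nothing runnable */
}

int main(void)
{
        unsigned long bitmap[BITMAP_LONGS];

        memset(bitmap, 0, sizeof(bitmap));
        bitmap[120 / BITS_PER_LONG] |= 1UL << (120 % BITS_PER_LONG); /* nice-0 task */
        bitmap[99 / BITS_PER_LONG]  |= 1UL << (99 % BITS_PER_LONG);  /* RT prio 99  */

        printf("next index: %d\n", find_first_prio(bitmap));         /* prints 99 */
        return 0;
}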
@@ -3030,6 +3373,8 @@ go_idle: | |||
3030 | } | 3373 | } |
3031 | } | 3374 | } |
3032 | next->sleep_type = SLEEP_NORMAL; | 3375 | next->sleep_type = SLEEP_NORMAL; |
3376 | if (dependent_sleeper(cpu, rq, next)) | ||
3377 | next = rq->idle; | ||
3033 | switch_tasks: | 3378 | switch_tasks: |
3034 | if (next == rq->idle) | 3379 | if (next == rq->idle) |
3035 | schedstat_inc(rq, sched_goidle); | 3380 | schedstat_inc(rq, sched_goidle); |
@@ -3071,12 +3416,11 @@ switch_tasks: | |||
3071 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3416 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3072 | goto need_resched; | 3417 | goto need_resched; |
3073 | } | 3418 | } |
3074 | |||
3075 | EXPORT_SYMBOL(schedule); | 3419 | EXPORT_SYMBOL(schedule); |
3076 | 3420 | ||
3077 | #ifdef CONFIG_PREEMPT | 3421 | #ifdef CONFIG_PREEMPT |
3078 | /* | 3422 | /* |
3079 | * this is is the entry point to schedule() from in-kernel preemption | 3423 | * this is the entry point to schedule() from in-kernel preemption |
3080 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3424 | * off of preempt_enable. Kernel preemptions off return from interrupt |
3081 | * occur there and call schedule directly. | 3425 | * occur there and call schedule directly. |
3082 | */ | 3426 | */ |
@@ -3116,11 +3460,10 @@ need_resched: | |||
3116 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3460 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3117 | goto need_resched; | 3461 | goto need_resched; |
3118 | } | 3462 | } |
3119 | |||
3120 | EXPORT_SYMBOL(preempt_schedule); | 3463 | EXPORT_SYMBOL(preempt_schedule); |
3121 | 3464 | ||
3122 | /* | 3465 | /* |
3123 | * this is is the entry point to schedule() from kernel preemption | 3466 | * this is the entry point to schedule() from kernel preemption |
3124 | * off of irq context. | 3467 | * off of irq context. |
3125 | * Note, that this is called and return with irqs disabled. This will | 3468 | * Note, that this is called and return with irqs disabled. This will |
3126 | * protect us against recursive calling from irq. | 3469 | * protect us against recursive calling from irq. |
@@ -3132,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3132 | struct task_struct *task = current; | 3475 | struct task_struct *task = current; |
3133 | int saved_lock_depth; | 3476 | int saved_lock_depth; |
3134 | #endif | 3477 | #endif |
3135 | /* Catch callers which need to be fixed*/ | 3478 | /* Catch callers which need to be fixed */ |
3136 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3479 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3137 | 3480 | ||
3138 | need_resched: | 3481 | need_resched: |
@@ -3165,10 +3508,8 @@ need_resched: | |||
3165 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3508 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
3166 | void *key) | 3509 | void *key) |
3167 | { | 3510 | { |
3168 | task_t *p = curr->private; | 3511 | return try_to_wake_up(curr->private, mode, sync); |
3169 | return try_to_wake_up(p, mode, sync); | ||
3170 | } | 3512 | } |
3171 | |||
3172 | EXPORT_SYMBOL(default_wake_function); | 3513 | EXPORT_SYMBOL(default_wake_function); |
3173 | 3514 | ||
3174 | /* | 3515 | /* |
@@ -3186,13 +3527,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
3186 | struct list_head *tmp, *next; | 3527 | struct list_head *tmp, *next; |
3187 | 3528 | ||
3188 | list_for_each_safe(tmp, next, &q->task_list) { | 3529 | list_for_each_safe(tmp, next, &q->task_list) { |
3189 | wait_queue_t *curr; | 3530 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
3190 | unsigned flags; | 3531 | unsigned flags = curr->flags; |
3191 | curr = list_entry(tmp, wait_queue_t, task_list); | 3532 | |
3192 | flags = curr->flags; | ||
3193 | if (curr->func(curr, mode, sync, key) && | 3533 | if (curr->func(curr, mode, sync, key) && |
3194 | (flags & WQ_FLAG_EXCLUSIVE) && | 3534 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
3195 | !--nr_exclusive) | ||
3196 | break; | 3535 | break; |
3197 | } | 3536 | } |
3198 | } | 3537 | } |
@@ -3213,7 +3552,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | |||
3213 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3552 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
3214 | spin_unlock_irqrestore(&q->lock, flags); | 3553 | spin_unlock_irqrestore(&q->lock, flags); |
3215 | } | 3554 | } |
3216 | |||
3217 | EXPORT_SYMBOL(__wake_up); | 3555 | EXPORT_SYMBOL(__wake_up); |
3218 | 3556 | ||
3219 | /* | 3557 | /* |
@@ -3282,6 +3620,7 @@ EXPORT_SYMBOL(complete_all); | |||
3282 | void fastcall __sched wait_for_completion(struct completion *x) | 3620 | void fastcall __sched wait_for_completion(struct completion *x) |
3283 | { | 3621 | { |
3284 | might_sleep(); | 3622 | might_sleep(); |
3623 | |||
3285 | spin_lock_irq(&x->wait.lock); | 3624 | spin_lock_irq(&x->wait.lock); |
3286 | if (!x->done) { | 3625 | if (!x->done) { |
3287 | DECLARE_WAITQUEUE(wait, current); | 3626 | DECLARE_WAITQUEUE(wait, current); |
@@ -3426,7 +3765,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | |||
3426 | schedule(); | 3765 | schedule(); |
3427 | SLEEP_ON_TAIL | 3766 | SLEEP_ON_TAIL |
3428 | } | 3767 | } |
3429 | |||
3430 | EXPORT_SYMBOL(interruptible_sleep_on); | 3768 | EXPORT_SYMBOL(interruptible_sleep_on); |
3431 | 3769 | ||
3432 | long fastcall __sched | 3770 | long fastcall __sched |
@@ -3442,7 +3780,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3442 | 3780 | ||
3443 | return timeout; | 3781 | return timeout; |
3444 | } | 3782 | } |
3445 | |||
3446 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3783 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3447 | 3784 | ||
3448 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3785 | void fastcall __sched sleep_on(wait_queue_head_t *q) |
@@ -3455,7 +3792,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q) | |||
3455 | schedule(); | 3792 | schedule(); |
3456 | SLEEP_ON_TAIL | 3793 | SLEEP_ON_TAIL |
3457 | } | 3794 | } |
3458 | |||
3459 | EXPORT_SYMBOL(sleep_on); | 3795 | EXPORT_SYMBOL(sleep_on); |
3460 | 3796 | ||
3461 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3797 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
@@ -3473,12 +3809,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3473 | 3809 | ||
3474 | EXPORT_SYMBOL(sleep_on_timeout); | 3810 | EXPORT_SYMBOL(sleep_on_timeout); |
3475 | 3811 | ||
3476 | void set_user_nice(task_t *p, long nice) | 3812 | #ifdef CONFIG_RT_MUTEXES |
3813 | |||
3814 | /* | ||
3815 | * rt_mutex_setprio - set the current priority of a task | ||
3816 | * @p: task | ||
3817 | * @prio: prio value (kernel-internal form) | ||
3818 | * | ||
3819 | * This function changes the 'effective' priority of a task. It does | ||
3820 | * not touch ->normal_prio like __setscheduler(). | ||
3821 | * | ||
3822 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
3823 | */ | ||
3824 | void rt_mutex_setprio(struct task_struct *p, int prio) | ||
3477 | { | 3825 | { |
3826 | struct prio_array *array; | ||
3478 | unsigned long flags; | 3827 | unsigned long flags; |
3479 | prio_array_t *array; | 3828 | struct rq *rq; |
3480 | runqueue_t *rq; | 3829 | int oldprio; |
3481 | int old_prio, new_prio, delta; | 3830 | |
3831 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
3832 | |||
3833 | rq = task_rq_lock(p, &flags); | ||
3834 | |||
3835 | oldprio = p->prio; | ||
3836 | array = p->array; | ||
3837 | if (array) | ||
3838 | dequeue_task(p, array); | ||
3839 | p->prio = prio; | ||
3840 | |||
3841 | if (array) { | ||
3842 | /* | ||
3843 | * If changing to an RT priority then queue it | ||
3844 | * in the active array! | ||
3845 | */ | ||
3846 | if (rt_task(p)) | ||
3847 | array = rq->active; | ||
3848 | enqueue_task(p, array); | ||
3849 | /* | ||
3850 | * Reschedule if we are currently running on this runqueue and | ||
3851 | * our priority decreased, or if we are not currently running on | ||
3852 | * this runqueue and our priority is higher than the current's | ||
3853 | */ | ||
3854 | if (task_running(rq, p)) { | ||
3855 | if (p->prio > oldprio) | ||
3856 | resched_task(rq->curr); | ||
3857 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3858 | resched_task(rq->curr); | ||
3859 | } | ||
3860 | task_rq_unlock(rq, &flags); | ||
3861 | } | ||
3862 | |||
3863 | #endif | ||
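rt_mutex_setprio() above is the kernel half of priority inheritance: a blocked high-priority waiter temporarily boosts the lock owner's effective prio without touching normal_prio. From userspace the same machinery is reached through PI futexes, for example a POSIX mutex with the PTHREAD_PRIO_INHERIT protocol (illustrative program, compile with -pthread):

#include <pthread.h>
#include <stdio.h>

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_mutex_t lock;

        pthread_mutexattr_init(&attr);
        /* Ask for priority inheritance; the owner-side boost is what
         * rt_mutex_setprio() implements in the kernel. */
        if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT))
                fprintf(stderr, "priority inheritance not supported\n");
        pthread_mutex_init(&lock, &attr);

        pthread_mutex_lock(&lock);
        /* critical section: a blocked higher-priority waiter would
         * temporarily raise this thread's effective priority here */
        pthread_mutex_unlock(&lock);

        pthread_mutex_destroy(&lock);
        pthread_mutexattr_destroy(&attr);
        return 0;
}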
3864 | |||
3865 | void set_user_nice(struct task_struct *p, long nice) | ||
3866 | { | ||
3867 | struct prio_array *array; | ||
3868 | int old_prio, delta; | ||
3869 | unsigned long flags; | ||
3870 | struct rq *rq; | ||
3482 | 3871 | ||
3483 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3872 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3484 | return; | 3873 | return; |
@@ -3493,22 +3882,25 @@ void set_user_nice(task_t *p, long nice) | |||
3493 | * it won't have any effect on scheduling until the task is | 3882 | * it won't have any effect on scheduling until the task is |
3494 | * not SCHED_NORMAL/SCHED_BATCH: | 3883 | * not SCHED_NORMAL/SCHED_BATCH: |
3495 | */ | 3884 | */ |
3496 | if (rt_task(p)) { | 3885 | if (has_rt_policy(p)) { |
3497 | p->static_prio = NICE_TO_PRIO(nice); | 3886 | p->static_prio = NICE_TO_PRIO(nice); |
3498 | goto out_unlock; | 3887 | goto out_unlock; |
3499 | } | 3888 | } |
3500 | array = p->array; | 3889 | array = p->array; |
3501 | if (array) | 3890 | if (array) { |
3502 | dequeue_task(p, array); | 3891 | dequeue_task(p, array); |
3892 | dec_raw_weighted_load(rq, p); | ||
3893 | } | ||
3503 | 3894 | ||
3504 | old_prio = p->prio; | ||
3505 | new_prio = NICE_TO_PRIO(nice); | ||
3506 | delta = new_prio - old_prio; | ||
3507 | p->static_prio = NICE_TO_PRIO(nice); | 3895 | p->static_prio = NICE_TO_PRIO(nice); |
3508 | p->prio += delta; | 3896 | set_load_weight(p); |
3897 | old_prio = p->prio; | ||
3898 | p->prio = effective_prio(p); | ||
3899 | delta = p->prio - old_prio; | ||
3509 | 3900 | ||
3510 | if (array) { | 3901 | if (array) { |
3511 | enqueue_task(p, array); | 3902 | enqueue_task(p, array); |
3903 | inc_raw_weighted_load(rq, p); | ||
3512 | /* | 3904 | /* |
3513 | * If the task increased its priority or is running and | 3905 | * If the task increased its priority or is running and |
3514 | * lowered its priority, then reschedule its CPU: | 3906 | * lowered its priority, then reschedule its CPU: |
@@ -3519,7 +3911,6 @@ void set_user_nice(task_t *p, long nice) | |||
3519 | out_unlock: | 3911 | out_unlock: |
3520 | task_rq_unlock(rq, &flags); | 3912 | task_rq_unlock(rq, &flags); |
3521 | } | 3913 | } |
3522 | |||
3523 | EXPORT_SYMBOL(set_user_nice); | 3914 | EXPORT_SYMBOL(set_user_nice); |
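set_user_nice() is what sys_setpriority()/sys_nice() end up calling, and with this change it also recomputes the load weight and derives the effective priority via effective_prio() rather than applying a raw delta. A minimal userspace caller:

#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
        /* Lowering priority (raising nice) needs no privilege. */
        if (setpriority(PRIO_PROCESS, 0, 5))
                perror("setpriority");

        errno = 0;      /* getpriority() can legitimately return -1 */
        printf("nice is now %d\n", getpriority(PRIO_PROCESS, 0));
        return 0;
}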
3524 | 3915 | ||
3525 | /* | 3916 | /* |
@@ -3527,10 +3918,11 @@ EXPORT_SYMBOL(set_user_nice); | |||
3527 | * @p: task | 3918 | * @p: task |
3528 | * @nice: nice value | 3919 | * @nice: nice value |
3529 | */ | 3920 | */ |
3530 | int can_nice(const task_t *p, const int nice) | 3921 | int can_nice(const struct task_struct *p, const int nice) |
3531 | { | 3922 | { |
3532 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3923 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
3533 | int nice_rlim = 20 - nice; | 3924 | int nice_rlim = 20 - nice; |
3925 | |||
3534 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3926 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
3535 | capable(CAP_SYS_NICE)); | 3927 | capable(CAP_SYS_NICE)); |
3536 | } | 3928 | } |
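The conversion in can_nice() maps nice 19 to 1, nice 0 to 20 and nice -20 to 40, so a request is allowed when that value fits under RLIMIT_NICE. A userspace analogue of the same check, ignoring the CAP_SYS_NICE escape hatch (helper name is made up for the sketch):

#include <stdio.h>
#include <sys/resource.h>

static int can_request_nice(int nice_val)
{
        struct rlimit rl;
        int nice_rlim = 20 - nice_val;  /* 19 -> 1, 0 -> 20, -20 -> 40 */

        if (getrlimit(RLIMIT_NICE, &rl))
                return 0;
        return (rlim_t)nice_rlim <= rl.rlim_cur;
}

int main(void)
{
        printf("may set nice -5: %d\n", can_request_nice(-5));
        return 0;
}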
@@ -3546,8 +3938,7 @@ int can_nice(const task_t *p, const int nice) | |||
3546 | */ | 3938 | */ |
3547 | asmlinkage long sys_nice(int increment) | 3939 | asmlinkage long sys_nice(int increment) |
3548 | { | 3940 | { |
3549 | int retval; | 3941 | long nice, retval; |
3550 | long nice; | ||
3551 | 3942 | ||
3552 | /* | 3943 | /* |
3553 | * Setpriority might change our priority at the same moment. | 3944 | * Setpriority might change our priority at the same moment. |
@@ -3586,7 +3977,7 @@ asmlinkage long sys_nice(int increment) | |||
3586 | * RT tasks are offset by -200. Normal tasks are centered | 3977 | * RT tasks are offset by -200. Normal tasks are centered |
3587 | * around 0, value goes from -16 to +15. | 3978 | * around 0, value goes from -16 to +15. |
3588 | */ | 3979 | */ |
3589 | int task_prio(const task_t *p) | 3980 | int task_prio(const struct task_struct *p) |
3590 | { | 3981 | { |
3591 | return p->prio - MAX_RT_PRIO; | 3982 | return p->prio - MAX_RT_PRIO; |
3592 | } | 3983 | } |
@@ -3595,7 +3986,7 @@ int task_prio(const task_t *p) | |||
3595 | * task_nice - return the nice value of a given task. | 3986 | * task_nice - return the nice value of a given task. |
3596 | * @p: the task in question. | 3987 | * @p: the task in question. |
3597 | */ | 3988 | */ |
3598 | int task_nice(const task_t *p) | 3989 | int task_nice(const struct task_struct *p) |
3599 | { | 3990 | { |
3600 | return TASK_NICE(p); | 3991 | return TASK_NICE(p); |
3601 | } | 3992 | } |
@@ -3614,7 +4005,7 @@ int idle_cpu(int cpu) | |||
3614 | * idle_task - return the idle task for a given cpu. | 4005 | * idle_task - return the idle task for a given cpu. |
3615 | * @cpu: the processor in question. | 4006 | * @cpu: the processor in question. |
3616 | */ | 4007 | */ |
3617 | task_t *idle_task(int cpu) | 4008 | struct task_struct *idle_task(int cpu) |
3618 | { | 4009 | { |
3619 | return cpu_rq(cpu)->idle; | 4010 | return cpu_rq(cpu)->idle; |
3620 | } | 4011 | } |
@@ -3623,7 +4014,7 @@ task_t *idle_task(int cpu) | |||
3623 | * find_process_by_pid - find a process with a matching PID value. | 4014 | * find_process_by_pid - find a process with a matching PID value. |
3624 | * @pid: the pid in question. | 4015 | * @pid: the pid in question. |
3625 | */ | 4016 | */ |
3626 | static inline task_t *find_process_by_pid(pid_t pid) | 4017 | static inline struct task_struct *find_process_by_pid(pid_t pid) |
3627 | { | 4018 | { |
3628 | return pid ? find_task_by_pid(pid) : current; | 4019 | return pid ? find_task_by_pid(pid) : current; |
3629 | } | 4020 | } |
@@ -3632,18 +4023,18 @@ static inline task_t *find_process_by_pid(pid_t pid) | |||
3632 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4023 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
3633 | { | 4024 | { |
3634 | BUG_ON(p->array); | 4025 | BUG_ON(p->array); |
4026 | |||
3635 | p->policy = policy; | 4027 | p->policy = policy; |
3636 | p->rt_priority = prio; | 4028 | p->rt_priority = prio; |
3637 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 4029 | p->normal_prio = normal_prio(p); |
3638 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 4030 | /* we are holding p->pi_lock already */ |
3639 | } else { | 4031 | p->prio = rt_mutex_getprio(p); |
3640 | p->prio = p->static_prio; | 4032 | /* |
3641 | /* | 4033 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
3642 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 4034 | */ |
3643 | */ | 4035 | if (policy == SCHED_BATCH) |
3644 | if (policy == SCHED_BATCH) | 4036 | p->sleep_avg = 0; |
3645 | p->sleep_avg = 0; | 4037 | set_load_weight(p); |
3646 | } | ||
3647 | } | 4038 | } |
3648 | 4039 | ||
3649 | /** | 4040 | /** |
@@ -3656,12 +4047,13 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3656 | int sched_setscheduler(struct task_struct *p, int policy, | 4047 | int sched_setscheduler(struct task_struct *p, int policy, |
3657 | struct sched_param *param) | 4048 | struct sched_param *param) |
3658 | { | 4049 | { |
3659 | int retval; | 4050 | int retval, oldprio, oldpolicy = -1; |
3660 | int oldprio, oldpolicy = -1; | 4051 | struct prio_array *array; |
3661 | prio_array_t *array; | ||
3662 | unsigned long flags; | 4052 | unsigned long flags; |
3663 | runqueue_t *rq; | 4053 | struct rq *rq; |
3664 | 4054 | ||
4055 | /* may grab non-irq protected spin_locks */ | ||
4056 | BUG_ON(in_interrupt()); | ||
3665 | recheck: | 4057 | recheck: |
3666 | /* double check policy once rq lock held */ | 4058 | /* double check policy once rq lock held */ |
3667 | if (policy < 0) | 4059 | if (policy < 0) |
@@ -3710,14 +4102,20 @@ recheck: | |||
3710 | if (retval) | 4102 | if (retval) |
3711 | return retval; | 4103 | return retval; |
3712 | /* | 4104 | /* |
4105 | * make sure no PI-waiters arrive (or leave) while we are | ||
4106 | * changing the priority of the task: | ||
4107 | */ | ||
4108 | spin_lock_irqsave(&p->pi_lock, flags); | ||
4109 | /* | ||
3713 | * To be able to change p->policy safely, the appropriate | 4110 | * To be able to change p->policy safely, the appropriate |
3714 | * runqueue lock must be held. | 4111 | * runqueue lock must be held. |
3715 | */ | 4112 | */ |
3716 | rq = task_rq_lock(p, &flags); | 4113 | rq = __task_rq_lock(p); |
3717 | /* recheck policy now with rq lock held */ | 4114 | /* recheck policy now with rq lock held */ |
3718 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4115 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
3719 | policy = oldpolicy = -1; | 4116 | policy = oldpolicy = -1; |
3720 | task_rq_unlock(rq, &flags); | 4117 | __task_rq_unlock(rq); |
4118 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
3721 | goto recheck; | 4119 | goto recheck; |
3722 | } | 4120 | } |
3723 | array = p->array; | 4121 | array = p->array; |
@@ -3738,7 +4136,11 @@ recheck: | |||
3738 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4136 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3739 | resched_task(rq->curr); | 4137 | resched_task(rq->curr); |
3740 | } | 4138 | } |
3741 | task_rq_unlock(rq, &flags); | 4139 | __task_rq_unlock(rq); |
4140 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4141 | |||
4142 | rt_mutex_adjust_pi(p); | ||
4143 | |||
3742 | return 0; | 4144 | return 0; |
3743 | } | 4145 | } |
3744 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4146 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
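The pi_lock is now held across the policy change so no rt-mutex waiter can boost or deboost the task halfway through, and rt_mutex_adjust_pi() re-propagates any inherited priority afterwards. The function itself backs the POSIX API; a minimal (privileged) caller from userspace:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* Needs CAP_SYS_NICE (or a suitable RLIMIT_RTPRIO) to succeed. */
        if (sched_setscheduler(0, SCHED_FIFO, &sp))
                perror("sched_setscheduler");
        else
                printf("policy is now %d\n", sched_getscheduler(0));
        return 0;
}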
@@ -3746,9 +4148,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
3746 | static int | 4148 | static int |
3747 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4149 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
3748 | { | 4150 | { |
3749 | int retval; | ||
3750 | struct sched_param lparam; | 4151 | struct sched_param lparam; |
3751 | struct task_struct *p; | 4152 | struct task_struct *p; |
4153 | int retval; | ||
3752 | 4154 | ||
3753 | if (!param || pid < 0) | 4155 | if (!param || pid < 0) |
3754 | return -EINVAL; | 4156 | return -EINVAL; |
@@ -3760,8 +4162,11 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3760 | read_unlock_irq(&tasklist_lock); | 4162 | read_unlock_irq(&tasklist_lock); |
3761 | return -ESRCH; | 4163 | return -ESRCH; |
3762 | } | 4164 | } |
3763 | retval = sched_setscheduler(p, policy, &lparam); | 4165 | get_task_struct(p); |
3764 | read_unlock_irq(&tasklist_lock); | 4166 | read_unlock_irq(&tasklist_lock); |
4167 | retval = sched_setscheduler(p, policy, &lparam); | ||
4168 | put_task_struct(p); | ||
4169 | |||
3765 | return retval; | 4170 | return retval; |
3766 | } | 4171 | } |
3767 | 4172 | ||
@@ -3797,8 +4202,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
3797 | */ | 4202 | */ |
3798 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4203 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
3799 | { | 4204 | { |
4205 | struct task_struct *p; | ||
3800 | int retval = -EINVAL; | 4206 | int retval = -EINVAL; |
3801 | task_t *p; | ||
3802 | 4207 | ||
3803 | if (pid < 0) | 4208 | if (pid < 0) |
3804 | goto out_nounlock; | 4209 | goto out_nounlock; |
@@ -3825,8 +4230,8 @@ out_nounlock: | |||
3825 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4230 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
3826 | { | 4231 | { |
3827 | struct sched_param lp; | 4232 | struct sched_param lp; |
4233 | struct task_struct *p; | ||
3828 | int retval = -EINVAL; | 4234 | int retval = -EINVAL; |
3829 | task_t *p; | ||
3830 | 4235 | ||
3831 | if (!param || pid < 0) | 4236 | if (!param || pid < 0) |
3832 | goto out_nounlock; | 4237 | goto out_nounlock; |
@@ -3859,9 +4264,9 @@ out_unlock: | |||
3859 | 4264 | ||
3860 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4265 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
3861 | { | 4266 | { |
3862 | task_t *p; | ||
3863 | int retval; | ||
3864 | cpumask_t cpus_allowed; | 4267 | cpumask_t cpus_allowed; |
4268 | struct task_struct *p; | ||
4269 | int retval; | ||
3865 | 4270 | ||
3866 | lock_cpu_hotplug(); | 4271 | lock_cpu_hotplug(); |
3867 | read_lock(&tasklist_lock); | 4272 | read_lock(&tasklist_lock); |
@@ -3886,6 +4291,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
3886 | !capable(CAP_SYS_NICE)) | 4291 | !capable(CAP_SYS_NICE)) |
3887 | goto out_unlock; | 4292 | goto out_unlock; |
3888 | 4293 | ||
4294 | retval = security_task_setscheduler(p, 0, NULL); | ||
4295 | if (retval) | ||
4296 | goto out_unlock; | ||
4297 | |||
3889 | cpus_allowed = cpuset_cpus_allowed(p); | 4298 | cpus_allowed = cpuset_cpus_allowed(p); |
3890 | cpus_and(new_mask, new_mask, cpus_allowed); | 4299 | cpus_and(new_mask, new_mask, cpus_allowed); |
3891 | retval = set_cpus_allowed(p, new_mask); | 4300 | retval = set_cpus_allowed(p, new_mask); |
@@ -3943,8 +4352,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | |||
3943 | 4352 | ||
3944 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4353 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
3945 | { | 4354 | { |
4355 | struct task_struct *p; | ||
3946 | int retval; | 4356 | int retval; |
3947 | task_t *p; | ||
3948 | 4357 | ||
3949 | lock_cpu_hotplug(); | 4358 | lock_cpu_hotplug(); |
3950 | read_lock(&tasklist_lock); | 4359 | read_lock(&tasklist_lock); |
@@ -3954,7 +4363,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
3954 | if (!p) | 4363 | if (!p) |
3955 | goto out_unlock; | 4364 | goto out_unlock; |
3956 | 4365 | ||
3957 | retval = 0; | 4366 | retval = security_task_getscheduler(p); |
4367 | if (retval) | ||
4368 | goto out_unlock; | ||
4369 | |||
3958 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 4370 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
3959 | 4371 | ||
3960 | out_unlock: | 4372 | out_unlock: |
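Both affinity paths now run the LSM hooks (security_task_setscheduler() / security_task_getscheduler()) before changing or reporting cpus_allowed. The corresponding userspace calls look like this:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* pin to CPU 0 */
        if (sched_setaffinity(0, sizeof(set), &set))
                perror("sched_setaffinity");

        if (sched_getaffinity(0, sizeof(set), &set) == 0)
                printf("CPU 0 allowed: %d\n", CPU_ISSET(0, &set));
        return 0;
}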
@@ -4000,9 +4412,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
4000 | */ | 4412 | */ |
4001 | asmlinkage long sys_sched_yield(void) | 4413 | asmlinkage long sys_sched_yield(void) |
4002 | { | 4414 | { |
4003 | runqueue_t *rq = this_rq_lock(); | 4415 | struct rq *rq = this_rq_lock(); |
4004 | prio_array_t *array = current->array; | 4416 | struct prio_array *array = current->array, *target = rq->expired; |
4005 | prio_array_t *target = rq->expired; | ||
4006 | 4417 | ||
4007 | schedstat_inc(rq, yld_cnt); | 4418 | schedstat_inc(rq, yld_cnt); |
4008 | /* | 4419 | /* |
@@ -4036,6 +4447,7 @@ asmlinkage long sys_sched_yield(void) | |||
4036 | * no need to preempt or enable interrupts: | 4447 | * no need to preempt or enable interrupts: |
4037 | */ | 4448 | */ |
4038 | __release(rq->lock); | 4449 | __release(rq->lock); |
4450 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
4039 | _raw_spin_unlock(&rq->lock); | 4451 | _raw_spin_unlock(&rq->lock); |
4040 | preempt_enable_no_resched(); | 4452 | preempt_enable_no_resched(); |
4041 | 4453 | ||
@@ -4044,17 +4456,25 @@ asmlinkage long sys_sched_yield(void) | |||
4044 | return 0; | 4456 | return 0; |
4045 | } | 4457 | } |
4046 | 4458 | ||
4047 | static inline void __cond_resched(void) | 4459 | static inline int __resched_legal(void) |
4048 | { | 4460 | { |
4461 | if (unlikely(preempt_count())) | ||
4462 | return 0; | ||
4463 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4464 | return 0; | ||
4465 | return 1; | ||
4466 | } | ||
4467 | |||
4468 | static void __cond_resched(void) | ||
4469 | { | ||
4470 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | ||
4471 | __might_sleep(__FILE__, __LINE__); | ||
4472 | #endif | ||
4049 | /* | 4473 | /* |
4050 | * The BKS might be reacquired before we have dropped | 4474 | * The BKS might be reacquired before we have dropped |
4051 | * PREEMPT_ACTIVE, which could trigger a second | 4475 | * PREEMPT_ACTIVE, which could trigger a second |
4052 | * cond_resched() call. | 4476 | * cond_resched() call. |
4053 | */ | 4477 | */ |
4054 | if (unlikely(preempt_count())) | ||
4055 | return; | ||
4056 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4057 | return; | ||
4058 | do { | 4478 | do { |
4059 | add_preempt_count(PREEMPT_ACTIVE); | 4479 | add_preempt_count(PREEMPT_ACTIVE); |
4060 | schedule(); | 4480 | schedule(); |
@@ -4064,13 +4484,12 @@ static inline void __cond_resched(void) | |||
4064 | 4484 | ||
4065 | int __sched cond_resched(void) | 4485 | int __sched cond_resched(void) |
4066 | { | 4486 | { |
4067 | if (need_resched()) { | 4487 | if (need_resched() && __resched_legal()) { |
4068 | __cond_resched(); | 4488 | __cond_resched(); |
4069 | return 1; | 4489 | return 1; |
4070 | } | 4490 | } |
4071 | return 0; | 4491 | return 0; |
4072 | } | 4492 | } |
4073 | |||
4074 | EXPORT_SYMBOL(cond_resched); | 4493 | EXPORT_SYMBOL(cond_resched); |
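sys_sched_yield() drops a SCHED_NORMAL caller onto the expired array (an RT caller just goes to the tail of its priority list), while the new __resched_legal() keeps the in-kernel cond_resched() variants from rescheduling with a non-zero preempt count or before the system is fully up. The user-visible entry point is simply:

#include <sched.h>

int main(void)
{
        int i;

        /* Each call lets every other runnable task get a turn first. */
        for (i = 0; i < 3; i++)
                sched_yield();
        return 0;
}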
4075 | 4494 | ||
4076 | /* | 4495 | /* |
@@ -4091,7 +4510,8 @@ int cond_resched_lock(spinlock_t *lock) | |||
4091 | ret = 1; | 4510 | ret = 1; |
4092 | spin_lock(lock); | 4511 | spin_lock(lock); |
4093 | } | 4512 | } |
4094 | if (need_resched()) { | 4513 | if (need_resched() && __resched_legal()) { |
4514 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4095 | _raw_spin_unlock(lock); | 4515 | _raw_spin_unlock(lock); |
4096 | preempt_enable_no_resched(); | 4516 | preempt_enable_no_resched(); |
4097 | __cond_resched(); | 4517 | __cond_resched(); |
@@ -4100,25 +4520,24 @@ int cond_resched_lock(spinlock_t *lock) | |||
4100 | } | 4520 | } |
4101 | return ret; | 4521 | return ret; |
4102 | } | 4522 | } |
4103 | |||
4104 | EXPORT_SYMBOL(cond_resched_lock); | 4523 | EXPORT_SYMBOL(cond_resched_lock); |
4105 | 4524 | ||
4106 | int __sched cond_resched_softirq(void) | 4525 | int __sched cond_resched_softirq(void) |
4107 | { | 4526 | { |
4108 | BUG_ON(!in_softirq()); | 4527 | BUG_ON(!in_softirq()); |
4109 | 4528 | ||
4110 | if (need_resched()) { | 4529 | if (need_resched() && __resched_legal()) { |
4111 | __local_bh_enable(); | 4530 | raw_local_irq_disable(); |
4531 | _local_bh_enable(); | ||
4532 | raw_local_irq_enable(); | ||
4112 | __cond_resched(); | 4533 | __cond_resched(); |
4113 | local_bh_disable(); | 4534 | local_bh_disable(); |
4114 | return 1; | 4535 | return 1; |
4115 | } | 4536 | } |
4116 | return 0; | 4537 | return 0; |
4117 | } | 4538 | } |
4118 | |||
4119 | EXPORT_SYMBOL(cond_resched_softirq); | 4539 | EXPORT_SYMBOL(cond_resched_softirq); |
4120 | 4540 | ||
4121 | |||
4122 | /** | 4541 | /** |
4123 | * yield - yield the current processor to other threads. | 4542 | * yield - yield the current processor to other threads. |
4124 | * | 4543 | * |
@@ -4130,7 +4549,6 @@ void __sched yield(void) | |||
4130 | set_current_state(TASK_RUNNING); | 4549 | set_current_state(TASK_RUNNING); |
4131 | sys_sched_yield(); | 4550 | sys_sched_yield(); |
4132 | } | 4551 | } |
4133 | |||
4134 | EXPORT_SYMBOL(yield); | 4552 | EXPORT_SYMBOL(yield); |
4135 | 4553 | ||
4136 | /* | 4554 | /* |
@@ -4142,23 +4560,26 @@ EXPORT_SYMBOL(yield); | |||
4142 | */ | 4560 | */ |
4143 | void __sched io_schedule(void) | 4561 | void __sched io_schedule(void) |
4144 | { | 4562 | { |
4145 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4563 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4146 | 4564 | ||
4565 | delayacct_blkio_start(); | ||
4147 | atomic_inc(&rq->nr_iowait); | 4566 | atomic_inc(&rq->nr_iowait); |
4148 | schedule(); | 4567 | schedule(); |
4149 | atomic_dec(&rq->nr_iowait); | 4568 | atomic_dec(&rq->nr_iowait); |
4569 | delayacct_blkio_end(); | ||
4150 | } | 4570 | } |
4151 | |||
4152 | EXPORT_SYMBOL(io_schedule); | 4571 | EXPORT_SYMBOL(io_schedule); |
4153 | 4572 | ||
4154 | long __sched io_schedule_timeout(long timeout) | 4573 | long __sched io_schedule_timeout(long timeout) |
4155 | { | 4574 | { |
4156 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4575 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4157 | long ret; | 4576 | long ret; |
4158 | 4577 | ||
4578 | delayacct_blkio_start(); | ||
4159 | atomic_inc(&rq->nr_iowait); | 4579 | atomic_inc(&rq->nr_iowait); |
4160 | ret = schedule_timeout(timeout); | 4580 | ret = schedule_timeout(timeout); |
4161 | atomic_dec(&rq->nr_iowait); | 4581 | atomic_dec(&rq->nr_iowait); |
4582 | delayacct_blkio_end(); | ||
4162 | return ret; | 4583 | return ret; |
4163 | } | 4584 | } |
4164 | 4585 | ||
@@ -4220,9 +4641,9 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
4220 | asmlinkage | 4641 | asmlinkage |
4221 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4642 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4222 | { | 4643 | { |
4644 | struct task_struct *p; | ||
4223 | int retval = -EINVAL; | 4645 | int retval = -EINVAL; |
4224 | struct timespec t; | 4646 | struct timespec t; |
4225 | task_t *p; | ||
4226 | 4647 | ||
4227 | if (pid < 0) | 4648 | if (pid < 0) |
4228 | goto out_nounlock; | 4649 | goto out_nounlock; |
@@ -4237,7 +4658,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4237 | if (retval) | 4658 | if (retval) |
4238 | goto out_unlock; | 4659 | goto out_unlock; |
4239 | 4660 | ||
4240 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4661 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4241 | 0 : task_timeslice(p), &t); | 4662 | 0 : task_timeslice(p), &t); |
4242 | read_unlock(&tasklist_lock); | 4663 | read_unlock(&tasklist_lock); |
4243 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4664 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
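Switching the policy test from a bitwise AND to an equality matters because SCHED_FIFO is 1, so the old test also matched other odd-numbered policies (SCHED_BATCH, for one) and reported a zero timeslice for them; only SCHED_FIFO should report 0. Queried from userspace:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == 0)
                printf("timeslice: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}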
@@ -4250,35 +4671,36 @@ out_unlock: | |||
4250 | 4671 | ||
4251 | static inline struct task_struct *eldest_child(struct task_struct *p) | 4672 | static inline struct task_struct *eldest_child(struct task_struct *p) |
4252 | { | 4673 | { |
4253 | if (list_empty(&p->children)) return NULL; | 4674 | if (list_empty(&p->children)) |
4675 | return NULL; | ||
4254 | return list_entry(p->children.next,struct task_struct,sibling); | 4676 | return list_entry(p->children.next,struct task_struct,sibling); |
4255 | } | 4677 | } |
4256 | 4678 | ||
4257 | static inline struct task_struct *older_sibling(struct task_struct *p) | 4679 | static inline struct task_struct *older_sibling(struct task_struct *p) |
4258 | { | 4680 | { |
4259 | if (p->sibling.prev==&p->parent->children) return NULL; | 4681 | if (p->sibling.prev==&p->parent->children) |
4682 | return NULL; | ||
4260 | return list_entry(p->sibling.prev,struct task_struct,sibling); | 4683 | return list_entry(p->sibling.prev,struct task_struct,sibling); |
4261 | } | 4684 | } |
4262 | 4685 | ||
4263 | static inline struct task_struct *younger_sibling(struct task_struct *p) | 4686 | static inline struct task_struct *younger_sibling(struct task_struct *p) |
4264 | { | 4687 | { |
4265 | if (p->sibling.next==&p->parent->children) return NULL; | 4688 | if (p->sibling.next==&p->parent->children) |
4689 | return NULL; | ||
4266 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4690 | return list_entry(p->sibling.next,struct task_struct,sibling); |
4267 | } | 4691 | } |
4268 | 4692 | ||
4269 | static void show_task(task_t *p) | 4693 | static const char stat_nam[] = "RSDTtZX"; |
4694 | |||
4695 | static void show_task(struct task_struct *p) | ||
4270 | { | 4696 | { |
4271 | task_t *relative; | 4697 | struct task_struct *relative; |
4272 | unsigned state; | ||
4273 | unsigned long free = 0; | 4698 | unsigned long free = 0; |
4274 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; | 4699 | unsigned state; |
4275 | 4700 | ||
4276 | printk("%-13.13s ", p->comm); | ||
4277 | state = p->state ? __ffs(p->state) + 1 : 0; | 4701 | state = p->state ? __ffs(p->state) + 1 : 0; |
4278 | if (state < ARRAY_SIZE(stat_nam)) | 4702 | printk("%-13.13s %c", p->comm, |
4279 | printk(stat_nam[state]); | 4703 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
4280 | else | ||
4281 | printk("?"); | ||
4282 | #if (BITS_PER_LONG == 32) | 4704 | #if (BITS_PER_LONG == 32) |
4283 | if (state == TASK_RUNNING) | 4705 | if (state == TASK_RUNNING) |
4284 | printk(" running "); | 4706 | printk(" running "); |
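show_task() now indexes a flat "RSDTtZX" string instead of an array of one-character strings; the index is the position of the lowest set state bit plus one, with 0 reserved for TASK_RUNNING. A standalone sketch of that mapping, using a GCC builtin in place of __ffs():

#include <stdio.h>

static const char stat_nam[] = "RSDTtZX";

static char state_char(unsigned long state)
{
        unsigned int idx = state ? __builtin_ctzl(state) + 1 : 0;

        return idx < sizeof(stat_nam) - 1 ? stat_nam[idx] : '?';
}

int main(void)
{
        /* 0 = running, 1 = interruptible sleep, 2 = uninterruptible sleep */
        printf("%c %c %c\n", state_char(0), state_char(1), state_char(2));
        /* prints: R S D */
        return 0;
}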
@@ -4322,7 +4744,7 @@ static void show_task(task_t *p) | |||
4322 | 4744 | ||
4323 | void show_state(void) | 4745 | void show_state(void) |
4324 | { | 4746 | { |
4325 | task_t *g, *p; | 4747 | struct task_struct *g, *p; |
4326 | 4748 | ||
4327 | #if (BITS_PER_LONG == 32) | 4749 | #if (BITS_PER_LONG == 32) |
4328 | printk("\n" | 4750 | printk("\n" |
@@ -4344,7 +4766,7 @@ void show_state(void) | |||
4344 | } while_each_thread(g, p); | 4766 | } while_each_thread(g, p); |
4345 | 4767 | ||
4346 | read_unlock(&tasklist_lock); | 4768 | read_unlock(&tasklist_lock); |
4347 | mutex_debug_show_all_locks(); | 4769 | debug_show_all_locks(); |
4348 | } | 4770 | } |
4349 | 4771 | ||
4350 | /** | 4772 | /** |
@@ -4355,15 +4777,15 @@ void show_state(void) | |||
4355 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4777 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
4356 | * flag, to make booting more robust. | 4778 | * flag, to make booting more robust. |
4357 | */ | 4779 | */ |
4358 | void __devinit init_idle(task_t *idle, int cpu) | 4780 | void __devinit init_idle(struct task_struct *idle, int cpu) |
4359 | { | 4781 | { |
4360 | runqueue_t *rq = cpu_rq(cpu); | 4782 | struct rq *rq = cpu_rq(cpu); |
4361 | unsigned long flags; | 4783 | unsigned long flags; |
4362 | 4784 | ||
4363 | idle->timestamp = sched_clock(); | 4785 | idle->timestamp = sched_clock(); |
4364 | idle->sleep_avg = 0; | 4786 | idle->sleep_avg = 0; |
4365 | idle->array = NULL; | 4787 | idle->array = NULL; |
4366 | idle->prio = MAX_PRIO; | 4788 | idle->prio = idle->normal_prio = MAX_PRIO; |
4367 | idle->state = TASK_RUNNING; | 4789 | idle->state = TASK_RUNNING; |
4368 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4790 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4369 | set_task_cpu(idle, cpu); | 4791 | set_task_cpu(idle, cpu); |
@@ -4396,7 +4818,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
4396 | /* | 4818 | /* |
4397 | * This is how migration works: | 4819 | * This is how migration works: |
4398 | * | 4820 | * |
4399 | * 1) we queue a migration_req_t structure in the source CPU's | 4821 | * 1) we queue a struct migration_req structure in the source CPU's |
4400 | * runqueue and wake up that CPU's migration thread. | 4822 | * runqueue and wake up that CPU's migration thread. |
4401 | * 2) we down() the locked semaphore => thread blocks. | 4823 | * 2) we down() the locked semaphore => thread blocks. |
4402 | * 3) migration thread wakes up (implicitly it forces the migrated | 4824 | * 3) migration thread wakes up (implicitly it forces the migrated |
@@ -4418,12 +4840,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
4418 | * task must not exit() & deallocate itself prematurely. The | 4840 | * task must not exit() & deallocate itself prematurely. The |
4419 | * call is not atomic; no spinlocks may be held. | 4841 | * call is not atomic; no spinlocks may be held. |
4420 | */ | 4842 | */ |
4421 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) | 4843 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
4422 | { | 4844 | { |
4845 | struct migration_req req; | ||
4423 | unsigned long flags; | 4846 | unsigned long flags; |
4847 | struct rq *rq; | ||
4424 | int ret = 0; | 4848 | int ret = 0; |
4425 | migration_req_t req; | ||
4426 | runqueue_t *rq; | ||
4427 | 4849 | ||
4428 | rq = task_rq_lock(p, &flags); | 4850 | rq = task_rq_lock(p, &flags); |
4429 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4851 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
@@ -4446,9 +4868,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) | |||
4446 | } | 4868 | } |
4447 | out: | 4869 | out: |
4448 | task_rq_unlock(rq, &flags); | 4870 | task_rq_unlock(rq, &flags); |
4871 | |||
4449 | return ret; | 4872 | return ret; |
4450 | } | 4873 | } |
4451 | |||
4452 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4874 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
4453 | 4875 | ||
4454 | /* | 4876 | /* |
@@ -4459,13 +4881,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
4459 | * | 4881 | * |
4460 | * So we race with normal scheduler movements, but that's OK, as long | 4882 | * So we race with normal scheduler movements, but that's OK, as long |
4461 | * as the task is no longer on this CPU. | 4883 | * as the task is no longer on this CPU. |
4884 | * | ||
4885 | * Returns non-zero if task was successfully migrated. | ||
4462 | */ | 4886 | */ |
4463 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4887 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4464 | { | 4888 | { |
4465 | runqueue_t *rq_dest, *rq_src; | 4889 | struct rq *rq_dest, *rq_src; |
4890 | int ret = 0; | ||
4466 | 4891 | ||
4467 | if (unlikely(cpu_is_offline(dest_cpu))) | 4892 | if (unlikely(cpu_is_offline(dest_cpu))) |
4468 | return; | 4893 | return ret; |
4469 | 4894 | ||
4470 | rq_src = cpu_rq(src_cpu); | 4895 | rq_src = cpu_rq(src_cpu); |
4471 | rq_dest = cpu_rq(dest_cpu); | 4896 | rq_dest = cpu_rq(dest_cpu); |
@@ -4489,13 +4914,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4489 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 4914 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick |
4490 | + rq_dest->timestamp_last_tick; | 4915 | + rq_dest->timestamp_last_tick; |
4491 | deactivate_task(p, rq_src); | 4916 | deactivate_task(p, rq_src); |
4492 | activate_task(p, rq_dest, 0); | 4917 | __activate_task(p, rq_dest); |
4493 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4918 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4494 | resched_task(rq_dest->curr); | 4919 | resched_task(rq_dest->curr); |
4495 | } | 4920 | } |
4496 | 4921 | ret = 1; | |
4497 | out: | 4922 | out: |
4498 | double_rq_unlock(rq_src, rq_dest); | 4923 | double_rq_unlock(rq_src, rq_dest); |
4924 | return ret; | ||
4499 | } | 4925 | } |
4500 | 4926 | ||
4501 | /* | 4927 | /* |
@@ -4505,16 +4931,16 @@ out: | |||
4505 | */ | 4931 | */ |
4506 | static int migration_thread(void *data) | 4932 | static int migration_thread(void *data) |
4507 | { | 4933 | { |
4508 | runqueue_t *rq; | ||
4509 | int cpu = (long)data; | 4934 | int cpu = (long)data; |
4935 | struct rq *rq; | ||
4510 | 4936 | ||
4511 | rq = cpu_rq(cpu); | 4937 | rq = cpu_rq(cpu); |
4512 | BUG_ON(rq->migration_thread != current); | 4938 | BUG_ON(rq->migration_thread != current); |
4513 | 4939 | ||
4514 | set_current_state(TASK_INTERRUPTIBLE); | 4940 | set_current_state(TASK_INTERRUPTIBLE); |
4515 | while (!kthread_should_stop()) { | 4941 | while (!kthread_should_stop()) { |
4942 | struct migration_req *req; | ||
4516 | struct list_head *head; | 4943 | struct list_head *head; |
4517 | migration_req_t *req; | ||
4518 | 4944 | ||
4519 | try_to_freeze(); | 4945 | try_to_freeze(); |
4520 | 4946 | ||
@@ -4538,7 +4964,7 @@ static int migration_thread(void *data) | |||
4538 | set_current_state(TASK_INTERRUPTIBLE); | 4964 | set_current_state(TASK_INTERRUPTIBLE); |
4539 | continue; | 4965 | continue; |
4540 | } | 4966 | } |
4541 | req = list_entry(head->next, migration_req_t, list); | 4967 | req = list_entry(head->next, struct migration_req, list); |
4542 | list_del_init(head->next); | 4968 | list_del_init(head->next); |
4543 | 4969 | ||
4544 | spin_unlock(&rq->lock); | 4970 | spin_unlock(&rq->lock); |
@@ -4563,36 +4989,42 @@ wait_to_die: | |||
4563 | 4989 | ||
4564 | #ifdef CONFIG_HOTPLUG_CPU | 4990 | #ifdef CONFIG_HOTPLUG_CPU |
4565 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 4991 | /* Figure out where task on dead CPU should go, use force if necessary. */ |
4566 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 4992 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
4567 | { | 4993 | { |
4568 | int dest_cpu; | 4994 | unsigned long flags; |
4569 | cpumask_t mask; | 4995 | cpumask_t mask; |
4996 | struct rq *rq; | ||
4997 | int dest_cpu; | ||
4570 | 4998 | ||
4999 | restart: | ||
4571 | /* On same node? */ | 5000 | /* On same node? */ |
4572 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5001 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4573 | cpus_and(mask, mask, tsk->cpus_allowed); | 5002 | cpus_and(mask, mask, p->cpus_allowed); |
4574 | dest_cpu = any_online_cpu(mask); | 5003 | dest_cpu = any_online_cpu(mask); |
4575 | 5004 | ||
4576 | /* On any allowed CPU? */ | 5005 | /* On any allowed CPU? */ |
4577 | if (dest_cpu == NR_CPUS) | 5006 | if (dest_cpu == NR_CPUS) |
4578 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5007 | dest_cpu = any_online_cpu(p->cpus_allowed); |
4579 | 5008 | ||
4580 | /* No more Mr. Nice Guy. */ | 5009 | /* No more Mr. Nice Guy. */ |
4581 | if (dest_cpu == NR_CPUS) { | 5010 | if (dest_cpu == NR_CPUS) { |
4582 | cpus_setall(tsk->cpus_allowed); | 5011 | rq = task_rq_lock(p, &flags); |
4583 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5012 | cpus_setall(p->cpus_allowed); |
5013 | dest_cpu = any_online_cpu(p->cpus_allowed); | ||
5014 | task_rq_unlock(rq, &flags); | ||
4584 | 5015 | ||
4585 | /* | 5016 | /* |
4586 | * Don't tell them about moving exiting tasks or | 5017 | * Don't tell them about moving exiting tasks or |
4587 | * kernel threads (both mm NULL), since they never | 5018 | * kernel threads (both mm NULL), since they never |
4588 | * leave kernel. | 5019 | * leave kernel. |
4589 | */ | 5020 | */ |
4590 | if (tsk->mm && printk_ratelimit()) | 5021 | if (p->mm && printk_ratelimit()) |
4591 | printk(KERN_INFO "process %d (%s) no " | 5022 | printk(KERN_INFO "process %d (%s) no " |
4592 | "longer affine to cpu%d\n", | 5023 | "longer affine to cpu%d\n", |
4593 | tsk->pid, tsk->comm, dead_cpu); | 5024 | p->pid, p->comm, dead_cpu); |
4594 | } | 5025 | } |
4595 | __migrate_task(tsk, dead_cpu, dest_cpu); | 5026 | if (!__migrate_task(p, dead_cpu, dest_cpu)) |
5027 | goto restart; | ||
4596 | } | 5028 | } |
4597 | 5029 | ||
4598 | /* | 5030 | /* |
@@ -4602,9 +5034,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4602 | * their home CPUs. So we just add the counter to another CPU's counter, | 5034 | * their home CPUs. So we just add the counter to another CPU's counter, |
4603 | * to keep the global sum constant after CPU-down: | 5035 | * to keep the global sum constant after CPU-down: |
4604 | */ | 5036 | */ |
4605 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) | 5037 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
4606 | { | 5038 | { |
4607 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5039 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
4608 | unsigned long flags; | 5040 | unsigned long flags; |
4609 | 5041 | ||
4610 | local_irq_save(flags); | 5042 | local_irq_save(flags); |
@@ -4618,48 +5050,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src) | |||
4618 | /* Run through task list and migrate tasks from the dead cpu. */ | 5050 | /* Run through task list and migrate tasks from the dead cpu. */ |
4619 | static void migrate_live_tasks(int src_cpu) | 5051 | static void migrate_live_tasks(int src_cpu) |
4620 | { | 5052 | { |
4621 | struct task_struct *tsk, *t; | 5053 | struct task_struct *p, *t; |
4622 | 5054 | ||
4623 | write_lock_irq(&tasklist_lock); | 5055 | write_lock_irq(&tasklist_lock); |
4624 | 5056 | ||
4625 | do_each_thread(t, tsk) { | 5057 | do_each_thread(t, p) { |
4626 | if (tsk == current) | 5058 | if (p == current) |
4627 | continue; | 5059 | continue; |
4628 | 5060 | ||
4629 | if (task_cpu(tsk) == src_cpu) | 5061 | if (task_cpu(p) == src_cpu) |
4630 | move_task_off_dead_cpu(src_cpu, tsk); | 5062 | move_task_off_dead_cpu(src_cpu, p); |
4631 | } while_each_thread(t, tsk); | 5063 | } while_each_thread(t, p); |
4632 | 5064 | ||
4633 | write_unlock_irq(&tasklist_lock); | 5065 | write_unlock_irq(&tasklist_lock); |
4634 | } | 5066 | } |
4635 | 5067 | ||
4636 | /* Schedules idle task to be the next runnable task on current CPU. | 5068 | /* Schedules idle task to be the next runnable task on current CPU. |
4637 | * It does so by boosting its priority to highest possible and adding it to | 5069 | * It does so by boosting its priority to highest possible and adding it to |
4638 | * the _front_ of runqueue. Used by CPU offline code. | 5070 | * the _front_ of the runqueue. Used by CPU offline code. |
4639 | */ | 5071 | */ |
4640 | void sched_idle_next(void) | 5072 | void sched_idle_next(void) |
4641 | { | 5073 | { |
4642 | int cpu = smp_processor_id(); | 5074 | int this_cpu = smp_processor_id(); |
4643 | runqueue_t *rq = this_rq(); | 5075 | struct rq *rq = cpu_rq(this_cpu); |
4644 | struct task_struct *p = rq->idle; | 5076 | struct task_struct *p = rq->idle; |
4645 | unsigned long flags; | 5077 | unsigned long flags; |
4646 | 5078 | ||
4647 | /* cpu has to be offline */ | 5079 | /* cpu has to be offline */ |
4648 | BUG_ON(cpu_online(cpu)); | 5080 | BUG_ON(cpu_online(this_cpu)); |
4649 | 5081 | ||
4650 | /* Strictly not necessary since rest of the CPUs are stopped by now | 5082 | /* |
4651 | * and interrupts disabled on current cpu. | 5083 | * Strictly not necessary since rest of the CPUs are stopped by now |
5084 | * and interrupts disabled on the current cpu. | ||
4652 | */ | 5085 | */ |
4653 | spin_lock_irqsave(&rq->lock, flags); | 5086 | spin_lock_irqsave(&rq->lock, flags); |
4654 | 5087 | ||
4655 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5088 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
4656 | /* Add idle task to _front_ of it's priority queue */ | 5089 | |
5090 | /* Add idle task to the _front_ of its priority queue: */ | ||
4657 | __activate_idle_task(p, rq); | 5091 | __activate_idle_task(p, rq); |
4658 | 5092 | ||
4659 | spin_unlock_irqrestore(&rq->lock, flags); | 5093 | spin_unlock_irqrestore(&rq->lock, flags); |
4660 | } | 5094 | } |
4661 | 5095 | ||
4662 | /* Ensures that the idle task is using init_mm right before its cpu goes | 5096 | /* |
5097 | * Ensures that the idle task is using init_mm right before its cpu goes | ||
4663 | * offline. | 5098 | * offline. |
4664 | */ | 5099 | */ |
4665 | void idle_task_exit(void) | 5100 | void idle_task_exit(void) |
@@ -4673,17 +5108,17 @@ void idle_task_exit(void) | |||
4673 | mmdrop(mm); | 5108 | mmdrop(mm); |
4674 | } | 5109 | } |
4675 | 5110 | ||
4676 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | 5111 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
4677 | { | 5112 | { |
4678 | struct runqueue *rq = cpu_rq(dead_cpu); | 5113 | struct rq *rq = cpu_rq(dead_cpu); |
4679 | 5114 | ||
4680 | /* Must be exiting, otherwise would be on tasklist. */ | 5115 | /* Must be exiting, otherwise would be on tasklist. */ |
4681 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); | 5116 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); |
4682 | 5117 | ||
4683 | /* Cannot have done final schedule yet: would have vanished. */ | 5118 | /* Cannot have done final schedule yet: would have vanished. */ |
4684 | BUG_ON(tsk->flags & PF_DEAD); | 5119 | BUG_ON(p->flags & PF_DEAD); |
4685 | 5120 | ||
4686 | get_task_struct(tsk); | 5121 | get_task_struct(p); |
4687 | 5122 | ||
4688 | /* | 5123 | /* |
4689 | * Drop lock around migration; if someone else moves it, | 5124 | * Drop lock around migration; if someone else moves it, |
@@ -4691,25 +5126,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | |||
4691 | * fine. | 5126 | * fine. |
4692 | */ | 5127 | */ |
4693 | spin_unlock_irq(&rq->lock); | 5128 | spin_unlock_irq(&rq->lock); |
4694 | move_task_off_dead_cpu(dead_cpu, tsk); | 5129 | move_task_off_dead_cpu(dead_cpu, p); |
4695 | spin_lock_irq(&rq->lock); | 5130 | spin_lock_irq(&rq->lock); |
4696 | 5131 | ||
4697 | put_task_struct(tsk); | 5132 | put_task_struct(p); |
4698 | } | 5133 | } |
4699 | 5134 | ||
4700 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5135 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
4701 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5136 | static void migrate_dead_tasks(unsigned int dead_cpu) |
4702 | { | 5137 | { |
4703 | unsigned arr, i; | 5138 | struct rq *rq = cpu_rq(dead_cpu); |
4704 | struct runqueue *rq = cpu_rq(dead_cpu); | 5139 | unsigned int arr, i; |
4705 | 5140 | ||
4706 | for (arr = 0; arr < 2; arr++) { | 5141 | for (arr = 0; arr < 2; arr++) { |
4707 | for (i = 0; i < MAX_PRIO; i++) { | 5142 | for (i = 0; i < MAX_PRIO; i++) { |
4708 | struct list_head *list = &rq->arrays[arr].queue[i]; | 5143 | struct list_head *list = &rq->arrays[arr].queue[i]; |
5144 | |||
4709 | while (!list_empty(list)) | 5145 | while (!list_empty(list)) |
4710 | migrate_dead(dead_cpu, | 5146 | migrate_dead(dead_cpu, list_entry(list->next, |
4711 | list_entry(list->next, task_t, | 5147 | struct task_struct, run_list)); |
4712 | run_list)); | ||
4713 | } | 5148 | } |
4714 | } | 5149 | } |
4715 | } | 5150 | } |
@@ -4719,13 +5154,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
4719 | * migration_call - callback that gets triggered when a CPU is added. | 5154 | * migration_call - callback that gets triggered when a CPU is added. |
4720 | * Here we can start up the necessary migration thread for the new CPU. | 5155 | * Here we can start up the necessary migration thread for the new CPU. |
4721 | */ | 5156 | */ |
4722 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5157 | static int __cpuinit |
4723 | void *hcpu) | 5158 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
4724 | { | 5159 | { |
4725 | int cpu = (long)hcpu; | ||
4726 | struct task_struct *p; | 5160 | struct task_struct *p; |
4727 | struct runqueue *rq; | 5161 | int cpu = (long)hcpu; |
4728 | unsigned long flags; | 5162 | unsigned long flags; |
5163 | struct rq *rq; | ||
4729 | 5164 | ||
4730 | switch (action) { | 5165 | switch (action) { |
4731 | case CPU_UP_PREPARE: | 5166 | case CPU_UP_PREPARE: |
@@ -4740,18 +5175,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4740 | task_rq_unlock(rq, &flags); | 5175 | task_rq_unlock(rq, &flags); |
4741 | cpu_rq(cpu)->migration_thread = p; | 5176 | cpu_rq(cpu)->migration_thread = p; |
4742 | break; | 5177 | break; |
5178 | |||
4743 | case CPU_ONLINE: | 5179 | case CPU_ONLINE: |
4744 | /* Strictly unnecessary, as first user will wake it. */ | 5180 | /* Strictly unnecessary, as first user will wake it. */ |
4745 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5181 | wake_up_process(cpu_rq(cpu)->migration_thread); |
4746 | break; | 5182 | break; |
5183 | |||
4747 | #ifdef CONFIG_HOTPLUG_CPU | 5184 | #ifdef CONFIG_HOTPLUG_CPU |
4748 | case CPU_UP_CANCELED: | 5185 | case CPU_UP_CANCELED: |
5186 | if (!cpu_rq(cpu)->migration_thread) | ||
5187 | break; | ||
4749 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5188 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
4750 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5189 | kthread_bind(cpu_rq(cpu)->migration_thread, |
4751 | any_online_cpu(cpu_online_map)); | 5190 | any_online_cpu(cpu_online_map)); |
4752 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5191 | kthread_stop(cpu_rq(cpu)->migration_thread); |
4753 | cpu_rq(cpu)->migration_thread = NULL; | 5192 | cpu_rq(cpu)->migration_thread = NULL; |
4754 | break; | 5193 | break; |
5194 | |||
4755 | case CPU_DEAD: | 5195 | case CPU_DEAD: |
4756 | migrate_live_tasks(cpu); | 5196 | migrate_live_tasks(cpu); |
4757 | rq = cpu_rq(cpu); | 5197 | rq = cpu_rq(cpu); |
@@ -4772,9 +5212,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4772 | * the requestors. */ | 5212 | * the requestors. */ |
4773 | spin_lock_irq(&rq->lock); | 5213 | spin_lock_irq(&rq->lock); |
4774 | while (!list_empty(&rq->migration_queue)) { | 5214 | while (!list_empty(&rq->migration_queue)) { |
4775 | migration_req_t *req; | 5215 | struct migration_req *req; |
5216 | |||
4776 | req = list_entry(rq->migration_queue.next, | 5217 | req = list_entry(rq->migration_queue.next, |
4777 | migration_req_t, list); | 5218 | struct migration_req, list); |
4778 | list_del_init(&req->list); | 5219 | list_del_init(&req->list); |
4779 | complete(&req->done); | 5220 | complete(&req->done); |
4780 | } | 5221 | } |
@@ -4788,7 +5229,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4788 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5229 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4789 | * happens before everything else. | 5230 | * happens before everything else. |
4790 | */ | 5231 | */ |
4791 | static struct notifier_block migration_notifier = { | 5232 | static struct notifier_block __cpuinitdata migration_notifier = { |
4792 | .notifier_call = migration_call, | 5233 | .notifier_call = migration_call, |
4793 | .priority = 10 | 5234 | .priority = 10 |
4794 | }; | 5235 | }; |
@@ -4796,10 +5237,12 @@ static struct notifier_block migration_notifier = { | |||
4796 | int __init migration_init(void) | 5237 | int __init migration_init(void) |
4797 | { | 5238 | { |
4798 | void *cpu = (void *)(long)smp_processor_id(); | 5239 | void *cpu = (void *)(long)smp_processor_id(); |
4799 | /* Start one for boot CPU. */ | 5240 | |
5241 | /* Start one for the boot CPU: */ | ||
4800 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5242 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
4801 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5243 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
4802 | register_cpu_notifier(&migration_notifier); | 5244 | register_cpu_notifier(&migration_notifier); |
5245 | |||
4803 | return 0; | 5246 | return 0; |
4804 | } | 5247 | } |
4805 | #endif | 5248 | #endif |
@@ -4895,7 +5338,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
4895 | } while (sd); | 5338 | } while (sd); |
4896 | } | 5339 | } |
4897 | #else | 5340 | #else |
4898 | #define sched_domain_debug(sd, cpu) {} | 5341 | # define sched_domain_debug(sd, cpu) do { } while (0) |
4899 | #endif | 5342 | #endif |
4900 | 5343 | ||
4901 | static int sd_degenerate(struct sched_domain *sd) | 5344 | static int sd_degenerate(struct sched_domain *sd) |
@@ -4921,8 +5364,8 @@ static int sd_degenerate(struct sched_domain *sd) | |||
4921 | return 1; | 5364 | return 1; |
4922 | } | 5365 | } |
4923 | 5366 | ||
4924 | static int sd_parent_degenerate(struct sched_domain *sd, | 5367 | static int |
4925 | struct sched_domain *parent) | 5368 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
4926 | { | 5369 | { |
4927 | unsigned long cflags = sd->flags, pflags = parent->flags; | 5370 | unsigned long cflags = sd->flags, pflags = parent->flags; |
4928 | 5371 | ||
@@ -4955,7 +5398,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
4955 | */ | 5398 | */ |
4956 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 5399 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4957 | { | 5400 | { |
4958 | runqueue_t *rq = cpu_rq(cpu); | 5401 | struct rq *rq = cpu_rq(cpu); |
4959 | struct sched_domain *tmp; | 5402 | struct sched_domain *tmp; |
4960 | 5403 | ||
4961 | /* Remove the sched domains which do not contribute to scheduling. */ | 5404 | /* Remove the sched domains which do not contribute to scheduling. */ |
@@ -5217,8 +5660,8 @@ static void touch_cache(void *__cache, unsigned long __size) | |||
5217 | /* | 5660 | /* |
5218 | * Measure the cache-cost of one task migration. Returns in units of nsec. | 5661 | * Measure the cache-cost of one task migration. Returns in units of nsec. |
5219 | */ | 5662 | */ |
5220 | static unsigned long long measure_one(void *cache, unsigned long size, | 5663 | static unsigned long long |
5221 | int source, int target) | 5664 | measure_one(void *cache, unsigned long size, int source, int target) |
5222 | { | 5665 | { |
5223 | cpumask_t mask, saved_mask; | 5666 | cpumask_t mask, saved_mask; |
5224 | unsigned long long t0, t1, t2, t3, cost; | 5667 | unsigned long long t0, t1, t2, t3, cost; |
@@ -5370,7 +5813,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
5370 | cache = vmalloc(max_size); | 5813 | cache = vmalloc(max_size); |
5371 | if (!cache) { | 5814 | if (!cache) { |
5372 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 5815 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); |
5373 | return 1000000; // return 1 msec on very small boxen | 5816 | return 1000000; /* return 1 msec on very small boxen */ |
5374 | } | 5817 | } |
5375 | 5818 | ||
5376 | while (size <= max_size) { | 5819 | while (size <= max_size) { |
@@ -5568,9 +6011,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
5568 | */ | 6011 | */ |
5569 | static cpumask_t sched_domain_node_span(int node) | 6012 | static cpumask_t sched_domain_node_span(int node) |
5570 | { | 6013 | { |
5571 | int i; | ||
5572 | cpumask_t span, nodemask; | ||
5573 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 6014 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
6015 | cpumask_t span, nodemask; | ||
6016 | int i; | ||
5574 | 6017 | ||
5575 | cpus_clear(span); | 6018 | cpus_clear(span); |
5576 | bitmap_zero(used_nodes, MAX_NUMNODES); | 6019 | bitmap_zero(used_nodes, MAX_NUMNODES); |
@@ -5581,6 +6024,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
5581 | 6024 | ||
5582 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6025 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
5583 | int next_node = find_next_best_node(node, used_nodes); | 6026 | int next_node = find_next_best_node(node, used_nodes); |
6027 | |||
5584 | nodemask = node_to_cpumask(next_node); | 6028 | nodemask = node_to_cpumask(next_node); |
5585 | cpus_or(span, span, nodemask); | 6029 | cpus_or(span, span, nodemask); |
5586 | } | 6030 | } |
@@ -5589,22 +6033,27 @@ static cpumask_t sched_domain_node_span(int node) | |||
5589 | } | 6033 | } |
5590 | #endif | 6034 | #endif |
5591 | 6035 | ||
6036 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
6037 | |||
5592 | /* | 6038 | /* |
5593 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 6039 | * SMT sched-domains: |
5594 | * can switch it on easily if needed. | ||
5595 | */ | 6040 | */ |
5596 | #ifdef CONFIG_SCHED_SMT | 6041 | #ifdef CONFIG_SCHED_SMT |
5597 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6042 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
5598 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6043 | static struct sched_group sched_group_cpus[NR_CPUS]; |
6044 | |||
5599 | static int cpu_to_cpu_group(int cpu) | 6045 | static int cpu_to_cpu_group(int cpu) |
5600 | { | 6046 | { |
5601 | return cpu; | 6047 | return cpu; |
5602 | } | 6048 | } |
5603 | #endif | 6049 | #endif |
5604 | 6050 | ||
6051 | /* | ||
6052 | * multi-core sched-domains: | ||
6053 | */ | ||
5605 | #ifdef CONFIG_SCHED_MC | 6054 | #ifdef CONFIG_SCHED_MC |
5606 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6055 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
5607 | static struct sched_group sched_group_core[NR_CPUS]; | 6056 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
5608 | #endif | 6057 | #endif |
5609 | 6058 | ||
5610 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6059 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
@@ -5620,10 +6069,11 @@ static int cpu_to_core_group(int cpu) | |||
5620 | #endif | 6069 | #endif |
5621 | 6070 | ||
5622 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6071 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5623 | static struct sched_group sched_group_phys[NR_CPUS]; | 6072 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
6073 | |||
5624 | static int cpu_to_phys_group(int cpu) | 6074 | static int cpu_to_phys_group(int cpu) |
5625 | { | 6075 | { |
5626 | #if defined(CONFIG_SCHED_MC) | 6076 | #ifdef CONFIG_SCHED_MC |
5627 | cpumask_t mask = cpu_coregroup_map(cpu); | 6077 | cpumask_t mask = cpu_coregroup_map(cpu); |
5628 | return first_cpu(mask); | 6078 | return first_cpu(mask); |
5629 | #elif defined(CONFIG_SCHED_SMT) | 6079 | #elif defined(CONFIG_SCHED_SMT) |
@@ -5677,13 +6127,74 @@ next_sg: | |||
5677 | } | 6127 | } |
5678 | #endif | 6128 | #endif |
5679 | 6129 | ||
6130 | /* Free memory allocated for various sched_group structures */ | ||
6131 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
6132 | { | ||
6133 | int cpu; | ||
6134 | #ifdef CONFIG_NUMA | ||
6135 | int i; | ||
6136 | |||
6137 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6138 | struct sched_group *sched_group_allnodes | ||
6139 | = sched_group_allnodes_bycpu[cpu]; | ||
6140 | struct sched_group **sched_group_nodes | ||
6141 | = sched_group_nodes_bycpu[cpu]; | ||
6142 | |||
6143 | if (sched_group_allnodes) { | ||
6144 | kfree(sched_group_allnodes); | ||
6145 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6146 | } | ||
6147 | |||
6148 | if (!sched_group_nodes) | ||
6149 | continue; | ||
6150 | |||
6151 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
6152 | cpumask_t nodemask = node_to_cpumask(i); | ||
6153 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6154 | |||
6155 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6156 | if (cpus_empty(nodemask)) | ||
6157 | continue; | ||
6158 | |||
6159 | if (sg == NULL) | ||
6160 | continue; | ||
6161 | sg = sg->next; | ||
6162 | next_sg: | ||
6163 | oldsg = sg; | ||
6164 | sg = sg->next; | ||
6165 | kfree(oldsg); | ||
6166 | if (oldsg != sched_group_nodes[i]) | ||
6167 | goto next_sg; | ||
6168 | } | ||
6169 | kfree(sched_group_nodes); | ||
6170 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6171 | } | ||
6172 | #endif | ||
6173 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6174 | if (sched_group_phys_bycpu[cpu]) { | ||
6175 | kfree(sched_group_phys_bycpu[cpu]); | ||
6176 | sched_group_phys_bycpu[cpu] = NULL; | ||
6177 | } | ||
6178 | #ifdef CONFIG_SCHED_MC | ||
6179 | if (sched_group_core_bycpu[cpu]) { | ||
6180 | kfree(sched_group_core_bycpu[cpu]); | ||
6181 | sched_group_core_bycpu[cpu] = NULL; | ||
6182 | } | ||
6183 | #endif | ||
6184 | } | ||
6185 | } | ||
6186 | |||
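free_sched_groups(), added above, walks each node's circular, singly linked list of sched_group structures and releases it with a goto-driven loop that stops once the list head itself has been freed. The stand-alone sketch below, an editorial illustration with invented names, reproduces that teardown pattern in plain C.

/* circular singly linked list freed the way free_sched_groups() does it */
#include <stdlib.h>
#include <stdio.h>

struct group {
	int id;
	struct group *next;
};

int main(void)
{
	struct group *head = NULL, *prev = NULL, *sg, *oldsg;
	int i;

	/* build a circular, singly linked list of four groups */
	for (i = 0; i < 4; i++) {
		sg = malloc(sizeof(*sg));
		if (!sg)
			return 1;
		sg->id = i;
		sg->next = head ? head : sg;	/* stays circular at every step */
		if (prev)
			prev->next = sg;
		else
			head = sg;
		prev = sg;
	}

	/* start one element past the head, free until the walk has
	 * come back around and released the head as well */
	sg = head->next;
next_sg:
	oldsg = sg;
	sg = sg->next;
	printf("freeing group %d\n", oldsg->id);
	free(oldsg);
	if (oldsg != head)
		goto next_sg;

	return 0;
}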
5680 | /* | 6187 | /* |
5681 | * Build sched domains for a given set of cpus and attach the sched domains | 6188 | * Build sched domains for a given set of cpus and attach the sched domains |
5682 | * to the individual cpus | 6189 | * to the individual cpus |
5683 | */ | 6190 | */ |
5684 | void build_sched_domains(const cpumask_t *cpu_map) | 6191 | static int build_sched_domains(const cpumask_t *cpu_map) |
5685 | { | 6192 | { |
5686 | int i; | 6193 | int i; |
6194 | struct sched_group *sched_group_phys = NULL; | ||
6195 | #ifdef CONFIG_SCHED_MC | ||
6196 | struct sched_group *sched_group_core = NULL; | ||
6197 | #endif | ||
5687 | #ifdef CONFIG_NUMA | 6198 | #ifdef CONFIG_NUMA |
5688 | struct sched_group **sched_group_nodes = NULL; | 6199 | struct sched_group **sched_group_nodes = NULL; |
5689 | struct sched_group *sched_group_allnodes = NULL; | 6200 | struct sched_group *sched_group_allnodes = NULL; |
@@ -5691,11 +6202,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5691 | /* | 6202 | /* |
5692 | * Allocate the per-node list of sched groups | 6203 | * Allocate the per-node list of sched groups |
5693 | */ | 6204 | */ |
5694 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6205 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
5695 | GFP_ATOMIC); | 6206 | GFP_KERNEL); |
5696 | if (!sched_group_nodes) { | 6207 | if (!sched_group_nodes) { |
5697 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6208 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
5698 | return; | 6209 | return -ENOMEM; |
5699 | } | 6210 | } |
5700 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6211 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
5701 | #endif | 6212 | #endif |
@@ -5721,7 +6232,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5721 | if (!sched_group_allnodes) { | 6232 | if (!sched_group_allnodes) { |
5722 | printk(KERN_WARNING | 6233 | printk(KERN_WARNING |
5723 | "Can not alloc allnodes sched group\n"); | 6234 | "Can not alloc allnodes sched group\n"); |
5724 | break; | 6235 | goto error; |
5725 | } | 6236 | } |
5726 | sched_group_allnodes_bycpu[i] | 6237 | sched_group_allnodes_bycpu[i] |
5727 | = sched_group_allnodes; | 6238 | = sched_group_allnodes; |
@@ -5742,6 +6253,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5742 | cpus_and(sd->span, sd->span, *cpu_map); | 6253 | cpus_and(sd->span, sd->span, *cpu_map); |
5743 | #endif | 6254 | #endif |
5744 | 6255 | ||
6256 | if (!sched_group_phys) { | ||
6257 | sched_group_phys | ||
6258 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6259 | GFP_KERNEL); | ||
6260 | if (!sched_group_phys) { | ||
6261 | printk (KERN_WARNING "Can not alloc phys sched" | ||
6262 | "group\n"); | ||
6263 | goto error; | ||
6264 | } | ||
6265 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
6266 | } | ||
6267 | |||
5745 | p = sd; | 6268 | p = sd; |
5746 | sd = &per_cpu(phys_domains, i); | 6269 | sd = &per_cpu(phys_domains, i); |
5747 | group = cpu_to_phys_group(i); | 6270 | group = cpu_to_phys_group(i); |
@@ -5751,6 +6274,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5751 | sd->groups = &sched_group_phys[group]; | 6274 | sd->groups = &sched_group_phys[group]; |
5752 | 6275 | ||
5753 | #ifdef CONFIG_SCHED_MC | 6276 | #ifdef CONFIG_SCHED_MC |
6277 | if (!sched_group_core) { | ||
6278 | sched_group_core | ||
6279 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6280 | GFP_KERNEL); | ||
6281 | if (!sched_group_core) { | ||
6282 | printk (KERN_WARNING "Can not alloc core sched" | ||
6283 | "group\n"); | ||
6284 | goto error; | ||
6285 | } | ||
6286 | sched_group_core_bycpu[i] = sched_group_core; | ||
6287 | } | ||
6288 | |||
5754 | p = sd; | 6289 | p = sd; |
5755 | sd = &per_cpu(core_domains, i); | 6290 | sd = &per_cpu(core_domains, i); |
5756 | group = cpu_to_core_group(i); | 6291 | group = cpu_to_core_group(i); |
@@ -5834,24 +6369,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5834 | domainspan = sched_domain_node_span(i); | 6369 | domainspan = sched_domain_node_span(i); |
5835 | cpus_and(domainspan, domainspan, *cpu_map); | 6370 | cpus_and(domainspan, domainspan, *cpu_map); |
5836 | 6371 | ||
5837 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6372 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6373 | if (!sg) { | ||
6374 | printk(KERN_WARNING "Can not alloc domain group for " | ||
6375 | "node %d\n", i); | ||
6376 | goto error; | ||
6377 | } | ||
5838 | sched_group_nodes[i] = sg; | 6378 | sched_group_nodes[i] = sg; |
5839 | for_each_cpu_mask(j, nodemask) { | 6379 | for_each_cpu_mask(j, nodemask) { |
5840 | struct sched_domain *sd; | 6380 | struct sched_domain *sd; |
5841 | sd = &per_cpu(node_domains, j); | 6381 | sd = &per_cpu(node_domains, j); |
5842 | sd->groups = sg; | 6382 | sd->groups = sg; |
5843 | if (sd->groups == NULL) { | ||
5844 | /* Turn off balancing if we have no groups */ | ||
5845 | sd->flags = 0; | ||
5846 | } | ||
5847 | } | ||
5848 | if (!sg) { | ||
5849 | printk(KERN_WARNING | ||
5850 | "Can not alloc domain group for node %d\n", i); | ||
5851 | continue; | ||
5852 | } | 6383 | } |
5853 | sg->cpu_power = 0; | 6384 | sg->cpu_power = 0; |
5854 | sg->cpumask = nodemask; | 6385 | sg->cpumask = nodemask; |
6386 | sg->next = sg; | ||
5855 | cpus_or(covered, covered, nodemask); | 6387 | cpus_or(covered, covered, nodemask); |
5856 | prev = sg; | 6388 | prev = sg; |
5857 | 6389 | ||
@@ -5870,54 +6402,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5870 | if (cpus_empty(tmp)) | 6402 | if (cpus_empty(tmp)) |
5871 | continue; | 6403 | continue; |
5872 | 6404 | ||
5873 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6405 | sg = kmalloc_node(sizeof(struct sched_group), |
6406 | GFP_KERNEL, i); | ||
5874 | if (!sg) { | 6407 | if (!sg) { |
5875 | printk(KERN_WARNING | 6408 | printk(KERN_WARNING |
5876 | "Can not alloc domain group for node %d\n", j); | 6409 | "Can not alloc domain group for node %d\n", j); |
5877 | break; | 6410 | goto error; |
5878 | } | 6411 | } |
5879 | sg->cpu_power = 0; | 6412 | sg->cpu_power = 0; |
5880 | sg->cpumask = tmp; | 6413 | sg->cpumask = tmp; |
6414 | sg->next = prev->next; | ||
5881 | cpus_or(covered, covered, tmp); | 6415 | cpus_or(covered, covered, tmp); |
5882 | prev->next = sg; | 6416 | prev->next = sg; |
5883 | prev = sg; | 6417 | prev = sg; |
5884 | } | 6418 | } |
5885 | prev->next = sched_group_nodes[i]; | ||
5886 | } | 6419 | } |
5887 | #endif | 6420 | #endif |
5888 | 6421 | ||
5889 | /* Calculate CPU power for physical packages and nodes */ | 6422 | /* Calculate CPU power for physical packages and nodes */ |
6423 | #ifdef CONFIG_SCHED_SMT | ||
5890 | for_each_cpu_mask(i, *cpu_map) { | 6424 | for_each_cpu_mask(i, *cpu_map) { |
5891 | int power; | ||
5892 | struct sched_domain *sd; | 6425 | struct sched_domain *sd; |
5893 | #ifdef CONFIG_SCHED_SMT | ||
5894 | sd = &per_cpu(cpu_domains, i); | 6426 | sd = &per_cpu(cpu_domains, i); |
5895 | power = SCHED_LOAD_SCALE; | 6427 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
5896 | sd->groups->cpu_power = power; | 6428 | } |
5897 | #endif | 6429 | #endif |
5898 | #ifdef CONFIG_SCHED_MC | 6430 | #ifdef CONFIG_SCHED_MC |
6431 | for_each_cpu_mask(i, *cpu_map) { | ||
6432 | int power; | ||
6433 | struct sched_domain *sd; | ||
5899 | sd = &per_cpu(core_domains, i); | 6434 | sd = &per_cpu(core_domains, i); |
5900 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6435 | if (sched_smt_power_savings) |
6436 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
6437 | else | ||
6438 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
5901 | * SCHED_LOAD_SCALE / 10; | 6439 | * SCHED_LOAD_SCALE / 10; |
5902 | sd->groups->cpu_power = power; | 6440 | sd->groups->cpu_power = power; |
6441 | } | ||
6442 | #endif | ||
5903 | 6443 | ||
6444 | for_each_cpu_mask(i, *cpu_map) { | ||
6445 | struct sched_domain *sd; | ||
6446 | #ifdef CONFIG_SCHED_MC | ||
5904 | sd = &per_cpu(phys_domains, i); | 6447 | sd = &per_cpu(phys_domains, i); |
6448 | if (i != first_cpu(sd->groups->cpumask)) | ||
6449 | continue; | ||
5905 | 6450 | ||
5906 | /* | 6451 | sd->groups->cpu_power = 0; |
5907 | * This has to be < 2 * SCHED_LOAD_SCALE | 6452 | if (sched_mc_power_savings || sched_smt_power_savings) { |
5908 | * Lets keep it SCHED_LOAD_SCALE, so that | 6453 | int j; |
5909 | * while calculating NUMA group's cpu_power | 6454 | |
5910 | * we can simply do | 6455 | for_each_cpu_mask(j, sd->groups->cpumask) { |
5911 | * numa_group->cpu_power += phys_group->cpu_power; | 6456 | struct sched_domain *sd1; |
5912 | * | 6457 | sd1 = &per_cpu(core_domains, j); |
5913 | * See "only add power once for each physical pkg" | 6458 | /* |
5914 | * comment below | 6459 | * for each core we will add once |
5915 | */ | 6460 | * to the group in physical domain |
5916 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6461 | */ |
6462 | if (j != first_cpu(sd1->groups->cpumask)) | ||
6463 | continue; | ||
6464 | |||
6465 | if (sched_smt_power_savings) | ||
6466 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
6467 | else | ||
6468 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
6469 | } | ||
6470 | } else | ||
6471 | /* | ||
6472 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
6473 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
6474 | * while calculating NUMA group's cpu_power | ||
6475 | * we can simply do | ||
6476 | * numa_group->cpu_power += phys_group->cpu_power; | ||
6477 | * | ||
6478 | * See "only add power once for each physical pkg" | ||
6479 | * comment below | ||
6480 | */ | ||
6481 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
5917 | #else | 6482 | #else |
6483 | int power; | ||
5918 | sd = &per_cpu(phys_domains, i); | 6484 | sd = &per_cpu(phys_domains, i); |
5919 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6485 | if (sched_smt_power_savings) |
5920 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6486 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
6487 | else | ||
6488 | power = SCHED_LOAD_SCALE; | ||
5921 | sd->groups->cpu_power = power; | 6489 | sd->groups->cpu_power = power; |
5922 | #endif | 6490 | #endif |
5923 | } | 6491 | } |
@@ -5945,13 +6513,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5945 | * Tune cache-hot values: | 6513 | * Tune cache-hot values: |
5946 | */ | 6514 | */ |
5947 | calibrate_migration_costs(cpu_map); | 6515 | calibrate_migration_costs(cpu_map); |
6516 | |||
6517 | return 0; | ||
6518 | |||
6519 | error: | ||
6520 | free_sched_groups(cpu_map); | ||
6521 | return -ENOMEM; | ||
5948 | } | 6522 | } |
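build_sched_domains() now returns an int and routes every allocation failure to a single error label that calls free_sched_groups() and reports -ENOMEM. The short editorial sketch below shows the same single-exit cleanup shape on two ordinary heap allocations; the structure and function names are invented for the illustration.

/* single error label that frees whatever subset was allocated */
#include <stdlib.h>
#include <errno.h>

struct groups {
	int *phys;
	int *core;
};

static int build(struct groups *g, size_t n)
{
	g->phys = NULL;
	g->core = NULL;

	g->phys = calloc(n, sizeof(*g->phys));
	if (!g->phys)
		goto error;

	g->core = calloc(n, sizeof(*g->core));
	if (!g->core)
		goto error;

	return 0;

error:
	/* one cleanup site, safe for partial setups */
	free(g->core);
	free(g->phys);
	g->phys = g->core = NULL;
	return -ENOMEM;
}

int main(void)
{
	struct groups g;

	return build(&g, 16) ? 1 : 0;
}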
5949 | /* | 6523 | /* |
5950 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6524 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5951 | */ | 6525 | */ |
5952 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6526 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
5953 | { | 6527 | { |
5954 | cpumask_t cpu_default_map; | 6528 | cpumask_t cpu_default_map; |
6529 | int err; | ||
5955 | 6530 | ||
5956 | /* | 6531 | /* |
5957 | * Setup mask for cpus without special case scheduling requirements. | 6532 | * Setup mask for cpus without special case scheduling requirements. |
@@ -5960,51 +6535,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
5960 | */ | 6535 | */ |
5961 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6536 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
5962 | 6537 | ||
5963 | build_sched_domains(&cpu_default_map); | 6538 | err = build_sched_domains(&cpu_default_map); |
6539 | |||
6540 | return err; | ||
5964 | } | 6541 | } |
5965 | 6542 | ||
5966 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6543 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5967 | { | 6544 | { |
5968 | #ifdef CONFIG_NUMA | 6545 | free_sched_groups(cpu_map); |
5969 | int i; | ||
5970 | int cpu; | ||
5971 | |||
5972 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5973 | struct sched_group *sched_group_allnodes | ||
5974 | = sched_group_allnodes_bycpu[cpu]; | ||
5975 | struct sched_group **sched_group_nodes | ||
5976 | = sched_group_nodes_bycpu[cpu]; | ||
5977 | |||
5978 | if (sched_group_allnodes) { | ||
5979 | kfree(sched_group_allnodes); | ||
5980 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
5981 | } | ||
5982 | |||
5983 | if (!sched_group_nodes) | ||
5984 | continue; | ||
5985 | |||
5986 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5987 | cpumask_t nodemask = node_to_cpumask(i); | ||
5988 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
5989 | |||
5990 | cpus_and(nodemask, nodemask, *cpu_map); | ||
5991 | if (cpus_empty(nodemask)) | ||
5992 | continue; | ||
5993 | |||
5994 | if (sg == NULL) | ||
5995 | continue; | ||
5996 | sg = sg->next; | ||
5997 | next_sg: | ||
5998 | oldsg = sg; | ||
5999 | sg = sg->next; | ||
6000 | kfree(oldsg); | ||
6001 | if (oldsg != sched_group_nodes[i]) | ||
6002 | goto next_sg; | ||
6003 | } | ||
6004 | kfree(sched_group_nodes); | ||
6005 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6006 | } | ||
6007 | #endif | ||
6008 | } | 6546 | } |
6009 | 6547 | ||
6010 | /* | 6548 | /* |
@@ -6029,9 +6567,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6029 | * correct sched domains | 6567 | * correct sched domains |
6030 | * Call with hotplug lock held | 6568 | * Call with hotplug lock held |
6031 | */ | 6569 | */ |
6032 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6570 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6033 | { | 6571 | { |
6034 | cpumask_t change_map; | 6572 | cpumask_t change_map; |
6573 | int err = 0; | ||
6035 | 6574 | ||
6036 | cpus_and(*partition1, *partition1, cpu_online_map); | 6575 | cpus_and(*partition1, *partition1, cpu_online_map); |
6037 | cpus_and(*partition2, *partition2, cpu_online_map); | 6576 | cpus_and(*partition2, *partition2, cpu_online_map); |
@@ -6040,11 +6579,90 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6040 | /* Detach sched domains from all of the affected cpus */ | 6579 | /* Detach sched domains from all of the affected cpus */ |
6041 | detach_destroy_domains(&change_map); | 6580 | detach_destroy_domains(&change_map); |
6042 | if (!cpus_empty(*partition1)) | 6581 | if (!cpus_empty(*partition1)) |
6043 | build_sched_domains(partition1); | 6582 | err = build_sched_domains(partition1); |
6044 | if (!cpus_empty(*partition2)) | 6583 | if (!err && !cpus_empty(*partition2)) |
6045 | build_sched_domains(partition2); | 6584 | err = build_sched_domains(partition2); |
6585 | |||
6586 | return err; | ||
6046 | } | 6587 | } |
6047 | 6588 | ||
6589 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6590 | int arch_reinit_sched_domains(void) | ||
6591 | { | ||
6592 | int err; | ||
6593 | |||
6594 | lock_cpu_hotplug(); | ||
6595 | detach_destroy_domains(&cpu_online_map); | ||
6596 | err = arch_init_sched_domains(&cpu_online_map); | ||
6597 | unlock_cpu_hotplug(); | ||
6598 | |||
6599 | return err; | ||
6600 | } | ||
6601 | |||
6602 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6603 | { | ||
6604 | int ret; | ||
6605 | |||
6606 | if (buf[0] != '0' && buf[0] != '1') | ||
6607 | return -EINVAL; | ||
6608 | |||
6609 | if (smt) | ||
6610 | sched_smt_power_savings = (buf[0] == '1'); | ||
6611 | else | ||
6612 | sched_mc_power_savings = (buf[0] == '1'); | ||
6613 | |||
6614 | ret = arch_reinit_sched_domains(); | ||
6615 | |||
6616 | return ret ? ret : count; | ||
6617 | } | ||
6618 | |||
6619 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6620 | { | ||
6621 | int err = 0; | ||
6622 | |||
6623 | #ifdef CONFIG_SCHED_SMT | ||
6624 | if (smt_capable()) | ||
6625 | err = sysfs_create_file(&cls->kset.kobj, | ||
6626 | &attr_sched_smt_power_savings.attr); | ||
6627 | #endif | ||
6628 | #ifdef CONFIG_SCHED_MC | ||
6629 | if (!err && mc_capable()) | ||
6630 | err = sysfs_create_file(&cls->kset.kobj, | ||
6631 | &attr_sched_mc_power_savings.attr); | ||
6632 | #endif | ||
6633 | return err; | ||
6634 | } | ||
6635 | #endif | ||
6636 | |||
6637 | #ifdef CONFIG_SCHED_MC | ||
6638 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
6639 | { | ||
6640 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
6641 | } | ||
6642 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | ||
6643 | const char *buf, size_t count) | ||
6644 | { | ||
6645 | return sched_power_savings_store(buf, count, 0); | ||
6646 | } | ||
6647 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
6648 | sched_mc_power_savings_store); | ||
6649 | #endif | ||
6650 | |||
6651 | #ifdef CONFIG_SCHED_SMT | ||
6652 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
6653 | { | ||
6654 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
6655 | } | ||
6656 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | ||
6657 | const char *buf, size_t count) | ||
6658 | { | ||
6659 | return sched_power_savings_store(buf, count, 1); | ||
6660 | } | ||
6661 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
6662 | sched_smt_power_savings_store); | ||
6663 | #endif | ||
6664 | |||
6665 | |||
6048 | #ifdef CONFIG_HOTPLUG_CPU | 6666 | #ifdef CONFIG_HOTPLUG_CPU |
6049 | /* | 6667 | /* |
6050 | * Force a reinitialization of the sched domains hierarchy. The domains | 6668 | * Force a reinitialization of the sched domains hierarchy. The domains |
@@ -6098,6 +6716,7 @@ int in_sched_functions(unsigned long addr) | |||
6098 | { | 6716 | { |
6099 | /* Linker adds these: start and end of __sched functions */ | 6717 | /* Linker adds these: start and end of __sched functions */ |
6100 | extern char __sched_text_start[], __sched_text_end[]; | 6718 | extern char __sched_text_start[], __sched_text_end[]; |
6719 | |||
6101 | return in_lock_functions(addr) || | 6720 | return in_lock_functions(addr) || |
6102 | (addr >= (unsigned long)__sched_text_start | 6721 | (addr >= (unsigned long)__sched_text_start |
6103 | && addr < (unsigned long)__sched_text_end); | 6722 | && addr < (unsigned long)__sched_text_end); |
@@ -6105,14 +6724,15 @@ int in_sched_functions(unsigned long addr) | |||
6105 | 6724 | ||
6106 | void __init sched_init(void) | 6725 | void __init sched_init(void) |
6107 | { | 6726 | { |
6108 | runqueue_t *rq; | ||
6109 | int i, j, k; | 6727 | int i, j, k; |
6110 | 6728 | ||
6111 | for_each_possible_cpu(i) { | 6729 | for_each_possible_cpu(i) { |
6112 | prio_array_t *array; | 6730 | struct prio_array *array; |
6731 | struct rq *rq; | ||
6113 | 6732 | ||
6114 | rq = cpu_rq(i); | 6733 | rq = cpu_rq(i); |
6115 | spin_lock_init(&rq->lock); | 6734 | spin_lock_init(&rq->lock); |
6735 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
6116 | rq->nr_running = 0; | 6736 | rq->nr_running = 0; |
6117 | rq->active = rq->arrays; | 6737 | rq->active = rq->arrays; |
6118 | rq->expired = rq->arrays + 1; | 6738 | rq->expired = rq->arrays + 1; |
@@ -6126,7 +6746,6 @@ void __init sched_init(void) | |||
6126 | rq->push_cpu = 0; | 6746 | rq->push_cpu = 0; |
6127 | rq->migration_thread = NULL; | 6747 | rq->migration_thread = NULL; |
6128 | INIT_LIST_HEAD(&rq->migration_queue); | 6748 | INIT_LIST_HEAD(&rq->migration_queue); |
6129 | rq->cpu = i; | ||
6130 | #endif | 6749 | #endif |
6131 | atomic_set(&rq->nr_iowait, 0); | 6750 | atomic_set(&rq->nr_iowait, 0); |
6132 | 6751 | ||
@@ -6141,6 +6760,7 @@ void __init sched_init(void) | |||
6141 | } | 6760 | } |
6142 | } | 6761 | } |
6143 | 6762 | ||
6763 | set_load_weight(&init_task); | ||
6144 | /* | 6764 | /* |
6145 | * The boot idle thread does lazy MMU switching as well: | 6765 | * The boot idle thread does lazy MMU switching as well: |
6146 | */ | 6766 | */ |
@@ -6159,7 +6779,7 @@ void __init sched_init(void) | |||
6159 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6779 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
6160 | void __might_sleep(char *file, int line) | 6780 | void __might_sleep(char *file, int line) |
6161 | { | 6781 | { |
6162 | #if defined(in_atomic) | 6782 | #ifdef in_atomic |
6163 | static unsigned long prev_jiffy; /* ratelimiting */ | 6783 | static unsigned long prev_jiffy; /* ratelimiting */ |
6164 | 6784 | ||
6165 | if ((in_atomic() || irqs_disabled()) && | 6785 | if ((in_atomic() || irqs_disabled()) && |
@@ -6181,17 +6801,18 @@ EXPORT_SYMBOL(__might_sleep); | |||
6181 | #ifdef CONFIG_MAGIC_SYSRQ | 6801 | #ifdef CONFIG_MAGIC_SYSRQ |
6182 | void normalize_rt_tasks(void) | 6802 | void normalize_rt_tasks(void) |
6183 | { | 6803 | { |
6804 | struct prio_array *array; | ||
6184 | struct task_struct *p; | 6805 | struct task_struct *p; |
6185 | prio_array_t *array; | ||
6186 | unsigned long flags; | 6806 | unsigned long flags; |
6187 | runqueue_t *rq; | 6807 | struct rq *rq; |
6188 | 6808 | ||
6189 | read_lock_irq(&tasklist_lock); | 6809 | read_lock_irq(&tasklist_lock); |
6190 | for_each_process (p) { | 6810 | for_each_process(p) { |
6191 | if (!rt_task(p)) | 6811 | if (!rt_task(p)) |
6192 | continue; | 6812 | continue; |
6193 | 6813 | ||
6194 | rq = task_rq_lock(p, &flags); | 6814 | spin_lock_irqsave(&p->pi_lock, flags); |
6815 | rq = __task_rq_lock(p); | ||
6195 | 6816 | ||
6196 | array = p->array; | 6817 | array = p->array; |
6197 | if (array) | 6818 | if (array) |
@@ -6202,7 +6823,8 @@ void normalize_rt_tasks(void) | |||
6202 | resched_task(rq->curr); | 6823 | resched_task(rq->curr); |
6203 | } | 6824 | } |
6204 | 6825 | ||
6205 | task_rq_unlock(rq, &flags); | 6826 | __task_rq_unlock(rq); |
6827 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
6206 | } | 6828 | } |
6207 | read_unlock_irq(&tasklist_lock); | 6829 | read_unlock_irq(&tasklist_lock); |
6208 | } | 6830 | } |
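In the hunk above, normalize_rt_tasks() stops using task_rq_lock() and instead takes p->pi_lock before locking the runqueue with __task_rq_lock(), keeping a single, fixed order for the two locks. Purely as an editorial aside, the pthread sketch below models that two-lock ordering discipline with stand-in names; it is not kernel code, just the general pattern of always nesting the inner lock under the outer one.

/* fixed two-lock ordering: outer lock first, inner lock second */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static int prio = -5;			/* pretend RT priority */

static void *normalize(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&pi_lock);	/* outer lock */
	pthread_mutex_lock(&rq_lock);	/* inner lock */
	prio = 120;			/* back to a normal priority */
	pthread_mutex_unlock(&rq_lock);
	pthread_mutex_unlock(&pi_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, normalize, NULL);
	pthread_create(&b, NULL, normalize, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("priority is now %d\n", prio);
	return 0;
}

(Build with -lpthread; because both threads acquire the locks in the same order, neither can deadlock against the other.)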
@@ -6226,7 +6848,7 @@ void normalize_rt_tasks(void) | |||
6226 | * | 6848 | * |
6227 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6849 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6228 | */ | 6850 | */ |
6229 | task_t *curr_task(int cpu) | 6851 | struct task_struct *curr_task(int cpu) |
6230 | { | 6852 | { |
6231 | return cpu_curr(cpu); | 6853 | return cpu_curr(cpu); |
6232 | } | 6854 | } |
@@ -6246,7 +6868,7 @@ task_t *curr_task(int cpu) | |||
6246 | * | 6868 | * |
6247 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6869 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6248 | */ | 6870 | */ |
6249 | void set_curr_task(int cpu, task_t *p) | 6871 | void set_curr_task(int cpu, struct task_struct *p) |
6250 | { | 6872 | { |
6251 | cpu_curr(cpu) = p; | 6873 | cpu_curr(cpu) = p; |
6252 | } | 6874 | } |